Commit 5e0b9df · 0 parent(s)
root committed: initial commit
Browse files
- .DS_Store +0 -0
- .gitignore +135 -0
- LICENSE +201 -0
- NOTICE +39 -0
- README.md +130 -0
- configs/hico_train.sh +40 -0
- configs/vcoco_train.sh +42 -0
- hico_20160224_det +1 -0
- hotr/data/datasets/__init__.py +24 -0
- hotr/data/datasets/builtin_meta.py +110 -0
- hotr/data/datasets/coco.py +156 -0
- hotr/data/datasets/hico.py +243 -0
- hotr/data/datasets/vcoco.py +467 -0
- hotr/data/evaluators/coco_eval.py +256 -0
- hotr/data/evaluators/hico_eval.py +242 -0
- hotr/data/evaluators/vcoco_eval.py +57 -0
- hotr/data/transforms/transforms.py +387 -0
- hotr/engine/__init__.py +14 -0
- hotr/engine/arg_parser.py +163 -0
- hotr/engine/evaluator_coco.py +62 -0
- hotr/engine/evaluator_hico.py +55 -0
- hotr/engine/evaluator_vcoco.py +87 -0
- hotr/engine/trainer.py +73 -0
- hotr/metrics/utils.py +90 -0
- hotr/metrics/vcoco/ap_agent.py +104 -0
- hotr/metrics/vcoco/ap_role.py +193 -0
- hotr/models/__init__.py +5 -0
- hotr/models/backbone.py +118 -0
- hotr/models/criterion.py +349 -0
- hotr/models/detr.py +187 -0
- hotr/models/detr_matcher.py +81 -0
- hotr/models/feed_forward.py +16 -0
- hotr/models/hotr.py +241 -0
- hotr/models/hotr_matcher.py +216 -0
- hotr/models/position_encoding.py +89 -0
- hotr/models/post_process.py +162 -0
- hotr/models/transformer.py +320 -0
- hotr/util/__init__.py +0 -0
- hotr/util/box_ops.py +110 -0
- hotr/util/logger.py +145 -0
- hotr/util/misc.py +401 -0
- hotr/util/ramp.py +23 -0
- imgs/mainfig.png +0 -0
- main.py +240 -0
- tools/launch.py +192 -0
- tools/run_dist_launch.sh +29 -0
- tools/run_dist_slurm.sh +33 -0
- v-coco +1 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
ADDED
@@ -0,0 +1,135 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+wandb/
+checkpoints/
+
+# old version
+hotr/models/hotr_v1.py
+Makefile
LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2021 KAKAO BRAIN Corp. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
NOTICE
ADDED
@@ -0,0 +1,39 @@
+===============================================================================
+DETR's Apache License 2.0
+===============================================================================
+The implementation code is based on the implementation in DETR
+(https://github.com/facebookresearch/detr).
+- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Copyright (c) 2020 Facebook
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+===============================================================================
+QPIC's Apache License 2.0
+===============================================================================
+The implementation code is based on the implementation in QPIC
+(https://github.com/hitachi-rd-cv/qpic).
+- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Copyright (c) 2021 Hitachi
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
README.md
ADDED
@@ -0,0 +1,130 @@
+# CPC_HOTR
+
+This repository contains the application of [Cross-Path Consistency Learning](https://arxiv.org/abs/2204.04836) to [HOTR](https://arxiv.org/abs/2104.13682), based on the official implementation of HOTR available [here](https://github.com/kakaobrain/HOTR).
+
+<div align="center">
+  <img src="imgs/mainfig.png" width="900px" />
+</div>
+
+
+## 1. Environmental Setup
+```bash
+$ conda create -n HOTR_CPC python=3.7
+$ conda install -c pytorch pytorch torchvision # PyTorch 1.7.1, torchvision 0.8.2, CUDA=11.0
+$ conda install cython scipy
+$ pip install pycocotools
+$ pip install opencv-python
+$ pip install wandb
+```
+
+## 2. HOI dataset setup
+Our current version of HOTR supports experiments on both the [V-COCO](https://github.com/s-gupta/v-coco) and [HICO-DET](https://drive.google.com/file/d/1QZcJmGVlF9f4h-XLWe9Gkmnmj2z1gSnk/view) datasets.
+Download the datasets under the cloned repository.
+For HICO-DET, we use the [annotation files](https://drive.google.com/file/d/1QZcJmGVlF9f4h-XLWe9Gkmnmj2z1gSnk/view) provided by the PPDM authors.
+Download the [list of actions](https://drive.google.com/open?id=1EeHNHuYyJI-qqDk_-5nay7Mb07tzZLsl) as `list_action.txt` and place it under the untarred hico-det directory.
+The files should be placed as shown below.
+```bash
+# V-COCO setup
+$ git clone https://github.com/s-gupta/v-coco.git
+$ cd v-coco
+$ ln -s [:COCO_DIR] coco/images # COCO_DIR contains images of train2014 & val2014
+$ python script_pick_annotations.py [:COCO_DIR]/annotations
+
+# HICO-DET setup
+$ tar -zxvf hico_20160224_det.tar.gz # move the untarred folder under the cloned repository
+
+# dataset setup
+HOTR
+ │─ v-coco
+ │   │─ data
+ │   │   │─ instances_vcoco_all_2014.json
+ │   │   :
+ │   └─ coco
+ │       │─ images
+ │       │   │─ train2014
+ │       │   │   │─ COCO_train2014_000000000009.jpg
+ │       │   │   :
+ │       │   └─ val2014
+ │       │       │─ COCO_val2014_000000000042.jpg
+ :       :       :
+ │─ hico_20160224_det
+ │   │─ list_action.txt
+ │   │─ annotations
+ │   │   │─ trainval_hico.json
+ │   │   │─ test_hico.json
+ │   │   └─ corre_hico.npy
+ :   :
+```
+
+If you wish to keep the datasets in your own directory, simply change the `--data_path` argument to the directory where you downloaded them.
+```bash
+--data_path [:your_own_directory]/[v-coco/hico_20160224_det]
+```
+
+## 3. Training
+After the preparation, you can start training with the following commands.
+
+For HICO-DET training:
+```
+GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 ./configs/hico_train.sh
+```
+For V-COCO training:
+```
+GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 ./configs/vcoco_train.sh
+```
+
+## 4. Evaluation
+To evaluate the main inference path P1 (x->HOI), set `--path_id` to 0.
+The augmented paths are indexed 1 to 3 (1: x->HO->I, 2: x->HI->O, 3: x->OI->H).
+
+HICO-DET
+```
+python -m torch.distributed.launch \
+    --nproc_per_node=8 \
+    --use_env main.py \
+    --batch_size 2 \
+    --HOIDet \
+    --path_id 0 \
+    --share_enc \
+    --pretrained_dec \
+    --share_dec_param \
+    --num_hoi_queries [:query_num] \
+    --object_threshold 0 \
+    # use the exact same temperature value that you used during training!
+    --temperature 0.2 \
+    --no_aux_loss \
+    --eval \
+    --dataset_file hico-det \
+    --data_path hico_20160224_det \
+    --resume checkpoints/hico_det/hico_[:query_num].pth
+```
+
+V-COCO
+```
+python -m torch.distributed.launch \
+    --nproc_per_node=8 \
+    --use_env main.py \
+    --batch_size 2 \
+    --HOIDet \
+    --path_id 0 \
+    --share_enc \
+    --share_dec_param \
+    --pretrained_dec \
+    --num_hoi_queries [:query_num] \
+    # use the exact same temperature value that you used during training!
+    --temperature 0.05 \
+    --object_threshold 0 \
+    --no_aux_loss \
+    --eval \
+    --dataset_file vcoco \
+    --data_path v-coco \
+    --resume checkpoints/vcoco/vcoco_[:query_num].pth
+```
+
+## Citation
+```
+@inproceedings{park2022consistency,
+  title={Consistency Learning via Decoding Path Augmentation for Transformers in Human Object Interaction Detection},
+  author={Park, Jihwan and Lee, SeungJun and Heo, Hwan and Choi, Hyeong Kyu and Kim, Hyunwoo J},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year={2022}
+}
+```
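The path ids above recur throughout the configs and evaluation commands, so a tiny lookup can serve as a sanity check before launching a run. This is a sketch: the `DECODING_PATHS` table restates the list from the evaluation section, and `describe_path` is an illustrative helper, not a function from the codebase.

```python
# Decoding paths from the evaluation section; 0 is the main inference
# path, 1-3 are the augmented paths used by cross-path consistency.
DECODING_PATHS = {
    0: "x -> HOI",      # main path P1
    1: "x -> HO -> I",
    2: "x -> HI -> O",
    3: "x -> OI -> H",
}

def describe_path(path_id: int) -> str:
    """Illustrative helper (not part of the repository)."""
    if path_id not in DECODING_PATHS:
        raise ValueError(f"--path_id must be in 0..3, got {path_id}")
    return DECODING_PATHS[path_id]

assert describe_path(0) == "x -> HOI"
```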
configs/hico_train.sh
ADDED
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+set -x
+
+EXP_DIR=logs_run_001
+PY_ARGS=${@:1}
+
+python -u main.py \
+    --project_name CPC_HOTR_HICODET \
+    --run_name ${EXP_DIR} \
+    --HOIDet \
+    --validate \
+    --share_enc \
+    --pretrained_dec \
+    --use_consis \
+    --share_dec_param \
+    --epochs 90 \
+    --lr_drop 60 \
+    --lr 1e-4 \
+    --lr_backbone 1e-5 \
+    --ramp_up_epoch 30 \
+    --path_id 0 \
+    --num_hoi_queries 16 \
+    --set_cost_idx 20 \
+    --hoi_idx_loss_coef 1 \
+    --hoi_act_loss_coef 10 \
+    --backbone resnet50 \
+    --hoi_consistency_loss_coef 0.2 \
+    --hoi_idx_consistency_loss_coef 1 \
+    --hoi_act_consistency_loss_coef 2 \
+    --hoi_eos_coef 0.1 \
+    --temperature 0.2 \
+    --no_aux_loss \
+    --hoi_aux_loss \
+    --dataset_file hico-det \
+    --data_path hico_20160224_det \
+    --frozen_weights https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth \
+    --output_dir checkpoints/hico_det/ \
+    --augpath_name [\'p2\',\'p3\',\'p4\'] \
+    ${PY_ARGS}
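The consistency losses above are ramped in over the first 30 epochs (`--ramp_up_epoch 30`); the schedule itself lives in `hotr/util/ramp.py`, which is not shown in this diff. The sketch below is only a guess at the common sigmoid ramp-up shape used in consistency training, not the repository's actual function:

```python
import math

def sigmoid_rampup(epoch: float, ramp_up_epoch: float = 30.0) -> float:
    """Assumed schedule in [0, 1]; the real one is defined in hotr/util/ramp.py."""
    if ramp_up_epoch <= 0:
        return 1.0
    # Clamp progress to [0, 1], then apply the exp(-5 * (1 - t)^2) ramp.
    t = min(max(epoch, 0.0), ramp_up_epoch) / ramp_up_epoch
    return math.exp(-5.0 * (1.0 - t) ** 2)

# e.g. weight = hoi_consistency_loss_coef * sigmoid_rampup(current_epoch)
assert abs(sigmoid_rampup(30.0) - 1.0) < 1e-9
```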
configs/vcoco_train.sh
ADDED
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+set -x
+
+EXP_DIR=logs_run_001
+PY_ARGS=${@:1}
+
+python -u main.py \
+    --project_name CPC_HOTR_VCOCO \
+    --run_name ${EXP_DIR} \
+    --HOIDet \
+    --validate \
+    --share_enc \
+    --pretrained_dec \
+    --use_consis \
+    --share_dec_param \
+    --epochs 90 \
+    --lr_drop 60 \
+    --lr 1e-4 \
+    --lr_backbone 1e-5 \
+    --ramp_up_epoch 30 \
+    --path_id 0 \
+    --num_hoi_queries 16 \
+    --set_cost_idx 10 \
+    --hoi_idx_loss_coef 1 \
+    --hoi_act_loss_coef 10 \
+    --backbone resnet50 \
+    --hoi_consistency_loss_coef 1 \
+    --hoi_idx_consistency_loss_coef 1 \
+    --hoi_act_consistency_loss_coef 10 \
+    --stop_grad_stage \
+    --hoi_eos_coef 0.1 \
+    --temperature 0.05 \
+    --no_aux_loss \
+    --hoi_aux_loss \
+    --dataset_file vcoco \
+    --data_path v-coco \
+    --frozen_weights https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth \
+    --output_dir checkpoints/vcoco/ \
+    --augpath_name [\'p2\',\'p3\',\'p4\'] \
+    ${PY_ARGS}
+
hico_20160224_det
ADDED
@@ -0,0 +1 @@
+/data/public/rw/datasets/hico_20160224_det/
hotr/data/datasets/__init__.py
ADDED
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch.utils.data
+import torchvision
+
+from hotr.data.datasets.coco import build as build_coco
+from hotr.data.datasets.vcoco import build as build_vcoco
+from hotr.data.datasets.hico import build as build_hico
+
+def get_coco_api_from_dataset(dataset):
+    for _ in range(10): # unwrap nested wrappers (e.g. torch.utils.data.Subset)
+        if isinstance(dataset, torch.utils.data.Subset):
+            dataset = dataset.dataset
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
+        return dataset.coco
+
+
+def build_dataset(image_set, args):
+    if args.dataset_file == 'coco':
+        return build_coco(image_set, args)
+    elif args.dataset_file == 'vcoco':
+        return build_vcoco(image_set, args)
+    elif args.dataset_file == 'hico-det':
+        return build_hico(image_set, args)
+    raise ValueError(f'dataset {args.dataset_file} not supported')
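The `for _ in range(10)` loop in `get_coco_api_from_dataset` exists to peel off dataset wrappers: a `torch.utils.data.Subset` hides the underlying `CocoDetection` object that owns the pycocotools `coco` handle, and wrappers can nest. A minimal, standalone sketch of the same unwrapping pattern (with a dummy dataset in place of `CocoDetection`):

```python
import torch
from torch.utils.data import Subset, TensorDataset

base = TensorDataset(torch.arange(10))
wrapped = Subset(Subset(base, list(range(8))), [0, 2, 4])  # nested wrappers

ds = wrapped
for _ in range(10):  # 10 is just a generous bound on the nesting depth
    if isinstance(ds, Subset):
        ds = ds.dataset

assert ds is base  # the loop recovers the dataset that owns the annotations
```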
hotr/data/datasets/builtin_meta.py
ADDED
@@ -0,0 +1,110 @@
+COCO_CATEGORIES = [
+    {"color": [], "isthing": 0, "id": 0, "name": "N/A"},
+    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
+    {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
+    {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
+    {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
+    {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
+    {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
+    {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
+    {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
+    {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
+    {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
+    {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
+    {"color": [], "isthing": 0, "id": 12, "name": "N/A"},
+    {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
+    {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
+    {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
+    {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
+    {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
+    {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
+    {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
+    {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
+    {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
+    {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
+    {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
+    {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
+    {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
+    {"color": [], "isthing": 0, "id": 26, "name": "N/A"},
+    {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
+    {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
+    {"color": [], "isthing": 0, "id": 29, "name": "N/A"},
+    {"color": [], "isthing": 0, "id": 30, "name": "N/A"},
+    {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
+    {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
+    {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
+    {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
+    {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
+    {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
+    {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
+    {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
+    {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
+    {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
+    {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
+    {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
+    {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
+    {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
+    {"color": [], "isthing": 0, "id": 45, "name": "N/A"},
+    {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
+    {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
+    {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
+    {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
+    {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
+    {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
+    {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
+    {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
+    {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
+    {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
+    {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
+    {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
+    {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
+    {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
+    {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
+    {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
+    {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
+    {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
+    {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
+    {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
+    {"color": [], "isthing": 0, "id": 66, "name": "N/A"},
+    {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
+    {"color": [], "isthing": 0, "id": 68, "name": "N/A"},
+    {"color": [], "isthing": 0, "id": 69, "name": "N/A"},
+    {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
+    {"color": [], "isthing": 0, "id": 71, "name": "N/A"},
+    {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
+    {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
+    {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
+    {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
+    {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
+    {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
+    {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
+    {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
+    {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
+    {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
+    {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
+    {"color": [], "isthing": 0, "id": 83, "name": "N/A"},
+    {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
+    {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
+    {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
+    {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
+    {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
+    {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
+    {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
+]
+
+def _get_coco_instances_meta():
+    thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    assert len(thing_ids) == 80, f"Length of thing ids : {len(thing_ids)}"
+
+    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
+    thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+
+    coco_classes = [k["name"] for k in COCO_CATEGORIES]
+
+    return {
+        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
+        "thing_classes": thing_classes,
+        "thing_colors": thing_colors,
+        "coco_classes": coco_classes,
+    }
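`_get_coco_instances_meta` collapses COCO's non-contiguous 91-slot id space (the `N/A` holes above) onto 80 contiguous class indices. A quick check of that mapping, runnable from the repository root:

```python
from hotr.data.datasets import builtin_meta

meta = builtin_meta._get_coco_instances_meta()

# COCO ids skip values (12, 26, 29, ...), so dataset id 1 ("person")
# becomes contiguous index 0 and only 80 "thing" classes remain.
assert meta['thing_dataset_id_to_contiguous_id'][1] == 0
assert len(meta['thing_classes']) == 80
assert len(meta['coco_classes']) == 91  # full list keeps the "N/A" holes
```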
hotr/data/datasets/coco.py
ADDED
@@ -0,0 +1,156 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO dataset which returns image_id for evaluation.
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+from pathlib import Path
+
+import torch
+import torch.utils.data
+import torchvision
+from pycocotools import mask as coco_mask
+
+import hotr.data.transforms.transforms as T
+
+class CocoDetection(torchvision.datasets.CocoDetection):
+    def __init__(self, img_folder, ann_file, transforms, return_masks):
+        super(CocoDetection, self).__init__(img_folder, ann_file)
+        self._transforms = transforms
+        self.prepare = ConvertCocoPolysToMask(return_masks)
+
+    def __getitem__(self, idx):
+        img, target = super(CocoDetection, self).__getitem__(idx)
+        image_id = self.ids[idx]
+        target = {'image_id': image_id, 'annotations': target}
+        img, target = self.prepare(img, target)
+        if self._transforms is not None:
+            img, target = self._transforms(img, target)
+        return img, target
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = torch.as_tensor(mask, dtype=torch.uint8)
+        mask = mask.any(dim=2)
+        masks.append(mask)
+    if masks:
+        masks = torch.stack(masks, dim=0)
+    else:
+        masks = torch.zeros((0, height, width), dtype=torch.uint8)
+    return masks
+
+
+class ConvertCocoPolysToMask(object):
+    def __init__(self, return_masks=False):
+        self.return_masks = return_masks
+
+    def __call__(self, image, target):
+        w, h = image.size
+
+        image_id = target["image_id"]
+        image_id = torch.tensor([image_id])
+
+        anno = target["annotations"]
+
+        anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2] # (x, y, w, h) -> (x1, y1, x2, y2)
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+
+        classes = [obj["category_id"] for obj in anno]
+        classes = torch.tensor(classes, dtype=torch.int64)
+
+        if self.return_masks:
+            segmentations = [obj["segmentation"] for obj in anno]
+            masks = convert_coco_poly_to_mask(segmentations, h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.view(num_keypoints, -1, 3)
+
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+        if self.return_masks:
+            masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+
+        target = {}
+        target["boxes"] = boxes
+        target["labels"] = classes
+        if self.return_masks:
+            target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+
+        target["orig_size"] = torch.as_tensor([int(h), int(w)])
+        target["size"] = torch.as_tensor([int(h), int(w)])
+
+        return image, target
+
+
+def make_coco_transforms(image_set):
+
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.RandomSelect(
+                T.RandomResize(scales, max_size=1333),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=1333),
+                ])
+            ),
+            normalize,
+        ])
+
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([800], max_size=1333),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.data_path)
+    assert root.exists(), f'provided COCO path {root} does not exist'
+    mode = 'instances'
+    PATHS = {
+        "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'),
+        "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'),
+    }
+
+    img_folder, ann_file = PATHS[image_set]
+    dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks)
+    return dataset
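The line `boxes[:, 2:] += boxes[:, :2]` in `ConvertCocoPolysToMask.__call__` is the standard conversion from COCO's `(x, y, w, h)` box format to the `(x1, y1, x2, y2)` corner format that the later clamping and `keep` mask expect. A standalone illustration:

```python
import torch

boxes_xywh = torch.tensor([[10., 20., 30., 40.]])  # COCO: (x, y, width, height)

boxes_xyxy = boxes_xywh.clone()
boxes_xyxy[:, 2:] += boxes_xyxy[:, :2]  # -> (x1, y1, x2, y2)

assert boxes_xyxy.tolist() == [[10., 20., 40., 60.]]
```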
hotr/data/datasets/hico.py
ADDED
@@ -0,0 +1,243 @@
+# ------------------------------------------------------------------------
+# HOTR official code : hotr/data/datasets/hico.py
+# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+# Modified from QPIC (https://github.com/hitachi-rd-cv/qpic)
+# Copyright (c) Hitachi, Ltd. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+from pathlib import Path
+from PIL import Image
+import json
+from collections import defaultdict
+import numpy as np
+
+import torch
+import torch.utils.data
+import torchvision
+
+from hotr.data.datasets import builtin_meta
+import hotr.data.transforms.transforms as T
+
+
+class HICODetection(torch.utils.data.Dataset):
+    def __init__(self, img_set, img_folder, anno_file, action_list_file, transforms, num_queries):
+        self.img_set = img_set
+        self.img_folder = img_folder
+        with open(anno_file, 'r') as f:
+            self.annotations = json.load(f)
+        with open(action_list_file, 'r') as f:
+            self.action_lines = f.readlines()
+        self._transforms = transforms
+        self.num_queries = num_queries
+        self.get_metadata()
+
+        if img_set == 'train':
+            self.ids = []
+            for idx, img_anno in enumerate(self.annotations):
+                for hoi in img_anno['hoi_annotation']:
+                    if hoi['subject_id'] >= len(img_anno['annotations']) or hoi['object_id'] >= len(img_anno['annotations']):
+                        break
+                else:
+                    self.ids.append(idx)
+        else:
+            self.ids = list(range(len(self.annotations)))
+
+    ############################################################################
+    # Number Method
+    ############################################################################
+    def get_metadata(self):
+        meta = builtin_meta._get_coco_instances_meta()
+        self.COCO_CLASSES = meta['coco_classes']
+        self._valid_obj_ids = [id for id in meta['thing_dataset_id_to_contiguous_id'].keys()]
+        self._valid_verb_ids, self._valid_verb_names = [], []
+        for action_line in self.action_lines[2:]:
+            act_id, act_name = action_line.split()
+            self._valid_verb_ids.append(int(act_id))
+            self._valid_verb_names.append(act_name)
+
+    def get_valid_obj_ids(self):
+        return self._valid_obj_ids
+
+    def get_actions(self):
+        return self._valid_verb_names
+
+    def num_category(self):
+        return len(self.COCO_CLASSES)
+
+    def num_action(self):
+        return len(self._valid_verb_ids)
+    ############################################################################
+
+    def __len__(self):
+        return len(self.ids)
+
+    def __getitem__(self, idx):
+        img_anno = self.annotations[self.ids[idx]]
+
+        img = Image.open(self.img_folder / img_anno['file_name']).convert('RGB')
+        w, h = img.size
+
+        # cut out the GTs that exceed the number of object queries
+        if self.img_set == 'train' and len(img_anno['annotations']) > self.num_queries:
+            img_anno['annotations'] = img_anno['annotations'][:self.num_queries]
+
+        boxes = [obj['bbox'] for obj in img_anno['annotations']]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+
+        if self.img_set == 'train':
+            # Add index for confirming which boxes are kept after image transformation
+            classes = [(i, self._valid_obj_ids.index(obj['category_id'])) for i, obj in enumerate(img_anno['annotations'])]
+        else:
+            classes = [self._valid_obj_ids.index(obj['category_id']) for obj in img_anno['annotations']]
+        classes = torch.tensor(classes, dtype=torch.int64)
+
+        target = {}
+        target['orig_size'] = torch.as_tensor([int(h), int(w)])
+        target['size'] = torch.as_tensor([int(h), int(w)])
+        if self.img_set == 'train':
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+            boxes = boxes[keep]
+            classes = classes[keep]
+
+            target['boxes'] = boxes
+            target['labels'] = classes
+            target['iscrowd'] = torch.tensor([0 for _ in range(boxes.shape[0])])
+            target['area'] = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+            if self._transforms is not None:
+                img, target = self._transforms(img, target)
+
+            kept_box_indices = [label[0] for label in target['labels']]
+
+            target['labels'] = target['labels'][:, 1]
+
+            obj_labels, verb_labels, sub_boxes, obj_boxes = [], [], [], []
+            sub_obj_pairs = []
+            for hoi in img_anno['hoi_annotation']:
+                if hoi['subject_id'] not in kept_box_indices or hoi['object_id'] not in kept_box_indices:
+                    continue
+                sub_obj_pair = (hoi['subject_id'], hoi['object_id'])
+                if sub_obj_pair in sub_obj_pairs:
+                    verb_labels[sub_obj_pairs.index(sub_obj_pair)][self._valid_verb_ids.index(hoi['category_id'])] = 1
+                else:
+                    sub_obj_pairs.append(sub_obj_pair)
+                    obj_labels.append(target['labels'][kept_box_indices.index(hoi['object_id'])])
+                    verb_label = [0 for _ in range(len(self._valid_verb_ids))]
+                    verb_label[self._valid_verb_ids.index(hoi['category_id'])] = 1
+                    sub_box = target['boxes'][kept_box_indices.index(hoi['subject_id'])]
+                    obj_box = target['boxes'][kept_box_indices.index(hoi['object_id'])]
+                    verb_labels.append(verb_label)
+                    sub_boxes.append(sub_box)
+                    obj_boxes.append(obj_box)
+            if len(sub_obj_pairs) == 0:
+                target['pair_targets'] = torch.zeros((0,), dtype=torch.int64)
+                target['pair_actions'] = torch.zeros((0, len(self._valid_verb_ids)), dtype=torch.float32)
+                target['sub_boxes'] = torch.zeros((0, 4), dtype=torch.float32)
+                target['obj_boxes'] = torch.zeros((0, 4), dtype=torch.float32)
+            else:
+                target['pair_targets'] = torch.stack(obj_labels)
+                target['pair_actions'] = torch.as_tensor(verb_labels, dtype=torch.float32)
+                target['sub_boxes'] = torch.stack(sub_boxes)
+                target['obj_boxes'] = torch.stack(obj_boxes)
+        else:
+            target['boxes'] = boxes
+            target['labels'] = classes
+            target['id'] = idx
+
+            if self._transforms is not None:
+                img, _ = self._transforms(img, None)
+
+            hois = []
+            for hoi in img_anno['hoi_annotation']:
+                hois.append((hoi['subject_id'], hoi['object_id'], self._valid_verb_ids.index(hoi['category_id'])))
+            target['hois'] = torch.as_tensor(hois, dtype=torch.int64)
+
+        return img, target
+
+    def set_rare_hois(self, anno_file):
+        with open(anno_file, 'r') as f:
+            annotations = json.load(f)
+
+        counts = defaultdict(lambda: 0)
+        for img_anno in annotations:
+            hois = img_anno['hoi_annotation']
+            bboxes = img_anno['annotations']
+            for hoi in hois:
+                triplet = (self._valid_obj_ids.index(bboxes[hoi['subject_id']]['category_id']),
+                           self._valid_obj_ids.index(bboxes[hoi['object_id']]['category_id']),
+                           self._valid_verb_ids.index(hoi['category_id']))
+                counts[triplet] += 1
+        self.rare_triplets = []
+        self.non_rare_triplets = []
+        for triplet, count in counts.items():
+            if count < 10:
+                self.rare_triplets.append(triplet)
+            else:
+                self.non_rare_triplets.append(triplet)
+
+    def load_correct_mat(self, path):
+        self.correct_mat = np.load(path)
+
+
+# Add color jitter to coco transforms
+def make_hico_transforms(image_set):
+
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.ColorJitter(.4, .4, .4),
+            T.RandomSelect(
+                T.RandomResize(scales, max_size=1333),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=1333),
+                ])
+            ),
+            normalize,
+        ])
+
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([800], max_size=1333),
+            normalize,
+        ])
+
+    if image_set == 'test':
+        return T.Compose([
+            T.RandomResize([800], max_size=1333),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.data_path)
+    assert root.exists(), f'provided HOI path {root} does not exist'
+    PATHS = {
+        'train': (root / 'images' / 'train2015', root / 'annotations' / 'trainval_hico.json'),
|
| 231 |
+
'val': (root / 'images' / 'test2015', root / 'annotations' / 'test_hico.json'),
|
| 232 |
+
'test': (root / 'images' / 'test2015', root / 'annotations' / 'test_hico.json')
|
| 233 |
+
}
|
| 234 |
+
CORRECT_MAT_PATH = root / 'annotations' / 'corre_hico.npy'
|
| 235 |
+
action_list_file = root / 'list_action.txt'
|
| 236 |
+
|
| 237 |
+
img_folder, anno_file = PATHS[image_set]
|
| 238 |
+
dataset = HICODetection(image_set, img_folder, anno_file, action_list_file, transforms=make_hico_transforms(image_set),
|
| 239 |
+
num_queries=args.num_queries)
|
| 240 |
+
if image_set == 'val' or image_set == 'test':
|
| 241 |
+
dataset.set_rare_hois(PATHS['train'][1])
|
| 242 |
+
dataset.load_correct_mat(CORRECT_MAT_PATH)
|
| 243 |
+
return dataset
|
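Editor's note: for orientation, a minimal sketch of how the `build` function above is typically driven. The `SimpleNamespace` stand-in for parsed arguments and the concrete argument values are assumptions; in the repository the arguments come from `hotr/engine/arg_parser.py` via `main.py`, and `hico_20160224_det` is the dataset symlink added in this commit.

# Hypothetical smoke test for the HICO-DET dataset builder defined above.
from types import SimpleNamespace
from hotr.data.datasets.hico import build

args = SimpleNamespace(data_path='hico_20160224_det', num_queries=100)  # assumed values
train_set = build('train', args)   # HICODetection with train-time augmentation
img, target = train_set[0]         # normalized image tensor + HOI target dict
print(len(train_set), target['pair_actions'].shape)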
hotr/data/datasets/vcoco.py
ADDED
@@ -0,0 +1,467 @@
# Copyright (c) Kakaobrain, Inc. and its affiliates. All Rights Reserved
"""
V-COCO dataset which returns image_id for evaluation.
"""
from pathlib import Path

from PIL import Image
import os
import numpy as np
import json
import torch
import torch.utils.data
import torchvision

from torch.utils.data import Dataset
from pycocotools.coco import COCO
from pycocotools import mask as coco_mask

from hotr.data.datasets import builtin_meta
import hotr.data.transforms.transforms as T

class VCocoDetection(Dataset):
    def __init__(self,
                 img_folder,
                 ann_file,
                 all_file,
                 filter_empty_gt=True,
                 transforms=None):
        self.img_folder = img_folder
        self.file_meta = dict()
        self._transforms = transforms

        self.ann_file = ann_file
        self.all_file = all_file
        self.filter_empty_gt = filter_empty_gt

        # COCO initialize
        self.coco = COCO(self.all_file)
        self.COCO_CLASSES = builtin_meta._get_coco_instances_meta()['coco_classes']
        self.file_meta['coco_classes'] = self.COCO_CLASSES

        # Load V-COCO Dataset
        self.vcoco_all = self.load_vcoco(self.ann_file)

        # Save COCO annotation data
        self.image_ids = sorted(list(set(self.vcoco_all[0]['image_id'].reshape(-1))))

        # Filter Data
        if filter_empty_gt:
            self.filter_image_id()
        self.img_infos = self.load_annotations()

        # Refine Data
        self.save_action_name()
        self.mapping_inst_action_to_action()
        self.load_subobj_classes()
        self.CLASSES = self.act_list

    ############################################################################
    # Load V-COCO Dataset
    ############################################################################
    def load_vcoco(self, dir_name=None):
        with open(dir_name, 'rt') as f:
            vsrl_data = json.load(f)

        for i in range(len(vsrl_data)):
            vsrl_data[i]['role_object_id'] = np.array(vsrl_data[i]['role_object_id']).reshape((len(vsrl_data[i]['role_name']),-1)).T
            for j in ['ann_id', 'label', 'image_id']:
                vsrl_data[i][j] = np.array(vsrl_data[i][j]).reshape((-1,1))

        return vsrl_data

    ############################################################################
    # Refine Data
    ############################################################################
    def save_action_name(self):
        self.inst_act_list = list()
        self.act_list = list()

        # add instance action human classes
        self.num_subject_act = 0
        for vcoco in self.vcoco_all:
            self.inst_act_list.append('human_' + vcoco['action_name'])
            self.num_subject_act += 1

        # add instance action object classes
        for vcoco in self.vcoco_all:
            if len(vcoco['role_name']) == 3:
                self.inst_act_list.append('object_' + vcoco['action_name']+'_'+vcoco['role_name'][1])
                self.inst_act_list.append('object_' + vcoco['action_name']+'_'+vcoco['role_name'][2])
            elif len(vcoco['role_name']) < 2:
                continue
            else:
                self.inst_act_list.append('object_' + vcoco['action_name']+'_'+vcoco['role_name'][-1]) # when only two roles

        # add action classes
        for vcoco in self.vcoco_all:
            if len(vcoco['role_name']) == 3:
                self.act_list.append(vcoco['action_name']+'_'+vcoco['role_name'][1])
                self.act_list.append(vcoco['action_name']+'_'+vcoco['role_name'][2])
            else:
                self.act_list.append(vcoco['action_name']+'_'+vcoco['role_name'][-1])

        # add to meta
        self.file_meta['action_classes'] = self.act_list

    def mapping_inst_action_to_action(self):
        sub_idx = 0
        obj_idx = self.num_subject_act

        self.sub_label_to_action = list()
        self.obj_label_to_action = list()

        for vcoco in self.vcoco_all:
            role_name = vcoco['role_name']

            self.sub_label_to_action.append(sub_idx)
            if len(role_name) == 3:
                self.sub_label_to_action.append(sub_idx)
                self.obj_label_to_action.append(obj_idx)
                self.obj_label_to_action.append(obj_idx+1)
                obj_idx += 2
            elif len(role_name) == 2:
                self.obj_label_to_action.append(obj_idx)
                obj_idx += 1
            else:
                self.obj_label_to_action.append(0)

            sub_idx += 1

    def load_subobj_classes(self):
        self.vcoco_labels = dict()
        for img in self.image_ids:
            self.vcoco_labels[img] = dict()
            self.vcoco_labels[img]['boxes'] = np.empty((0, 4), dtype=np.float32)
            self.vcoco_labels[img]['categories'] = np.empty((0), dtype=np.int32)

            ann_ids = self.coco.getAnnIds(imgIds=img, iscrowd=None)
            objs = self.coco.loadAnns(ann_ids)

            valid_ann_ids = []

            for i, obj in enumerate(objs):
                if 'ignore' in obj and obj['ignore'] == 1: continue

                x1 = obj['bbox'][0]
                y1 = obj['bbox'][1]
                x2 = x1 + np.maximum(0., obj['bbox'][2] - 1.)
                y2 = y1 + np.maximum(0., obj['bbox'][3] - 1.)

                if obj['area'] > 0 and x2 > x1 and y2 > y1:
                    bbox = np.array([x1, y1, x2, y2]).reshape(1, -1)
                    cls = obj['category_id']
                    self.vcoco_labels[img]['boxes'] = np.concatenate([self.vcoco_labels[img]['boxes'], bbox], axis=0)
                    self.vcoco_labels[img]['categories'] = np.concatenate([self.vcoco_labels[img]['categories'], [cls]], axis=0)

                    valid_ann_ids.append(ann_ids[i])

            num_valid_objs = len(valid_ann_ids)

            self.vcoco_labels[img]['agent_actions'] = -np.ones((num_valid_objs, self.num_action()), dtype=np.int32)
            self.vcoco_labels[img]['obj_actions'] = np.zeros((num_valid_objs, self.num_action()), dtype=np.int32)
            self.vcoco_labels[img]['role_id'] = -np.ones((num_valid_objs, self.num_action()), dtype=np.int32)

            for ix, ann_id in enumerate(valid_ann_ids):
                in_vcoco = np.where(self.vcoco_all[0]['ann_id'] == ann_id)[0]
                if in_vcoco.size > 0:
                    self.vcoco_labels[img]['agent_actions'][ix, :] = 0

                    agent_act_id = 0
                    obj_act_id = -1
                    for i, x in enumerate(self.vcoco_all):
                        has_label = np.where(np.logical_and(x['ann_id'] == ann_id, x['label'] == 1))[0]
                        if has_label.size > 0:
                            assert has_label.size == 1
                            rids = x['role_object_id'][has_label]

                            if rids.shape[1] == 3:
                                self.vcoco_labels[img]['agent_actions'][ix, agent_act_id] = 1
                                self.vcoco_labels[img]['agent_actions'][ix, agent_act_id+1] = 1
                                agent_act_id += 2
                            else:
                                self.vcoco_labels[img]['agent_actions'][ix, agent_act_id] = 1
                                agent_act_id += 1
                                if rids.shape[1] == 1: obj_act_id += 1

                            for j in range(1, rids.shape[1]):
                                obj_act_id += 1
                                if rids[0, j] == 0: continue # no role
                                aid = np.where(valid_ann_ids == rids[0, j])[0]

                                self.vcoco_labels[img]['role_id'][ix, obj_act_id] = aid
                                self.vcoco_labels[img]['obj_actions'][aid, obj_act_id] = 1

                        else:
                            rids = x['role_object_id'][0]
                            if rids.shape[0] == 3:
                                agent_act_id += 2
                                obj_act_id += 2
                            else:
                                agent_act_id += 1
                                obj_act_id += 1

    ############################################################################
    # Annotation Loader
    ############################################################################
    # >>> 1. instance
    def load_instance_annotations(self, image_index):
        num_ann = self.vcoco_labels[image_index]['boxes'].shape[0]
        inst_action = np.zeros((num_ann, self.num_inst_action()), int)  # builtin int (np.int is removed in NumPy >= 1.24)
        inst_bbox = np.zeros((num_ann, 4), dtype=np.float32)
        inst_category = np.zeros((num_ann, ), dtype=int)

        for idx in range(num_ann):
            inst_bbox[idx] = self.vcoco_labels[image_index]['boxes'][idx]
            inst_category[idx] = self.vcoco_labels[image_index]['categories'][idx] #+ 1 # category 1 ~ 81

            if inst_category[idx] == 1:
                act = self.vcoco_labels[image_index]['agent_actions'][idx]
                inst_action[idx, :self.num_subject_act] = act[np.unique(self.sub_label_to_action, return_index=True)[1]]

                # when person is the obj
                act = self.vcoco_labels[image_index]['obj_actions'][idx] # when person is the obj
                if act.any():
                    inst_action[idx, self.num_subject_act:] = act[np.nonzero(self.obj_label_to_action)[0]]
                    if inst_action[idx, :self.num_subject_act].sum(axis=-1) < 0:
                        inst_action[idx, :self.num_subject_act] = 0
            else:
                act = self.vcoco_labels[image_index]['obj_actions'][idx]
                inst_action[idx, self.num_subject_act:] = act[np.nonzero(self.obj_label_to_action)[0]]

        # >>> For Objects that are in COCO but not in V-COCO,
        # >>> Human -> [-1 * 26, 0 * 25]
        # >>> Object -> [0 * 51]
        # >>> Don't return anything for actions with max 0 or max -1
        max_val = inst_action.max(axis=1)
        if (max_val > 0).sum() == 0:
            print(f"No Annotations for {image_index}")
            print(inst_action)
            print(self.vcoco_labels[image_index]['agent_actions'][idx])
            print(self.vcoco_labels[image_index]['obj_actions'][idx])

        return inst_bbox[max_val > 0], inst_category[max_val > 0], inst_action[max_val > 0]

    # >>> 2. pair
    def load_pair_annotations(self, image_index):
        num_ann = self.vcoco_labels[image_index]['boxes'].shape[0]
        pair_action = np.zeros((0, self.num_action()), int)  # builtin int (np.int is removed in NumPy >= 1.24)
        pair_bbox = np.zeros((0, 8), dtype=np.float32)
        pair_target = np.zeros((0, ), dtype=int)

        for idx in range(num_ann):
            h_box = self.vcoco_labels[image_index]['boxes'][idx]
            h_cat = self.vcoco_labels[image_index]['categories'][idx]
            if h_cat != 1: continue # human_id = 1

            h_act = self.vcoco_labels[image_index]['agent_actions'][idx]
            if np.any((h_act == -1)): continue

            o_act = dict()
            for aid in range(self.num_action()):
                if h_act[aid] == 0: continue
                o_id = self.vcoco_labels[image_index]['role_id'][idx, aid]
                if o_id not in o_act: o_act[o_id] = list()
                o_act[o_id].append(aid)

            for o_id in o_act.keys():
                if o_id == -1:
                    o_box = -np.ones((4, ))
                    o_cat = -1 # target is background
                else:
                    o_box = self.vcoco_labels[image_index]['boxes'][o_id]
                    o_cat = self.vcoco_labels[image_index]['categories'][o_id] # category 0 ~ 80

                box = np.concatenate([h_box, o_box]).astype(np.float32)
                act = np.zeros((1, self.num_action()), int)
                tar = np.zeros((1, ), int)
                tar[0] = o_cat #+ 1 # category 1 ~ 81
                for o_aid in o_act[o_id]: act[0, o_aid] = 1

                pair_action = np.concatenate([pair_action, act], axis=0)
                pair_bbox = np.concatenate([pair_bbox, np.expand_dims(box, axis=0)], axis=0)
                pair_target = np.concatenate([pair_target, tar], axis=0)

        return pair_bbox, pair_action, pair_target

    # >>> 3. image infos
    def load_annotations(self):
        img_infos = []
        for i in self.image_ids:
            info = self.coco.loadImgs([i])[0]
            img_infos.append(info)
        return img_infos

    ############################################################################
    # Check Method
    ############################################################################
    def sum_action_ann_for_id(self, find_idx):
        sum = 0
        for action_ann in self.vcoco_all:
            img_ids = action_ann['image_id']
            img_labels = action_ann['label']

            final_inds = img_ids[img_labels == 1]

            if (find_idx in final_inds):
                sum += 1
        # sum of class-wise existence
        return (sum > 0)

    def filter_image_id(self):
        empty_gt_list = []
        for img_id in self.image_ids:
            if not self.sum_action_ann_for_id(img_id):
                empty_gt_list.append(img_id)

        for remove_id in empty_gt_list:
            rm_idx = self.image_ids.index(remove_id)
            self.image_ids.remove(remove_id)

    ############################################################################
    # Preprocessing
    ############################################################################
    def prepare_img(self, idx):
        img_info = self.img_infos[idx]
        image = Image.open(os.path.join(self.img_folder, img_info['file_name'])).convert('RGB')
        target = self.get_ann_info(idx)

        w, h = image.size
        target["orig_size"] = torch.as_tensor([int(h), int(w)])
        target["size"] = torch.as_tensor([int(h), int(w)])

        if self._transforms is not None:
            img, target = self._transforms(image, target) # "size" gets converted here

        return img, target

    ############################################################################
    # Get Method
    ############################################################################
    def __getitem__(self, idx):
        img, target = self.prepare_img(idx)
        return img, target

    def __len__(self):
        return len(self.image_ids)

    def get_human_label_idx(self):
        return self.sub_label_to_action

    def get_object_label_idx(self):
        return self.obj_label_to_action

    def get_image_ids(self):
        return self.image_ids

    def get_categories(self):
        return self.COCO_CLASSES

    def get_inst_action(self):
        return self.inst_act_list

    def get_actions(self):
        return self.act_list

    def get_human_action(self):
        return self.inst_act_list[:self.num_subject_act]

    def get_object_action(self):
        return self.inst_act_list[self.num_subject_act:]

    def get_ann_info(self, idx):
        img_idx = int(self.image_ids[idx])

        # load each annotation
        inst_bbox, inst_label, inst_actions = self.load_instance_annotations(img_idx)
        pair_bbox, pair_actions, pair_targets = self.load_pair_annotations(img_idx)

        sample = {
            'image_id': torch.tensor([img_idx]),
            'boxes': torch.as_tensor(inst_bbox, dtype=torch.float32),
            'labels': torch.tensor(inst_label, dtype=torch.int64),
            'inst_actions': torch.tensor(inst_actions, dtype=torch.int64),
            'pair_boxes': torch.as_tensor(pair_bbox, dtype=torch.float32),
            'pair_actions': torch.tensor(pair_actions, dtype=torch.int64),
            'pair_targets': torch.tensor(pair_targets, dtype=torch.int64),
        }

        return sample

    ############################################################################
    # Number Method
    ############################################################################
    def num_category(self):
        return len(self.COCO_CLASSES)

    def num_action(self):
        return len(self.act_list)

    def num_inst_action(self):
        return len(self.inst_act_list)

    def num_human_act(self):
        return len(self.inst_act_list[:self.num_subject_act])

    def num_object_act(self):
        return len(self.inst_act_list[self.num_subject_act:])

def make_hoi_transforms(image_set):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.ColorJitter(.4, .4, .4),
            T.RandomSelect(
                T.RandomResize(scales, max_size=1333),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=1333),
                ])
            ),
            normalize,
        ])

    if image_set == 'val':
        return T.Compose([
            T.RandomResize([800], max_size=1333),
            normalize,
        ])

    if image_set == 'test':
        return T.Compose([
            T.RandomResize([800], max_size=1333),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')

def build(image_set, args):
    root = Path(args.data_path)
    assert root.exists(), f'provided V-COCO path {root} does not exist'
    PATHS = {
        "train": (root / "coco/images/train2014/", root / "data/vcoco" / 'vcoco_trainval.json'),
        "val": (root / "coco/images/val2014", root / "data/vcoco" / 'vcoco_test.json'),
        "test": (root / "coco/images/val2014", root / "data/vcoco" / 'vcoco_test.json'),
    }

    img_folder, ann_file = PATHS[image_set]
    all_file = root / "data/instances_vcoco_all_2014.json"
    dataset = VCocoDetection(
        img_folder = img_folder,
        ann_file = ann_file,
        all_file = all_file,
        filter_empty_gt=True,
        transforms = make_hoi_transforms(image_set)
    )
    dataset.file_meta['dataset_file'] = args.dataset_file
    dataset.file_meta['image_set'] = image_set

    return dataset
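Editor's note: a matching sketch for the V-COCO builder above. The `v-coco` path is the dataset symlink added in this commit; the attribute names mirror exactly what `build` reads from `args`, but the concrete values are assumptions.

# Hypothetical usage of the V-COCO dataset builder defined above.
from types import SimpleNamespace
from hotr.data.datasets.vcoco import build

args = SimpleNamespace(data_path='v-coco', dataset_file='vcoco')  # assumed values
val_set = build('val', args)
print(val_set.num_action(), val_set.num_inst_action())  # role-action / instance-action counts
img, target = val_set[0]  # image tensor + dict with 'pair_boxes', 'pair_actions', ...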
hotr/data/evaluators/coco_eval.py
ADDED
@@ -0,0 +1,256 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
COCO evaluator that works in distributed mode.
Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
The difference is that there is less copy-pasting from pycocotools
at the end of the file, as python3 can suppress prints with contextlib
"""
import os
import contextlib
import copy
import numpy as np
import torch

from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util

from hotr.util.misc import all_gather


class CocoEvaluator(object):
    def __init__(self, coco_gt, iou_types):
        assert isinstance(iou_types, (list, tuple))
        coco_gt = copy.deepcopy(coco_gt)
        self.coco_gt = coco_gt

        self.iou_types = iou_types
        self.coco_eval = {}
        for iou_type in iou_types:
            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)

        self.img_ids = []
        self.eval_imgs = {k: [] for k in iou_types}

    def update(self, predictions):
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)

            # suppress pycocotools prints
            with open(os.devnull, 'w') as devnull:
                with contextlib.redirect_stdout(devnull):
                    coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
            coco_eval = self.coco_eval[iou_type]

            coco_eval.cocoDt = coco_dt
            coco_eval.params.imgIds = list(img_ids)
            img_ids, eval_imgs = evaluate(coco_eval)

            self.eval_imgs[iou_type].append(eval_imgs)

    def synchronize_between_processes(self):
        for iou_type in self.iou_types:
            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])

    def accumulate(self):
        for coco_eval in self.coco_eval.values():
            coco_eval.accumulate()

    def summarize(self):
        for iou_type, coco_eval in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            coco_eval.summarize()

    def prepare(self, predictions, iou_type):
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        elif iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        else:
            raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"]
            labels = prediction["labels"]
            masks = prediction["masks"]

            masks = masks > 0.5

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "segmentation": rle,
                        "score": scores[k],
                    }
                    for k, rle in enumerate(rles)
                ]
            )
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            keypoints = prediction["keypoints"]
            keypoints = keypoints.flatten(start_dim=1).tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "keypoints": keypoint,
                        "score": scores[k],
                    }
                    for k, keypoint in enumerate(keypoints)
                ]
            )
        return coco_results


def convert_to_xywh(boxes):
    xmin, ymin, xmax, ymax = boxes.unbind(1)
    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)


def merge(img_ids, eval_imgs):
    all_img_ids = all_gather(img_ids)
    all_eval_imgs = all_gather(eval_imgs)

    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)

    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)

    merged_img_ids = np.array(merged_img_ids)
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)

    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]

    return merged_img_ids, merged_eval_imgs


def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    img_ids, eval_imgs = merge(img_ids, eval_imgs)
    img_ids = list(img_ids)
    eval_imgs = list(eval_imgs.flatten())

    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)


#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################


def evaluate(self):
    '''
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
    :return: None
    '''
    # tic = time.time()
    # print('Running per image evaluation...')
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    if p.iouType == 'segm' or p.iouType == 'bbox':
        computeIoU = self.computeIoU
    elif p.iouType == 'keypoints':
        computeIoU = self.computeOks
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds}

    evaluateImg = self.evaluateImg
    maxDet = p.maxDets[-1]
    evalImgs = [
        evaluateImg(imgId, catId, areaRng, maxDet)
        for catId in catIds
        for areaRng in p.areaRng
        for imgId in p.imgIds
    ]
    # this is NOT in the pycocotools code, but could be done outside
    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
    self._paramsEval = copy.deepcopy(self.params)
    # toc = time.time()
    # print('DONE (t={:0.2f}s).'.format(toc-tic))
    return p.imgIds, evalImgs

#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
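Editor's note: the evaluator above follows the standard torchvision/DETR pattern. A minimal sketch of the driving loop, where `model`, `postprocess`, `coco_gt`, and `data_loader` are stand-ins for objects defined elsewhere in this commit:

# Hypothetical evaluation loop around CocoEvaluator.
evaluator = CocoEvaluator(coco_gt, iou_types=('bbox',))
for images, targets in data_loader:
    outputs = model(images)
    predictions = postprocess(outputs, targets)  # {image_id: {'boxes', 'scores', 'labels'}}
    evaluator.update(predictions)
evaluator.synchronize_between_processes()  # gather per-image results across ranks
evaluator.accumulate()
evaluator.summarize()  # prints the standard COCO AP/AR table per IoU type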
hotr/data/evaluators/hico_eval.py
ADDED
@@ -0,0 +1,242 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/data/evaluators/hico_eval.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from QPIC (https://github.com/hitachi-rd-cv/qpic)
# Copyright (c) Hitachi, Ltd. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
import numpy as np
from collections import defaultdict

class HICOEvaluator():
    def __init__(self, preds, gts, rare_triplets, non_rare_triplets, correct_mat):
        self.overlap_iou = 0.5
        self.max_hois = 100

        self.rare_triplets = rare_triplets
        self.non_rare_triplets = non_rare_triplets

        self.fp = defaultdict(list)
        self.tp = defaultdict(list)
        self.score = defaultdict(list)
        self.sum_gts = defaultdict(lambda: 0)
        self.gt_triplets = []

        self.preds = []
        for img_preds in preds:
            img_preds = {k: v.to('cpu').numpy() for k, v in img_preds.items() if k != 'hoi_recognition_time'}
            bboxes = [{'bbox': bbox, 'category_id': label} for bbox, label in zip(img_preds['boxes'], img_preds['labels'])]
            hoi_scores = img_preds['verb_scores']
            verb_labels = np.tile(np.arange(hoi_scores.shape[1]), (hoi_scores.shape[0], 1))
            subject_ids = np.tile(img_preds['sub_ids'], (hoi_scores.shape[1], 1)).T
            object_ids = np.tile(img_preds['obj_ids'], (hoi_scores.shape[1], 1)).T

            hoi_scores = hoi_scores.ravel()
            verb_labels = verb_labels.ravel()
            subject_ids = subject_ids.ravel()
            object_ids = object_ids.ravel()

            if len(subject_ids) > 0:
                object_labels = np.array([bboxes[object_id]['category_id'] for object_id in object_ids])
                masks = correct_mat[verb_labels, object_labels]
                hoi_scores *= masks

                hois = [{'subject_id': subject_id, 'object_id': object_id, 'category_id': category_id, 'score': score} for
                        subject_id, object_id, category_id, score in zip(subject_ids, object_ids, verb_labels, hoi_scores)]
                hois.sort(key=lambda k: (k.get('score', 0)), reverse=True)
                hois = hois[:self.max_hois]
            else:
                hois = []

            self.preds.append({
                'predictions': bboxes,
                'hoi_prediction': hois
            })

        self.gts = []
        for img_gts in gts:
            img_gts = {k: v.to('cpu').numpy() for k, v in img_gts.items() if k != 'id'}
            self.gts.append({
                'annotations': [{'bbox': bbox, 'category_id': label} for bbox, label in zip(img_gts['boxes'], img_gts['labels'])],
                'hoi_annotation': [{'subject_id': hoi[0], 'object_id': hoi[1], 'category_id': hoi[2]} for hoi in img_gts['hois']]
            })
            for hoi in self.gts[-1]['hoi_annotation']:
                triplet = (self.gts[-1]['annotations'][hoi['subject_id']]['category_id'],
                           self.gts[-1]['annotations'][hoi['object_id']]['category_id'],
                           hoi['category_id'])

                if triplet not in self.gt_triplets:
                    self.gt_triplets.append(triplet)

                self.sum_gts[triplet] += 1

    def evaluate(self):
        for img_id, (img_preds, img_gts) in enumerate(zip(self.preds, self.gts)):
            print(f"Evaluating Score Matrix... : [{(img_id+1):>4}/{len(self.gts):<4}]", flush=True, end="\r")
            pred_bboxes = img_preds['predictions']
            gt_bboxes = img_gts['annotations']
            pred_hois = img_preds['hoi_prediction']
            gt_hois = img_gts['hoi_annotation']
            if len(gt_bboxes) != 0:
                bbox_pairs, bbox_overlaps = self.compute_iou_mat(gt_bboxes, pred_bboxes)
                self.compute_fptp(pred_hois, gt_hois, bbox_pairs, pred_bboxes, bbox_overlaps)
            else:
                for pred_hoi in pred_hois:
                    triplet = (pred_bboxes[pred_hoi['subject_id']]['category_id'],  # tuple, so it is hashable
                               pred_bboxes[pred_hoi['object_id']]['category_id'], pred_hoi['category_id'])
                    if triplet not in self.gt_triplets:
                        continue
                    self.tp[triplet].append(0)
                    self.fp[triplet].append(1)
                    self.score[triplet].append(pred_hoi['score'])
        print(f"[stats] Score Matrix Generation completed!! ")
        map = self.compute_map()
        return map

    def compute_map(self):
        ap = defaultdict(lambda: 0)
        rare_ap = defaultdict(lambda: 0)
        non_rare_ap = defaultdict(lambda: 0)
        max_recall = defaultdict(lambda: 0)
        for triplet in self.gt_triplets:
            sum_gts = self.sum_gts[triplet]
            if sum_gts == 0:
                continue

            tp = np.array((self.tp[triplet]))
            fp = np.array((self.fp[triplet]))
            if len(tp) == 0:
                ap[triplet] = 0
                max_recall[triplet] = 0
                if triplet in self.rare_triplets:
                    rare_ap[triplet] = 0
                elif triplet in self.non_rare_triplets:
                    non_rare_ap[triplet] = 0
                else:
                    print('Warning: triplet {} is neither in rare triplets nor in non-rare triplets'.format(triplet))
                continue

            score = np.array(self.score[triplet])
            sort_inds = np.argsort(-score)
            fp = fp[sort_inds]
            tp = tp[sort_inds]
            fp = np.cumsum(fp)
            tp = np.cumsum(tp)
            rec = tp / sum_gts
            prec = tp / (fp + tp)
            ap[triplet] = self.voc_ap(rec, prec)
            max_recall[triplet] = np.amax(rec)
            if triplet in self.rare_triplets:
                rare_ap[triplet] = ap[triplet]
            elif triplet in self.non_rare_triplets:
                non_rare_ap[triplet] = ap[triplet]
            else:
                print('Warning: triplet {} is neither in rare triplets nor in non-rare triplets'.format(triplet))
        m_ap = np.mean(list(ap.values())) * 100 # percentage
        m_ap_rare = np.mean(list(rare_ap.values())) * 100 # percentage
        m_ap_non_rare = np.mean(list(non_rare_ap.values())) * 100 # percentage
        m_max_recall = np.mean(list(max_recall.values()))

        return {'mAP': m_ap, 'mAP rare': m_ap_rare, 'mAP non-rare': m_ap_non_rare, 'mean max recall': m_max_recall}

    def voc_ap(self, rec, prec):
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
        return ap

    def compute_fptp(self, pred_hois, gt_hois, match_pairs, pred_bboxes, bbox_overlaps):
        pos_pred_ids = match_pairs.keys()
        vis_tag = np.zeros(len(gt_hois))
        pred_hois.sort(key=lambda k: (k.get('score', 0)), reverse=True)
        if len(pred_hois) != 0:
            for pred_hoi in pred_hois:
                is_match = 0
                if len(match_pairs) != 0 and pred_hoi['subject_id'] in pos_pred_ids and pred_hoi['object_id'] in pos_pred_ids:
                    pred_sub_ids = match_pairs[pred_hoi['subject_id']]
                    pred_obj_ids = match_pairs[pred_hoi['object_id']]
                    pred_sub_overlaps = bbox_overlaps[pred_hoi['subject_id']]
                    pred_obj_overlaps = bbox_overlaps[pred_hoi['object_id']]
                    pred_category_id = pred_hoi['category_id']
                    max_overlap = 0
                    max_gt_hoi = 0
                    for gt_hoi in gt_hois:
                        if gt_hoi['subject_id'] in pred_sub_ids and gt_hoi['object_id'] in pred_obj_ids \
                           and pred_category_id == gt_hoi['category_id']:
                            is_match = 1
                            min_overlap_gt = min(pred_sub_overlaps[pred_sub_ids.index(gt_hoi['subject_id'])],
                                                 pred_obj_overlaps[pred_obj_ids.index(gt_hoi['object_id'])])
                            if min_overlap_gt > max_overlap:
                                max_overlap = min_overlap_gt
                                max_gt_hoi = gt_hoi
                triplet = (pred_bboxes[pred_hoi['subject_id']]['category_id'], pred_bboxes[pred_hoi['object_id']]['category_id'],
                           pred_hoi['category_id'])
                if triplet not in self.gt_triplets:
                    continue
                if is_match == 1 and vis_tag[gt_hois.index(max_gt_hoi)] == 0:
                    self.fp[triplet].append(0)
                    self.tp[triplet].append(1)
                    vis_tag[gt_hois.index(max_gt_hoi)] = 1
                else:
                    self.fp[triplet].append(1)
                    self.tp[triplet].append(0)
                self.score[triplet].append(pred_hoi['score'])

    def compute_iou_mat(self, bbox_list1, bbox_list2):
        iou_mat = np.zeros((len(bbox_list1), len(bbox_list2)))
        if len(bbox_list1) == 0 or len(bbox_list2) == 0:
            return {}, {}  # callers unpack two values, so return a pair here
        for i, bbox1 in enumerate(bbox_list1):
            for j, bbox2 in enumerate(bbox_list2):
                iou_i = self.compute_IOU(bbox1, bbox2)
                iou_mat[i, j] = iou_i

        iou_mat_ov = iou_mat.copy()
        iou_mat[iou_mat >= self.overlap_iou] = 1
        iou_mat[iou_mat < self.overlap_iou] = 0

        match_pairs = np.nonzero(iou_mat)
        match_pairs_dict = {}
        match_pair_overlaps = {}
        if iou_mat.max() > 0:
            for i, pred_id in enumerate(match_pairs[1]):
                if pred_id not in match_pairs_dict.keys():
                    match_pairs_dict[pred_id] = []
                    match_pair_overlaps[pred_id] = []
                match_pairs_dict[pred_id].append(match_pairs[0][i])
                match_pair_overlaps[pred_id].append(iou_mat_ov[match_pairs[0][i], pred_id])
        return match_pairs_dict, match_pair_overlaps

    def compute_IOU(self, bbox1, bbox2):
        if isinstance(bbox1['category_id'], str):
            bbox1['category_id'] = int(bbox1['category_id'].replace('\n', ''))
        if isinstance(bbox2['category_id'], str):
            bbox2['category_id'] = int(bbox2['category_id'].replace('\n', ''))
        if bbox1['category_id'] == bbox2['category_id']:
            rec1 = bbox1['bbox']
            rec2 = bbox2['bbox']
            # computing the area of each rectangle
            S_rec1 = (rec1[2] - rec1[0] + 1) * (rec1[3] - rec1[1] + 1)
            S_rec2 = (rec2[2] - rec2[0] + 1) * (rec2[3] - rec2[1] + 1)

            # computing the sum area
            sum_area = S_rec1 + S_rec2

            # find each edge of the intersecting rectangle
            left_line = max(rec1[1], rec2[1])
            right_line = min(rec1[3], rec2[3])
            top_line = max(rec1[0], rec2[0])
            bottom_line = min(rec1[2], rec2[2])
            # judge if there is an intersection
            if left_line >= right_line or top_line >= bottom_line:
                return 0
            else:
                intersect = (right_line - left_line + 1) * (bottom_line - top_line + 1)
                return intersect / (sum_area - intersect)
        else:
            return 0
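Editor's note: a schematic of how this evaluator is fed (the actual collection happens in `hotr/engine/evaluator_hico.py`). `preds` holds one dict of tensors per image with the keys read in `__init__` above, `gts` holds the matching targets, and the rare/non-rare split and `correct_mat` come from `HICODetection`; `model`, `postprocessor`, and `test_loader` are stand-ins.

# Hypothetical end-to-end use of HICOEvaluator.
preds, gts = [], []
for images, targets in test_loader:
    outputs = model(images)
    preds.extend(postprocessor(outputs, targets))  # dicts with boxes/labels/verb_scores/sub_ids/obj_ids
    gts.extend(targets)
evaluator = HICOEvaluator(preds, gts, dataset.rare_triplets,
                          dataset.non_rare_triplets, dataset.correct_mat)
stats = evaluator.evaluate()  # {'mAP', 'mAP rare', 'mAP non-rare', 'mean max recall'}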
hotr/data/evaluators/vcoco_eval.py
ADDED
@@ -0,0 +1,57 @@
# Copyright (c) KakaoBrain, Inc. and its affiliates. All Rights Reserved
"""
V-COCO evaluator that works in distributed mode.
"""
import os
import numpy as np
import torch

from hotr.util.misc import all_gather
from hotr.metrics.vcoco.ap_role import APRole
from functools import partial

def init_vcoco_evaluators(human_act_name, object_act_name):
    role_eval1 = APRole(act_name=object_act_name, scenario_flag=True, iou_threshold=0.5)
    role_eval2 = APRole(act_name=object_act_name, scenario_flag=False, iou_threshold=0.5)

    return role_eval1, role_eval2

class VCocoEvaluator(object):
    def __init__(self, args):
        self.img_ids = []
        self.eval_imgs = []
        self.role_eval1, self.role_eval2 = init_vcoco_evaluators(args.human_actions, args.object_actions)
        self.num_human_act = args.num_human_act
        self.action_idx = args.valid_ids

    def update(self, outputs):
        img_ids = list(np.unique(list(outputs.keys())))
        for img_num, img_id in enumerate(img_ids):
            print(f"Evaluating Score Matrix... : [{(img_num+1):>4}/{len(img_ids):<4}]", flush=True, end="\r")
            prediction = outputs[img_id]['prediction']
            target = outputs[img_id]['target']

            # score with prediction
            hbox, hcat, obox, ocat = list(map(lambda x: prediction[x], \
                ['h_box', 'h_cat', 'o_box', 'o_cat']))

            assert 'pair_score' in prediction
            score = prediction['pair_score']

            hbox, hcat, obox, ocat, score = \
                list(map(lambda x: x.cpu().numpy(), [hbox, hcat, obox, ocat, score]))

            # ground-truth
            gt_h_inds = (target['labels'] == 1)
            gt_h_box = target['boxes'][gt_h_inds, :4].cpu().numpy()
            gt_h_act = target['inst_actions'][gt_h_inds, :self.num_human_act].cpu().numpy()

            gt_p_box = target['pair_boxes'].cpu().numpy()
            gt_p_act = target['pair_actions'].cpu().numpy()

            score = score[self.action_idx, :, :]
            gt_p_act = gt_p_act[:, self.action_idx]

            self.role_eval1.add_data(hbox, obox, score, gt_h_box, gt_h_act, gt_p_box, gt_p_act)
            self.role_eval2.add_data(hbox, obox, score, gt_h_box, gt_h_act, gt_p_box, gt_p_act)
            self.img_ids.append(img_id)
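Editor's note: `update` expects `outputs` keyed by image id, each entry holding a `prediction` dict (with `h_box`, `o_box`, `pair_score`, ...) and the matching `target`. A sketch of the intended flow follows; calling `evaluate()` on the two `APRole` accumulators is an assumption about the API in `hotr/metrics/vcoco/ap_role.py`, which is added elsewhere in this commit.

# Hypothetical use of VCocoEvaluator; `outputs` comes from the V-COCO engine loop.
evaluator = VCocoEvaluator(args)
evaluator.update(outputs)  # accumulates into role_eval1 (scenario 1) and role_eval2 (scenario 2)
scenario1_ap = evaluator.role_eval1.evaluate()  # assumed APRole API
scenario2_ap = evaluator.role_eval2.evaluate()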
hotr/data/transforms/transforms.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Transforms and data augmentation for both image + bbox.
"""
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from hotr.util.box_ops import box_xyxy_to_cxcywh
from hotr.util.misc import interpolate


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])
    max_size = torch.as_tensor([w, h], dtype=torch.float32)

    fields = ["labels", "area", "iscrowd"]  # add additional fields
    if "inst_actions" in target.keys():
        fields.append("inst_actions")

    if "boxes" in target:
        boxes = target["boxes"]
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "pair_boxes" in target or ("sub_boxes" in target and "obj_boxes" in target):
        if "pair_boxes" in target:
            pair_boxes = target["pair_boxes"]
            hboxes = pair_boxes[:, :4]
            oboxes = pair_boxes[:, 4:]
        if ("sub_boxes" in target and "obj_boxes" in target):
            hboxes = target["sub_boxes"]
            oboxes = target["obj_boxes"]

        cropped_hboxes = hboxes - torch.as_tensor([j, i, j, i])
        cropped_hboxes = torch.min(cropped_hboxes.reshape(-1, 2, 2), max_size)
        cropped_hboxes = cropped_hboxes.clamp(min=0)
        hboxes = cropped_hboxes.reshape(-1, 4)

        obj_mask = (oboxes[:, 0] != -1)
        if obj_mask.sum() != 0:
            cropped_oboxes = oboxes[obj_mask] - torch.as_tensor([j, i, j, i])
            cropped_oboxes = torch.min(cropped_oboxes.reshape(-1, 2, 2), max_size)
            cropped_oboxes = cropped_oboxes.clamp(min=0)
            oboxes[obj_mask] = cropped_oboxes.reshape(-1, 4)
        else:
            cropped_oboxes = oboxes

        cropped_pair_boxes = torch.cat([hboxes, oboxes], dim=-1)
        target["pair_boxes"] = cropped_pair_boxes
        pair_fields = ["pair_boxes", "pair_actions", "pair_targets"]

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            if field in target:  # added because there is no 'iscrowd' field in the v-coco dataset
                target[field] = target[field][keep]

    # remove elements that have redundant area
    if "boxes" in target and "labels" in target:
        cropped_boxes = target['boxes']
        cropped_labels = target['labels']

        cnr, keep_idx = [], []
        for idx, (cropped_box, cropped_lbl) in enumerate(zip(cropped_boxes, cropped_labels)):
            if str((cropped_box, cropped_lbl)) not in cnr:
                cnr.append(str((cropped_box, cropped_lbl)))
                keep_idx.append(True)
            else: keep_idx.append(False)

        for field in fields:
            if field in target:
                target[field] = target[field][keep_idx]

    # remove elements for which pair boxes have zero area
    if "pair_boxes" in target:
        cropped_hboxes = target["pair_boxes"][:, :4].reshape(-1, 2, 2)
        cropped_oboxes = target["pair_boxes"][:, 4:].reshape(-1, 2, 2)
        keep_h = torch.all(cropped_hboxes[:, 1, :] > cropped_hboxes[:, 0, :], dim=1)
        keep_o = torch.all(cropped_oboxes[:, 1, :] > cropped_oboxes[:, 0, :], dim=1)
        not_empty_o = torch.all(target["pair_boxes"][:, 4:] >= 0, dim=1)
        discard_o = (~keep_o) & not_empty_o
        if (discard_o).sum() > 0:
            target["pair_boxes"][discard_o, 4:] = -1

        for pair_field in pair_fields:
            target[pair_field] = target[pair_field][keep_h]

    return cropped_image, target


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "pair_boxes" in target:
        pair_boxes = target["pair_boxes"]
        hboxes = pair_boxes[:, :4]
        oboxes = pair_boxes[:, 4:]

        # human flip
        hboxes = hboxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])

        # object flip
        obj_mask = (oboxes[:, 0] != -1)
        if obj_mask.sum() != 0:
            o_tmp = oboxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
            oboxes[obj_mask] = o_tmp[obj_mask]

        pair_boxes = torch.cat([hboxes, oboxes], dim=-1)
        target["pair_boxes"] = pair_boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "pair_boxes" in target:
        hboxes = target["pair_boxes"][:, :4]
        scaled_hboxes = hboxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        hboxes = scaled_hboxes

        oboxes = target["pair_boxes"][:, 4:]
        obj_mask = (oboxes[:, 0] != -1)
        if obj_mask.sum() != 0:
            scaled_oboxes = oboxes[obj_mask] * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
            oboxes[obj_mask] = scaled_oboxes

        target["pair_boxes"] = torch.cat([hboxes, oboxes], dim=-1)

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target['masks'] = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5

    return rescaled_image, target


def pad(image, target, padding):
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):
        w = random.randint(self.min_size, min(img.width, self.max_size))
        h = random.randint(self.min_size, min(img.height, self.max_size))
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):

    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes

        if "pair_boxes" in target:
            hboxes = target["pair_boxes"][:, :4]
            hboxes = box_xyxy_to_cxcywh(hboxes)
            hboxes = hboxes / torch.tensor([w, h, w, h], dtype=torch.float32)

            oboxes = target["pair_boxes"][:, 4:]
            obj_mask = (oboxes[:, 0] != -1)
            if obj_mask.sum() != 0:
                oboxes[obj_mask] = box_xyxy_to_cxcywh(oboxes[obj_mask])
                oboxes[obj_mask] = oboxes[obj_mask] / torch.tensor([w, h, w, h], dtype=torch.float32)

            pair_boxes = torch.cat([hboxes, oboxes], dim=-1)
            target["pair_boxes"] = pair_boxes

        return image, target


class ColorJitter(object):
    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
        self.color_jitter = T.ColorJitter(brightness, contrast, saturation, hue)

    def __call__(self, img, target):
        return self.color_jitter(img), target


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
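These transforms follow DETR's paired (image, target) convention, so they compose directly. A minimal sketch of building a pipeline with them (the `make_vcoco_transforms` name and the concrete size lists are illustrative, not taken from this commit):

# illustrative sketch, not part of the commit
import hotr.data.transforms.transforms as T

def make_vcoco_transforms(image_set):
    # ImageNet statistics, as used by the DETR family of models
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.RandomResize(scales, max_size=1333),
            normalize,
        ])
    return T.Compose([
        T.RandomResize([800], max_size=1333),
        normalize,
    ])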
hotr/engine/__init__.py
ADDED
@@ -0,0 +1,14 @@
from .evaluator_vcoco import vcoco_evaluate, vcoco_accumulate
from .evaluator_hico import hico_evaluate

def hoi_evaluator(args, model, criterion, postprocessors, data_loader, device, thr=0):
    if args.dataset_file == 'vcoco':
        return vcoco_evaluate(model, criterion, postprocessors, data_loader, device, args.output_dir, thr, args=args)
    elif args.dataset_file == 'hico-det':
        return hico_evaluate(model, postprocessors, data_loader, device, thr, args=args)
    else: raise NotImplementedError

def hoi_accumulator(args, total_res, print_results=False, wandb=False):
    if args.dataset_file == 'vcoco':
        return vcoco_accumulate(total_res, args, print_results, wandb)
    else: raise NotImplementedError
hotr/engine/arg_parser.py
ADDED
@@ -0,0 +1,163 @@
# ------------------------------------------------------------------------
# HOTR official code : engine/arg_parser.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# Modified arguments are represented with *
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import argparse
import hotr.util.misc as utils

def get_args_parser():
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
    parser.add_argument('--lr', default=1e-4, type=float)
    parser.add_argument('--lr_backbone', default=1e-5, type=float)
    parser.add_argument('--batch_size', default=2, type=int)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--epochs', default=100, type=int)
    parser.add_argument('--lr_drop', default=80, type=int)
    parser.add_argument('--clip_max_norm', default=0.1, type=float,
                        help='gradient clipping max norm')

    # DETR Model parameters
    parser.add_argument('--frozen_weights', type=str, default=None,
                        help="Path to the pretrained model. If set, only the mask head will be trained")
    parser.add_argument('--pretrain_interaction_tf', type=str, default=None,
                        help="Path to pretrained weights for the interaction transformer")

    # DETR Backbone
    parser.add_argument('--backbone', default='resnet50', type=str,
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--dilation', action='store_true',
                        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'),
                        help="Type of positional embedding to use on top of the image features")

    # DETR Transformer (= Encoder, Instance Decoder)
    parser.add_argument('--enc_layers', default=6, type=int,
                        help="Number of encoding layers in the transformer")
    parser.add_argument('--dec_layers', default=6, type=int,
                        help="Number of decoding layers in the transformer")
    parser.add_argument('--dim_feedforward', default=2048, type=int,
                        help="Intermediate size of the feedforward layers in the transformer blocks")
    parser.add_argument('--hidden_dim', default=256, type=int,
                        help="Size of the embeddings (dimension of the transformer)")
    parser.add_argument('--dropout', default=0.1, type=float,
                        help="Dropout applied in the transformer")
    parser.add_argument('--nheads', default=8, type=int,
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--num_queries', default=100, type=int,
                        help="Number of query slots")
    parser.add_argument('--pre_norm', action='store_true')
    parser.add_argument('--decoder_form', default=2, type=int,
                        help="1-decoder or 2-decoder")
    # Segmentation
    parser.add_argument('--masks', action='store_true',
                        help="Train segmentation head if the flag is provided")

    # Loss Option
    parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false',
                        help="Disables auxiliary decoding losses (loss at each layer)")

    # Loss coefficients (DETR)
    parser.add_argument('--mask_loss_coef', default=1, type=float)
    parser.add_argument('--dice_loss_coef', default=1, type=float)
    parser.add_argument('--bbox_loss_coef', default=5, type=float)
    parser.add_argument('--giou_loss_coef', default=2, type=float)
    parser.add_argument('--eos_coef', default=0.1, type=float,
                        help="Relative classification weight of the no-object class")

    # Matcher (DETR)
    parser.add_argument('--set_cost_class', default=1, type=float,
                        help="Class coefficient in the matching cost")
    parser.add_argument('--set_cost_bbox', default=5, type=float,
                        help="L1 box coefficient in the matching cost")
    parser.add_argument('--set_cost_giou', default=2, type=float,
                        help="giou box coefficient in the matching cost")

    # * HOI Detection
    parser.add_argument('--HOIDet', action='store_true',
                        help="Train HOI Detection head if the flag is provided")
    parser.add_argument('--share_enc', action='store_true',
                        help="Share the Encoder in DETR for HOI Detection if the flag is provided")
    parser.add_argument('--pretrained_dec', action='store_true',
                        help="Use Pre-trained Decoder in DETR for Interaction Decoder if the flag is provided")
    parser.add_argument('--hoi_enc_layers', default=1, type=int,
                        help="Number of encoding layers in the HOI transformer")
    parser.add_argument('--hoi_dec_layers', default=1, type=int,
                        help="Number of decoding layers in the HOI transformer")
    parser.add_argument('--hoi_nheads', default=8, type=int,
                        help="Number of attention heads in the HOI transformer")
    parser.add_argument('--hoi_dim_feedforward', default=2048, type=int,
                        help="Intermediate size of the feedforward layers in the HOI transformer")
    # parser.add_argument('--hoi_mode', type=str, default=None, help='[inst | pair | all]')
    parser.add_argument('--num_hoi_queries', default=100, type=int,
                        help="Number of Queries for Interaction Decoder")
    parser.add_argument('--hoi_aux_loss', action='store_true')


    # * HOTR Matcher
    parser.add_argument('--set_cost_idx', default=1, type=float,
                        help="IDX coefficient in the matching cost")
    parser.add_argument('--set_cost_act', default=1, type=float,
                        help="Action coefficient in the matching cost")
    parser.add_argument('--set_cost_tgt', default=1, type=float,
                        help="Target coefficient in the matching cost")

    # * HOTR Loss coefficients
    parser.add_argument('--temperature', default=0.05, type=float, help="temperature")
    parser.add_argument('--hoi_consistency_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_idx_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_idx_consistency_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_act_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_act_consistency_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_tgt_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_tgt_consistency_loss_coef', default=1, type=float)
    parser.add_argument('--hoi_eos_coef', default=0.1, type=float, help="Relative classification weight of the no-object class")

    parser.add_argument('--ramp_down_epoch', default=10000, type=int)
    parser.add_argument('--ramp_up_epoch', default=0, type=int)
    # consistency
    parser.add_argument('--use_consis', action='store_true', help='use consistency regularization')
    parser.add_argument('--share_dec_param', action='store_true', help='share decoder parameters of all stages')
    parser.add_argument('--augpath_name', type=utils.arg_as_list, default=[],
                        help='choose which augmented inference paths to use. (p2:x->HO->I, p3:x->HI->O, p4:x->OI->H)')
    parser.add_argument('--stop_grad_stage', action='store_true', help='Do not back-propagate loss to the previous stage')
    parser.add_argument('--path_id', default=0, type=int)

    parser.add_argument('--sep_enc_forward', action='store_true')

    # * dataset parameters
    parser.add_argument('--dataset_file', help='[coco | vcoco]')
    parser.add_argument('--data_path', type=str)
    parser.add_argument('--object_threshold', type=float, default=0, help='Threshold for object confidence')

    # machine parameters
    parser.add_argument('--output_dir', default='',
                        help='path where to save, empty for no saving')
    parser.add_argument('--custom_path', default='',
                        help="Data path for custom inference. Only required for custom_main.py")
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=42, type=int)
    parser.add_argument('--resume', default='', help='resume from checkpoint')
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--num_workers', default=2, type=int)

    # mode
    parser.add_argument('--eval', action='store_true', help="Only evaluate results if the flag is provided")
    parser.add_argument('--validate', action='store_true', help="Validate after every epoch")

    # distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')

    # * WanDB
    parser.add_argument('--wandb', action='store_true')
    parser.add_argument('--project_name', default='hotr_cpc')
    parser.add_argument('--group_name', default='mlv')
    parser.add_argument('--run_name', default='run_000001')
    return parser
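Since the parser is built with `add_help=False`, a driver script is expected to mount it as a parent parser. A minimal sketch of consuming it (the wrapper below is illustrative; the actual wiring lives in main.py):

# illustrative sketch, not part of the commit
import argparse
from hotr.engine.arg_parser import get_args_parser

parser = argparse.ArgumentParser('HOTR training and evaluation script',
                                 parents=[get_args_parser()])
args = parser.parse_args(['--dataset_file', 'vcoco',
                          '--data_path', 'v-coco',
                          '--HOIDet', '--hoi_aux_loss'])
print(args.num_hoi_queries, args.dataset_file)  # 100 vcoco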
hotr/engine/evaluator_coco.py
ADDED
@@ -0,0 +1,62 @@
import os
import torch
import hotr.util.misc as utils
import hotr.util.logger as loggers
from hotr.data.evaluators.coco_eval import CocoEvaluator

@torch.no_grad()
def coco_evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir):
    model.eval()
    criterion.eval()

    metric_logger = loggers.MetricLogger(delimiter="  ")
    metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))
    header = 'Evaluation'

    iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys())
    coco_evaluator = CocoEvaluator(base_ds, iou_types)
    print_freq = len(data_loader)
    # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75]

    print("\n>>> [MS-COCO Evaluation] <<<")
    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        loss_dict = criterion(outputs, targets)
        weight_dict = criterion.weight_dict

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_scaled = {k: v * weight_dict[k]
                                    for k, v in loss_dict_reduced.items() if k in weight_dict}
        loss_dict_reduced_unscaled = {f'{k}_unscaled': v
                                      for k, v in loss_dict_reduced.items()}
        metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()),
                             **loss_dict_reduced_scaled,
                             **loss_dict_reduced_unscaled)
        metric_logger.update(class_error=loss_dict_reduced['class_error'])

        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
        results = postprocessors['bbox'](outputs, orig_target_sizes)
        res = {target['image_id'].item(): output for target, output in zip(targets, results)}
        if coco_evaluator is not None:
            coco_evaluator.update(res)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("\n>>> [Averaged stats] <<<\n", metric_logger)
    if coco_evaluator is not None:
        coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    if coco_evaluator is not None:
        coco_evaluator.accumulate()
        coco_evaluator.summarize()
    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    if coco_evaluator is not None:
        if 'bbox' in postprocessors.keys():
            stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()

    return stats, coco_evaluator
hotr/engine/evaluator_hico.py
ADDED
@@ -0,0 +1,55 @@
import math
import os
import sys
from typing import Iterable
import numpy as np
import copy
import itertools

import torch

import hotr.util.misc as utils
import hotr.util.logger as loggers
from hotr.data.evaluators.hico_eval import HICOEvaluator

@torch.no_grad()
def hico_evaluate(model, postprocessors, data_loader, device, thr, args=None):
    model.eval()

    metric_logger = loggers.MetricLogger(mode="test", delimiter="  ")
    header = 'Evaluation Inference (HICO-DET)'

    preds = []
    gts = []
    indices = []
    hoi_recognition_time = []

    for samples, targets in metric_logger.log_every(data_loader, 50, header):
        samples = samples.to(device)
        targets = [{k: (v.to(device) if k != 'id' else v) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
        results = postprocessors['hoi'](outputs, orig_target_sizes, threshold=thr, dataset='hico-det', args=args)
        hoi_recognition_time.append(results[0]['hoi_recognition_time'] * 1000)

        preds.extend(list(itertools.chain.from_iterable(utils.all_gather(results))))
        # For avoiding a runtime error, the copy is used
        gts.extend(list(itertools.chain.from_iterable(utils.all_gather(copy.deepcopy(targets)))))

    print(f"[stats] HOI Recognition Time (avg) : {sum(hoi_recognition_time)/len(hoi_recognition_time):.4f} ms")

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()

    img_ids = [img_gts['id'] for img_gts in gts]
    _, indices = np.unique(img_ids, return_index=True)
    preds = [img_preds for i, img_preds in enumerate(preds) if i in indices]
    gts = [img_gts for i, img_gts in enumerate(gts) if i in indices]

    evaluator = HICOEvaluator(preds, gts, data_loader.dataset.rare_triplets,
                              data_loader.dataset.non_rare_triplets, data_loader.dataset.correct_mat)

    stats = evaluator.evaluate()

    return stats
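The de-duplication above relies on `np.unique(..., return_index=True)` returning the index of the first occurrence of each image id, so duplicates produced by the distributed all-gather are dropped. A toy illustration:

# illustrative sketch, not part of the commit
import numpy as np

img_ids = [7, 3, 7, 5, 3]                  # gathered ids may repeat across processes
_, indices = np.unique(img_ids, return_index=True)
print(indices)                             # [1 3 0] -> first occurrence of ids 3, 5, 7
keep = [x for i, x in enumerate(img_ids) if i in indices]
print(keep)                                # [7, 3, 5]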
hotr/engine/evaluator_vcoco.py
ADDED
@@ -0,0 +1,87 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/engine/evaluator_vcoco.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import os
import torch
import time
import datetime

import hotr.util.misc as utils
import hotr.util.logger as loggers
from hotr.data.evaluators.vcoco_eval import VCocoEvaluator
from hotr.util.box_ops import rescale_bboxes, rescale_pairs

import wandb

@torch.no_grad()
def vcoco_evaluate(model, criterion, postprocessors, data_loader, device, output_dir, thr, args=None):
    model.eval()
    criterion.eval()

    metric_logger = loggers.MetricLogger(mode="test", delimiter="  ")
    header = 'Evaluation Inference (V-COCO)'

    print_freq = 1  # len(data_loader)
    res = {}
    hoi_recognition_time = []

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        loss_dict = criterion(outputs, targets)
        loss_dict_reduced = utils.reduce_dict(loss_dict)  # ddp gathering

        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
        results = postprocessors['hoi'](outputs, orig_target_sizes, threshold=thr, dataset='vcoco', args=args)
        targets = process_target(targets, orig_target_sizes)
        hoi_recognition_time.append(results[0]['hoi_recognition_time'] * 1000)

        res.update(
            {target['image_id'].item():
                {'target': target, 'prediction': output} for target, output in zip(targets, results)
            }
        )
    print(f"[stats] HOI Recognition Time (avg) : {sum(hoi_recognition_time)/len(hoi_recognition_time):.4f} ms")

    start_time = time.time()
    gather_res = utils.all_gather(res)
    total_res = {}
    for dist_res in gather_res:
        total_res.update(dist_res)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"[stats] Distributed Gathering Time : {total_time_str}")

    return total_res

def vcoco_accumulate(total_res, args, print_results, wandb_log):
    vcoco_evaluator = VCocoEvaluator(args)
    vcoco_evaluator.update(total_res)
    print(f"[stats] Score Matrix Generation completed!!")

    scenario1 = vcoco_evaluator.role_eval1.evaluate(print_results)
    scenario2 = vcoco_evaluator.role_eval2.evaluate(print_results)

    if wandb_log:
        wandb.log({
            'scenario1': scenario1,
            'scenario2': scenario2
        })

    return scenario1, scenario2

def process_target(targets, target_sizes):
    for idx, (target, target_size) in enumerate(zip(targets, target_sizes)):
        labels = target['labels']
        valid_boxes_inds = (labels > 0)

        targets[idx]['boxes'] = rescale_bboxes(target['boxes'], target_size)            # boxes
        targets[idx]['pair_boxes'] = rescale_pairs(target['pair_boxes'], target_size)   # pairs

    return targets
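A typical call pattern for these two stages, sketched from how main.py presumably wires them together (the variables on the left are illustrative; a real model, criterion, postprocessors, and data loader are required):

# illustrative sketch, not part of the commit
total_res = vcoco_evaluate(model, criterion, postprocessors, data_loader,
                           device, output_dir='', thr=0, args=args)
scenario1, scenario2 = vcoco_accumulate(total_res, args, print_results=True, wandb_log=False)
print(f"Scenario 1 mAP: {scenario1:.2f} | Scenario 2 mAP: {scenario2:.2f}")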
hotr/engine/trainer.py
ADDED
@@ -0,0 +1,73 @@
# ------------------------------------------------------------------------
# HOTR official code : engine/trainer.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import math
import torch
import sys
import hotr.util.misc as utils
import hotr.util.logger as loggers
from hotr.util.ramp import *
from typing import Iterable
import wandb

def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_epoch: int,
                    ramp_up_epoch: int, rampdown_epoch: int, max_consis_coef: float = 1.0,
                    max_norm: float = 0, dataset_file: str = 'coco', log: bool = False):
    model.train()
    criterion.train()
    metric_logger = loggers.MetricLogger(mode="train", delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    space_fmt = str(len(str(max_epoch)))
    header = 'Epoch [{start_epoch: >{fill}}/{end_epoch}]'.format(start_epoch=epoch+1, end_epoch=max_epoch, fill=space_fmt)
    print_freq = int(len(data_loader)/5)

    # ramp the consistency-loss coefficient up, then down
    if epoch <= rampdown_epoch:
        consis_coef = sigmoid_rampup(epoch, ramp_up_epoch, max_consis_coef)
    else:
        consis_coef = cosine_rampdown(epoch-rampdown_epoch, max_epoch-rampdown_epoch, max_consis_coef)
    print(consis_coef)
    print(f"\n>>> Epoch #{(epoch+1)}")
    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        loss_dict = criterion(outputs, targets, log)
        # print(loss_dict)
        weight_dict = criterion.weight_dict

        losses = sum(loss_dict[k] * weight_dict[k] * consis_coef if 'consistency' in k else loss_dict[k] * weight_dict[k]
                     for k in loss_dict.keys() if k in weight_dict)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_unscaled = {f'{k}_unscaled': v
                                      for k, v in loss_dict_reduced.items()}
        loss_dict_reduced_scaled = {k: v * weight_dict[k] * consis_coef if 'consistency' in k else v * weight_dict[k]
                                    for k, v in loss_dict_reduced.items() if k in weight_dict}
        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
        loss_value = losses_reduced_scaled.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled)
        if "obj_class_error" in loss_dict:
            metric_logger.update(obj_class_error=loss_dict_reduced['obj_class_error'])
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    if utils.get_rank() == 0 and log: wandb.log(loss_dict_reduced_scaled)
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
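The consistency-loss weight follows a sigmoid ramp-up until `rampdown_epoch`, then a cosine ramp-down. The commit ships the ramp helpers in hotr/util/ramp.py, which is not shown in this section; the sketch below uses the standard formulations (assumed, not taken from that file), with signatures matching the calls above:

# illustrative sketch, not part of the commit
import numpy as np

def sigmoid_rampup(current, rampup_length, max_coef=1.0):
    # exp(-5 * (1 - t)^2) ramp, as popularized by temporal-ensembling work
    if rampup_length == 0:
        return max_coef
    t = np.clip(current / rampup_length, 0.0, 1.0)
    return max_coef * float(np.exp(-5.0 * (1.0 - t) ** 2))

def cosine_rampdown(current, rampdown_length, max_coef=1.0):
    # half-cosine decay from max_coef down to 0
    t = np.clip(current / rampdown_length, 0.0, 1.0)
    return max_coef * float(0.5 * (np.cos(np.pi * t) + 1.0))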
hotr/metrics/utils.py
ADDED
@@ -0,0 +1,90 @@
import torch
import numpy as np

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def compute_overlap(a, b):
    if type(a) == torch.Tensor:
        if len(a.shape) == 2:
            area = (b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1)

            iw = torch.min(a[:, 2].unsqueeze(dim=1), b[:, 2]) - torch.max(a[:, 0].unsqueeze(dim=1), b[:, 0])
            ih = torch.min(a[:, 3].unsqueeze(dim=1), b[:, 3]) - torch.max(a[:, 1].unsqueeze(dim=1), b[:, 1])

            iw[iw < 0] = 0
            ih[ih < 0] = 0

            ua = torch.unsqueeze((a[:, 2] - a[:, 0] + 1) * (a[:, 3] - a[:, 1] + 1), dim=1) + area - iw * ih
            ua[ua < 1e-8] = 1e-8

            intersection = iw * ih

            return intersection / ua

    elif type(a) == np.ndarray:
        if len(a.shape) == 2:
            area = np.expand_dims((b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1), axis=0)  # (1, K)

            iw = np.minimum(np.expand_dims(a[:, 2], axis=1), np.expand_dims(b[:, 2], axis=0)) \
                 - np.maximum(np.expand_dims(a[:, 0], axis=1), np.expand_dims(b[:, 0], axis=0)) \
                 + 1
            ih = np.minimum(np.expand_dims(a[:, 3], axis=1), np.expand_dims(b[:, 3], axis=0)) \
                 - np.maximum(np.expand_dims(a[:, 1], axis=1), np.expand_dims(b[:, 1], axis=0)) \
                 + 1

            iw[iw < 0] = 0  # (N, K)
            ih[ih < 0] = 0  # (N, K)

            intersection = iw * ih

            ua = np.expand_dims((a[:, 2] - a[:, 0] + 1) * (a[:, 3] - a[:, 1] + 1), axis=1) + area - intersection
            ua[ua < 1e-8] = 1e-8

            return intersection / ua

        elif len(a.shape) == 1:
            area = np.expand_dims((b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1), axis=0)  # (1, K)

            iw = np.minimum(np.expand_dims([a[2]], axis=1), np.expand_dims(b[:, 2], axis=0)) \
                 - np.maximum(np.expand_dims([a[0]], axis=1), np.expand_dims(b[:, 0], axis=0))
            ih = np.minimum(np.expand_dims([a[3]], axis=1), np.expand_dims(b[:, 3], axis=0)) \
                 - np.maximum(np.expand_dims([a[1]], axis=1), np.expand_dims(b[:, 1], axis=0))

            iw[iw < 0] = 0  # (N, K)
            ih[ih < 0] = 0  # (N, K)

            ua = np.expand_dims([(a[2] - a[0] + 1) * (a[3] - a[1] + 1)], axis=1) + area - iw * ih
            ua[ua < 1e-8] = 1e-8

            intersection = iw * ih

            return intersection / ua


def _compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    # Arguments
        recall: The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap
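A quick sanity check for `_compute_ap` on a toy curve: three detections sorted by score, a true positive, a false positive, then another true positive, against two annotations. The precision envelope keeps the best precision at each recall level, so the area works out to 0.5 * 1.0 + 0.5 * (2/3):

# illustrative sketch, not part of the commit
import numpy as np
from hotr.metrics.utils import _compute_ap

recall    = np.array([0.5, 0.5, 1.0])   # cumulative TP / #annotations
precision = np.array([1.0, 0.5, 2/3])   # cumulative TP / (TP + FP)
print(_compute_ap(recall, precision))   # 0.8333...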
hotr/metrics/vcoco/ap_agent.py
ADDED
@@ -0,0 +1,104 @@
import numpy as np
from hotr.metrics.utils import _compute_ap, compute_overlap
import pdb

class APAgent(object):
    def __init__(self, act_name, iou_threshold=0.5):
        self.act_name = act_name
        self.iou_threshold = iou_threshold

        self.fp = [np.zeros((0,))] * len(act_name)
        self.tp = [np.zeros((0,))] * len(act_name)
        self.score = [np.zeros((0,))] * len(act_name)
        self.num_ann = [0] * len(act_name)

    def add_data(self, box, act, cat, i_box, i_act):
        for label in range(len(self.act_name)):
            i_inds = (i_act[:, label] == 1)
            self.num_ann[label] += i_inds.sum()

        n_pred = box.shape[0]
        if n_pred == 0: return

        ######################
        valid_i_inds = (i_act[:, 0] != -1)  # (n_i, ) # both in COCO & V-COCO

        overlaps = compute_overlap(box, i_box)        # (n_pred, n_i)
        assigned_input = np.argmax(overlaps, axis=1)  # (n_pred, )
        v_inds = valid_i_inds[assigned_input]         # (n_pred, )

        n_valid = v_inds.sum()

        if n_valid == 0: return
        valid_box = box[v_inds]
        valid_act = act[v_inds]
        valid_cat = cat[v_inds]

        ######################
        s = valid_act * np.expand_dims(valid_cat, axis=1)  # (n_v, #act)

        for label in range(len(self.act_name)):
            inds = np.argsort(s[:, label])[::-1]  # (n_v, )
            self.score[label] = np.append(self.score[label], s[inds, label])

            correct_i_inds = (i_act[:, label] == 1)
            if correct_i_inds.sum() == 0:
                self.tp[label] = np.append(self.tp[label], np.array([0]*n_valid))
                self.fp[label] = np.append(self.fp[label], np.array([1]*n_valid))
                continue

            overlaps = compute_overlap(valid_box[inds], i_box)      # (n_v, n_i)
            assigned_input = np.argmax(overlaps, axis=1)            # (n_v, )
            max_overlap = overlaps[range(n_valid), assigned_input]  # (n_v, )

            iou_inds = (max_overlap > self.iou_threshold) & correct_i_inds[assigned_input]  # (n_v, )

            i_nonzero = iou_inds.nonzero()[0]
            i_inds = assigned_input[i_nonzero]
            i_iou = np.unique(i_inds, return_index=True)[1]
            i_tp = i_nonzero[i_iou]

            t = np.zeros(n_valid, dtype=np.uint8)
            t[i_tp] = 1
            f = 1 - t

            self.tp[label] = np.append(self.tp[label], t)
            self.fp[label] = np.append(self.fp[label], f)

    def evaluate(self):
        average_precisions = dict()
        for label in range(len(self.act_name)):
            if self.num_ann[label] == 0:
                average_precisions[label] = 0
                continue

            # sort by score
            indices = np.argsort(-self.score[label])
            self.fp[label] = self.fp[label][indices]
            self.tp[label] = self.tp[label][indices]

            # compute false positives and true positives
            self.fp[label] = np.cumsum(self.fp[label])
            self.tp[label] = np.cumsum(self.tp[label])

            # compute recall and precision
            recall = self.tp[label] / self.num_ann[label]
            precision = self.tp[label] / np.maximum(self.tp[label] + self.fp[label], np.finfo(np.float64).eps)

            # compute average precision
            average_precisions[label] = _compute_ap(recall, precision) * 100

        print('\n================== AP (Agent) ===================')
        s, n = 0, 0

        for label in range(len(self.act_name)):
            label_name = "_".join(self.act_name[label].split("_")[1:])
            print('{: >23}: AP = {:0.2f} (#pos = {:d})'.format(label_name, average_precisions[label], self.num_ann[label]))
            s += average_precisions[label]
            n += 1

        mAP = s/n
        print('| mAP(agent): {:0.2f}'.format(mAP))
        print('----------------------------------------------------')

        return mAP
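A minimal sketch of driving `APAgent` with toy arrays. The shapes follow the indexing in `add_data` (per-image predicted boxes, per-box action scores, detection confidences, plus ground-truth instance boxes and per-instance action labels); all names and values here are made up for illustration:

# illustrative sketch, not part of the commit
import numpy as np
from hotr.metrics.vcoco.ap_agent import APAgent

agent_ap = APAgent(['agent_hold', 'agent_sit'], iou_threshold=0.5)

box   = np.array([[10, 10, 50, 50]])   # one predicted human box (x1, y1, x2, y2)
act   = np.array([[0.9, 0.1]])         # per-action scores for that box
cat   = np.array([0.8])                # detection confidence
i_box = np.array([[12, 12, 48, 48]])   # ground-truth instance box
i_act = np.array([[1, 0]])             # the instance performs action 0 only

agent_ap.add_data(box, act, cat, i_box, i_act)
mAP = agent_ap.evaluate()              # prints per-action AP and mAP(agent)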
hotr/metrics/vcoco/ap_role.py
ADDED
@@ -0,0 +1,193 @@
import numpy as np
import torch
from hotr.metrics.utils import _compute_ap, compute_overlap

class APRole(object):
    def __init__(self, act_name, scenario_flag=True, iou_threshold=0.5):
        self.act_name = act_name
        self.iou_threshold = iou_threshold

        self.scenario_flag = scenario_flag
        # scenario_1 : True
        # scenario_2 : False

        self.fp = [np.zeros((0,))] * len(act_name)
        self.tp = [np.zeros((0,))] * len(act_name)
        self.score = [np.zeros((0,))] * len(act_name)
        self.num_ann = [0] * len(act_name)

    def add_data(self, h_box, o_box, score, i_box, i_act, p_box, p_act):
        # i_box, i_act : to check if only in COCO
        for label in range(len(self.act_name)):
            p_inds = (p_act[:, label] == 1)
            self.num_ann[label] += p_inds.sum()

        if h_box.shape[0] == 0: return  # if no prediction, just return
        # COCO (O), V-COCO (X) __or__ collator, no ann in image => ignore

        valid_i_inds = (i_act[:, 0] != -1)                 # (n_i, )
        overlaps = compute_overlap(h_box, i_box)           # (n_h, n_i)
        assigned_input = np.argmax(overlaps, axis=1)       # (n_h, )
        v_inds = valid_i_inds[assigned_input]              # (n_h, )

        h_box = h_box[v_inds]
        score = score[:, v_inds, :]
        if h_box.shape[0] == 0: return
        n_h = h_box.shape[0]

        valid_p_inds = (p_act[:, 0] != -1) | (p_box[:, 0] != -1)
        p_act = p_act[valid_p_inds]
        p_box = p_box[valid_p_inds]

        n_o = o_box.shape[0]
        if n_o == 0:
            # no prediction for object
            score = score.squeeze(axis=2)  # (#act, n_h)

            for label in range(len(self.act_name)):
                h_inds = np.argsort(score[label])[::-1]  # (n_h, )
                self.score[label] = np.append(self.score[label], score[label, h_inds])

                p_inds = (p_act[:, label] == 1)
                if p_inds.sum() == 0:
                    self.tp[label] = np.append(self.tp[label], np.array([0]*n_h))
                    self.fp[label] = np.append(self.fp[label], np.array([1]*n_h))
                    continue

                h_overlaps = compute_overlap(h_box[h_inds], p_box[p_inds, :4])  # (n_h, n_p)
                assigned_p = np.argmax(h_overlaps, axis=1)                      # (n_h, )
                h_max_overlap = h_overlaps[range(n_h), assigned_p]              # (n_h, )

                o_overlaps = compute_overlap(np.zeros((n_h, 4)), p_box[p_inds][assigned_p, 4:8])
                o_overlaps = np.diag(o_overlaps)  # (n_h, )

                no_role_inds = (p_box[p_inds][assigned_p, 4] == -1)  # (n_h, )
                # human (o), action (o), no object in actual image

                h_iou_inds = (h_max_overlap > self.iou_threshold)  # (n_h, )
                o_iou_inds = (o_overlaps > self.iou_threshold)     # (n_h, )

                # scenario1 is not considered (already no object)
                o_iou_inds[no_role_inds] = 1

                iou_inds = (h_iou_inds & o_iou_inds)
                p_nonzero = iou_inds.nonzero()[0]
                p_inds = assigned_p[p_nonzero]
                p_iou = np.unique(p_inds, return_index=True)[1]
                p_tp = p_nonzero[p_iou]

                t = np.zeros(n_h, dtype=np.uint8)
                t[p_tp] = 1
                f = 1 - t

                self.tp[label] = np.append(self.tp[label], t)
                self.fp[label] = np.append(self.fp[label], f)

        else:
            s_obj_argmax = np.argmax(score.reshape(-1, n_o), axis=1).reshape(-1, n_h)  # (#act, n_h)
            s_obj_max = np.max(score.reshape(-1, n_o), axis=1).reshape(-1, n_h)        # (#act, n_h)

            h_overlaps = compute_overlap(h_box, p_box[:, :4])  # (n_h, n_p)

            for label in range(len(self.act_name)):
                h_inds = np.argsort(s_obj_max[label])[::-1]  # (n_h, )
                self.score[label] = np.append(self.score[label], s_obj_max[label, h_inds])

                p_inds = (p_act[:, label] == 1)  # (n_p, )
                if p_inds.sum() == 0:
                    self.tp[label] = np.append(self.tp[label], np.array([0]*n_h))
                    self.fp[label] = np.append(self.fp[label], np.array([1]*n_h))
                    continue

                h_overlaps = compute_overlap(h_box[h_inds], p_box[:, :4])  # (n_h, n_p) # match for all hboxes
                h_max_overlap = np.max(h_overlaps, axis=1)                 # (n_h, )    # get the max overlap for hbox

                # for same human, multiple pairs exist. find the human box that has the same idx with max overlap hbox.
                h_max_temp = np.expand_dims(h_max_overlap, axis=1)
                h_over_thresh = (h_overlaps == h_max_temp)                       # (n_h, n_p)
                h_over_thresh = h_over_thresh & np.expand_dims(p_inds, axis=0)   # (n_h, n_p) # find only for current act

                h_valid = h_over_thresh.sum(axis=1) > 0  # (n_h, ) # at least one is True
                # h_valid -> if all is False, then argmax becomes 0. <- prevent
                assigned_p = np.argmax(h_over_thresh, axis=1)  # (n_h, ) # p only for current act

                o_mapping_box = o_box[s_obj_argmax[label]][h_inds]  # (n_h, ) # find where T is.
                p_mapping_box = p_box[assigned_p, 4:8]              # (n_h, 4)

                o_overlaps = compute_overlap(o_mapping_box, p_mapping_box)
                o_overlaps = np.diag(o_overlaps)  # (n_h, )
                o_overlaps.setflags(write=1)
                if (~h_valid).sum() > 0:
                    o_overlaps[~h_valid] = 0  # (n_h, )

                no_role_inds = (p_box[assigned_p, 4] == -1)  # (n_h, )
                nan_box_inds = np.all(o_mapping_box == 0, axis=1) | np.all(np.isnan(o_mapping_box), axis=1)
                no_role_inds = no_role_inds & h_valid
                nan_box_inds = nan_box_inds & h_valid

                h_iou_inds = (h_max_overlap > self.iou_threshold)  # (n_h, )
                o_iou_inds = (o_overlaps > self.iou_threshold)     # (n_h, )

                if self.scenario_flag:  # scenario_1
                    o_iou_inds[no_role_inds & nan_box_inds] = 1
                    o_iou_inds[no_role_inds & ~nan_box_inds] = 0
                else:  # scenario_2
                    o_iou_inds[no_role_inds] = 1

                iou_inds = (h_iou_inds & o_iou_inds)
                p_nonzero = iou_inds.nonzero()[0]
                p_inds = assigned_p[p_nonzero]
                p_iou = np.unique(p_inds, return_index=True)[1]
                p_tp = p_nonzero[p_iou]

                t = np.zeros(n_h, dtype=np.uint8)
                t[p_tp] = 1
                f = 1 - t

                self.tp[label] = np.append(self.tp[label], t)
                self.fp[label] = np.append(self.fp[label], f)

    def evaluate(self, print_log=False):
        average_precisions = dict()
        role_num = 1 if self.scenario_flag else 2
        for label in range(len(self.act_name)):

            # sort by score
            indices = np.argsort(-self.score[label])
            self.fp[label] = self.fp[label][indices]
            self.tp[label] = self.tp[label][indices]

            if self.num_ann[label] == 0:
                average_precisions[label] = 0
                continue

            # compute false positives and true positives
            self.fp[label] = np.cumsum(self.fp[label])
            self.tp[label] = np.cumsum(self.tp[label])

            # compute recall and precision
            recall = self.tp[label] / self.num_ann[label]
            precision = self.tp[label] / np.maximum(self.tp[label] + self.fp[label], np.finfo(np.float64).eps)

            # compute average precision
            average_precisions[label] = _compute_ap(recall, precision) * 100

        if print_log: print(f'\n============= AP (Role scenario_{role_num}) ==============')
        s, n = 0, 0

        for label in range(len(self.act_name)):
            if 'point' in self.act_name[label]:
                continue
            label_name = "_".join(self.act_name[label].split("_")[1:])
            if print_log: print('{: >23}: AP = {:0.2f} (#pos = {:d})'.format(label_name, average_precisions[label], self.num_ann[label]))
            if self.num_ann[label] != 0:
                s += average_precisions[label]
                n += 1

        mAP = s / n
        if print_log:
            print('| mAP(role scenario_{:d}): {:0.2f}'.format(role_num, mAP))
            print('----------------------------------------------------')

        return mAP
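For orientation, evaluate() above is the standard detection-AP recipe: sort all accumulated detections by score, take cumulative TP/FP counts, and integrate precision over recall. Below is a minimal, self-contained sketch of that bookkeeping, assuming an all-point interpolated AP (the exact interpolation rule lives in hotr.metrics.utils._compute_ap and may differ):

import numpy as np

def sketch_ap(scores, tp, num_ann):
    # sort detections by confidence, then accumulate TP/FP cumulatively,
    # mirroring the bookkeeping in APRole.evaluate
    order = np.argsort(-scores)
    tp = tp[order].astype(np.float64)
    fp = 1.0 - tp
    tp, fp = np.cumsum(tp), np.cumsum(fp)
    recall = tp / num_ann
    precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    # all-point interpolation: area under the precision envelope
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = max(mpre[i - 1], mpre[i])
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])

# three detections, two ground-truth pairs; the mid-confidence one is a miss
print(sketch_ap(np.array([0.9, 0.8, 0.3]), np.array([1, 0, 1]), num_ann=2))  # ~0.833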
hotr/models/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .detr import build

def build_model(args):
    return build(args)
hotr/models/backbone.py
ADDED
@@ -0,0 +1,118 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Backbone modules.
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List

from hotr.util.misc import NestedTensor, is_main_process

from .position_encoding import build_position_encoding


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = 1e-5
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):

    def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.requires_grad_(False)
        if return_interm_layers:
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
        else:
            return_layers = {'layer4': "0"}
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out


class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


def build_backbone(args):
    position_embedding = build_position_encoding(args)
    train_backbone = args.lr_backbone > 0
    return_interm_layers = False  # args.masks
    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model
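A hedged usage sketch for build_backbone. The backbone, lr_backbone, and dilation argument names are read directly above; hidden_dim and position_embedding are what DETR's build_position_encoding expects, so treat those two names as assumptions if this repo's position_encoding.py diverges:

from argparse import Namespace

import torch

from hotr.models.backbone import build_backbone
from hotr.util.misc import nested_tensor_from_tensor_list

# hidden_dim / position_embedding follow DETR's conventions (assumption)
args = Namespace(backbone='resnet50', lr_backbone=1e-5, dilation=False,
                 hidden_dim=256, position_embedding='sine')
backbone = build_backbone(args)

# one 3x480x640 image, padded into a NestedTensor together with its mask
images = nested_tensor_from_tensor_list([torch.rand(3, 480, 640)])
features, pos = backbone(images)
print(features[-1].tensors.shape)  # stride-32 C5 features, e.g. [1, 2048, 15, 20]
print(pos[-1].shape)               # positional encoding of the same spatial size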
hotr/models/criterion.py
ADDED
@@ -0,0 +1,349 @@
# ------------------------------------------------------------------------
# HOTR official code : main.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import torch
import torch.nn.functional as F
import copy
import numpy as np
import itertools
from torch import nn

from hotr.util import box_ops
from hotr.util.misc import (accuracy, get_world_size, is_dist_avail_and_initialized)

class SetCriterion(nn.Module):
    """ This class computes the loss for DETR.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
    """
    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, num_actions=None, HOI_losses=None, HOI_matcher=None, args=None):
        """ Create the criterion.
        Parameters:
            num_classes: number of object categories, omitting the special no-object category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            eos_coef: relative classification weight applied to the no-object category
            losses: list of all the losses to be applied. See get_loss for list of available losses.
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.losses = losses
        self.eos_coef = eos_coef

        self.HOI_losses = HOI_losses
        self.HOI_matcher = HOI_matcher
        # use consistency losses only when augmented paths are enabled
        self.use_consis = args.use_consis and len(args.augpath_name) > 0
        self.num_path = 1 + len(args.augpath_name)
        if args:
            self.HOI_eos_coef = args.hoi_eos_coef
            if args.dataset_file == 'vcoco':
                self.invalid_ids = args.invalid_ids
                self.valid_ids = np.concatenate((args.valid_ids, [-1]), axis=0)  # no interaction
            elif args.dataset_file == 'hico-det':
                self.invalid_ids = []
                self.valid_ids = list(range(num_actions)) + [-1]

            # for targets
            self.num_tgt_classes = len(args.valid_obj_ids)
            tgt_empty_weight = torch.ones(self.num_tgt_classes + 1)
            tgt_empty_weight[-1] = self.HOI_eos_coef
            self.register_buffer('tgt_empty_weight', tgt_empty_weight)
            self.dataset_file = args.dataset_file

        empty_weight = torch.ones(self.num_classes + 1)
        empty_weight[-1] = eos_coef
        self.register_buffer('empty_weight', empty_weight)

    #######################################################################################################################
    # * DETR Losses
    #######################################################################################################################
    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
        """Classification loss (NLL)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
        """
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']

        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
        target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device)
        target_classes[idx] = target_classes_o

        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
        losses = {'loss_ce': loss_ce}

        if log:
            # TODO this should probably be a separate loss, not hacked in this one here
            losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
        return losses

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
        """
        pred_logits = outputs['pred_logits']
        device = pred_logits.device
        tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
        losses = {'cardinality_error': card_err}
        return losses

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
        targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
        The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses


    #######################################################################################################################
    # * HOTR Losses
    #######################################################################################################################
    # >>> HOI Losses 1 : HO Pointer
    def loss_pair_labels(self, outputs, targets, hoi_indices, num_boxes, use_consis, log=False):
        assert ('pred_hidx' in outputs and 'pred_oidx' in outputs)
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
        nu, q, hd = outputs['pred_hidx'].shape
        src_hidx = outputs['pred_hidx'].view(self.num_path, nu // self.num_path, q, -1).transpose(0, 1).flatten(0, 1)
        src_oidx = outputs['pred_oidx'].view(self.num_path, nu // self.num_path, q, -1).transpose(0, 1).flatten(0, 1)
        hoi_ind = list(itertools.chain.from_iterable(hoi_indices))

        idx = self._get_src_permutation_idx(hoi_ind)

        target_hidx_classes = torch.full(src_hidx.shape[:2], -1, dtype=torch.int64, device=src_hidx.device)
        target_oidx_classes = torch.full(src_oidx.shape[:2], -1, dtype=torch.int64, device=src_oidx.device)

        # H Pointer loss
        target_classes_h = torch.cat([t["h_labels"][J] for t, hoi_indice in zip(targets, hoi_indices) for (_, J) in hoi_indice])
        target_hidx_classes[idx] = target_classes_h

        # O Pointer loss
        target_classes_o = torch.cat([t["o_labels"][J] for t, hoi_indice in zip(targets, hoi_indices) for (_, J) in hoi_indice])
        target_oidx_classes[idx] = target_classes_o

        loss_h = F.cross_entropy(src_hidx.transpose(1, 2), target_hidx_classes, ignore_index=-1)
        loss_o = F.cross_entropy(src_oidx.transpose(1, 2), target_oidx_classes, ignore_index=-1)

        # Consistency loss
        if use_consis:
            consistency_idxs = [self._get_consistency_src_permutation_idx(hoi_indice) for hoi_indice in hoi_indices]
            src_hidx_inputs = [F.softmax(src_hidx.view(-1, self.num_path, q, hd)[i][consistency_idx[0]], -1)
                               for i, consistency_idx in enumerate(consistency_idxs)]
            src_hidx_targets = [F.softmax(src_hidx.view(-1, self.num_path, q, hd)[i][consistency_idx[1]], -1)
                                for i, consistency_idx in enumerate(consistency_idxs)]
            src_oidx_inputs = [F.softmax(src_oidx.view(-1, self.num_path, q, hd)[i][consistency_idx[0]], -1)
                               for i, consistency_idx in enumerate(consistency_idxs)]
            src_oidx_targets = [F.softmax(src_oidx.view(-1, self.num_path, q, hd)[i][consistency_idx[1]], -1)
                                for i, consistency_idx in enumerate(consistency_idxs)]

            # symmetric KL between every matched pair of paths
            loss_h_consistency = [0.5 * (F.kl_div(src_hidx_input.log(), src_hidx_target.clone().detach(), reduction='batchmean')
                                         + F.kl_div(src_hidx_target.log(), src_hidx_input.clone().detach(), reduction='batchmean'))
                                  for src_hidx_input, src_hidx_target in zip(src_hidx_inputs, src_hidx_targets)]
            loss_o_consistency = [0.5 * (F.kl_div(src_oidx_input.log(), src_oidx_target.clone().detach(), reduction='batchmean')
                                         + F.kl_div(src_oidx_target.log(), src_oidx_input.clone().detach(), reduction='batchmean'))
                                  for src_oidx_input, src_oidx_target in zip(src_oidx_inputs, src_oidx_targets)]

            loss_h_consistency = torch.mean(torch.stack(loss_h_consistency))
            loss_o_consistency = torch.mean(torch.stack(loss_o_consistency))

            losses = {'loss_hidx': loss_h, 'loss_oidx': loss_o,
                      'loss_h_consistency': loss_h_consistency, 'loss_o_consistency': loss_o_consistency}
        else:
            losses = {'loss_hidx': loss_h, 'loss_oidx': loss_o}

        return losses

    # >>> HOI Losses 2 : pair actions
    def loss_pair_actions(self, outputs, targets, hoi_indices, num_boxes, use_consis):
        assert 'pred_actions' in outputs
        src_actions = outputs['pred_actions'].flatten(end_dim=1)
        hoi_ind = list(itertools.chain.from_iterable(hoi_indices))
        idx = self._get_src_permutation_idx(hoi_ind)

        # Construct Target --------------------------------------------------------------------------------------------------------------
        target_classes_o = torch.cat([t["pair_actions"][J] for t, hoi_indice in zip(targets, hoi_indices) for (_, J) in hoi_indice])
        target_classes = torch.full(src_actions.shape, 0, dtype=torch.float32, device=src_actions.device)
        target_classes[..., -1] = 1  # the last index for no-interaction is '1' if a label exists

        pos_classes = torch.full(target_classes[idx].shape, 0, dtype=torch.float32, device=src_actions.device)  # else, the last index for no-interaction is '0'
        pos_classes[:, :-1] = target_classes_o.float()
        target_classes[idx] = pos_classes
        # --------------------------------------------------------------------------------------------------------------------------------

        # Focal BCE Loss -----------------------------------------------------------------------------------------------------------------
        logits = src_actions.sigmoid()
        loss_bce = F.binary_cross_entropy(logits[..., self.valid_ids], target_classes[..., self.valid_ids], reduction='none')
        p_t = logits[..., self.valid_ids] * target_classes[..., self.valid_ids] + (1 - logits[..., self.valid_ids]) * (1 - target_classes[..., self.valid_ids])
        loss_bce = ((1 - p_t) ** 2 * loss_bce)
        alpha_t = 0.25 * target_classes[..., self.valid_ids] + (1 - 0.25) * (1 - target_classes[..., self.valid_ids])
        loss_focal = alpha_t * loss_bce
        loss_act = loss_focal.sum() / max(target_classes[..., self.valid_ids[:-1]].sum(), 1)
        # --------------------------------------------------------------------------------------------------------------------------------

        # Consistency loss
        if use_consis:
            consistency_idxs = [self._get_consistency_src_permutation_idx(hoi_indice) for hoi_indice in hoi_indices]
            src_action_inputs = [F.logsigmoid(outputs['pred_actions'][i][consistency_idx[0]]) for i, consistency_idx in enumerate(consistency_idxs)]
            src_action_targets = [F.logsigmoid(outputs['pred_actions'][i][consistency_idx[1]]) for i, consistency_idx in enumerate(consistency_idxs)]

            loss_action_consistency = [F.mse_loss(src_action_input, src_action_target)
                                       for src_action_input, src_action_target in zip(src_action_inputs, src_action_targets)]
            loss_action_consistency = torch.mean(torch.stack(loss_action_consistency))
            losses = {'loss_act': loss_act, 'loss_act_consistency': loss_action_consistency}
        else:
            losses = {'loss_act': loss_act}
        return losses

    # HOI Losses 3 : action targets
    def loss_pair_targets(self, outputs, targets, hoi_indices, num_interactions, use_consis, log=True):
        assert 'pred_obj_logits' in outputs
        src_logits = outputs['pred_obj_logits']
        nu, q, hd = outputs['pred_obj_logits'].shape
        hoi_ind = list(itertools.chain.from_iterable(hoi_indices))
        idx = self._get_src_permutation_idx(hoi_ind)

        target_classes_o = torch.cat([t['pair_targets'][J] for t, hoi_indice in zip(targets, hoi_indices) for (_, J) in hoi_indice])
        pad_tgt = -1  # src_logits.shape[2]-1
        target_classes = torch.full(src_logits.shape[:2], pad_tgt, dtype=torch.int64, device=src_logits.device)
        target_classes[idx] = target_classes_o

        loss_obj_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.tgt_empty_weight, ignore_index=-1)

        # Consistency loss
        if use_consis:
            consistency_idxs = [self._get_consistency_src_permutation_idx(hoi_indice) for hoi_indice in hoi_indices]
            src_logits_inputs = [F.softmax(src_logits.view(-1, self.num_path, q, hd)[i][consistency_idx[0]], -1)
                                 for i, consistency_idx in enumerate(consistency_idxs)]
            src_logits_targets = [F.softmax(src_logits.view(-1, self.num_path, q, hd)[i][consistency_idx[1]], -1)
                                  for i, consistency_idx in enumerate(consistency_idxs)]
            loss_tgt_consistency = [0.5 * (F.kl_div(src_logit_input.log(), src_logit_target.clone().detach(), reduction='batchmean')
                                           + F.kl_div(src_logit_target.log(), src_logit_input.clone().detach(), reduction='batchmean'))
                                    for src_logit_input, src_logit_target in zip(src_logits_inputs, src_logits_targets)]
            loss_tgt_consistency = torch.mean(torch.stack(loss_tgt_consistency))
            losses = {'loss_tgt': loss_obj_ce, "loss_tgt_label_consistency": loss_tgt_consistency}
        else:
            losses = {'loss_tgt': loss_obj_ce}
        if log:
            ignore_idx = (target_classes_o != -1)
            losses['obj_class_error'] = 100 - accuracy(src_logits[idx][ignore_idx, :-1], target_classes_o[ignore_idx])[0]
            # losses['obj_class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_consistency_src_permutation_idx(self, indices):
        # pair up, across decoding paths, the queries matched to the same GT interaction
        all_tgt = torch.cat([j for (_, j) in indices]).unique()
        path_idxs = [torch.cat([torch.tensor([i]) for i, (_, t) in enumerate(indices) if (t == tgt).any()]) for tgt in all_tgt]
        q_idxs = [torch.cat([s[t == tgt] for (s, t) in indices]) for tgt in all_tgt]
        path_idxs = torch.cat([torch.combinations(path_idx) for path_idx in path_idxs if len(path_idx) > 1])
        q_idxs = torch.cat([torch.combinations(q_idx) for q_idx in q_idxs if len(q_idx) > 1])

        return (path_idxs[:, 0], q_idxs[:, 0]), (path_idxs[:, 1], q_idxs[:, 1])

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    # *****************************************************************************
    # >>> DETR Losses
    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        loss_map = {
            'labels': self.loss_labels,
            'cardinality': self.loss_cardinality,
            'boxes': self.loss_boxes
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    # >>> HOTR Losses
    def get_HOI_loss(self, loss, outputs, targets, indices, num_boxes, use_consis, **kwargs):
        loss_map = {
            'pair_labels': self.loss_pair_labels,
            'pair_actions': self.loss_pair_actions
        }
        if self.dataset_file == 'hico-det': loss_map['pair_targets'] = self.loss_pair_targets
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, use_consis, **kwargs)
    # *****************************************************************************

    def forward(self, outputs, targets, log=False):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if (k != 'aux_outputs' and k != 'hoi_aux_outputs')}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        if self.HOI_losses is not None:
            input_targets = [copy.deepcopy(target) for target in targets]
            hoi_indices, hoi_targets = self.HOI_matcher(outputs_without_aux, input_targets, indices, log)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for i, aux_outputs in enumerate(outputs['aux_outputs']):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    if loss == 'masks':
                        # Intermediate masks losses are too costly to compute, we ignore them.
                        continue
                    kwargs = {}
                    if loss == 'labels':
                        # Logging is enabled only for the last layer
                        kwargs = {'log': False}
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                    losses.update(l_dict)

        # HOI detection losses
        if self.HOI_losses is not None:
            for loss in self.HOI_losses:
                losses.update(self.get_HOI_loss(loss, outputs, hoi_targets, hoi_indices, num_boxes, self.use_consis))
                # if self.dataset_file == 'hico-det': losses['loss_oidx'] += losses['loss_tgt']

            if 'hoi_aux_outputs' in outputs:
                for i, aux_outputs in enumerate(outputs['hoi_aux_outputs']):
                    input_targets = [copy.deepcopy(target) for target in targets]
                    hoi_indices, targets_for_aux = self.HOI_matcher(aux_outputs, input_targets, indices, log)
                    for loss in self.HOI_losses:
                        kwargs = {}
                        if loss == 'pair_targets': kwargs = {'log': False}  # Logging is enabled only for the last layer
                        l_dict = self.get_HOI_loss(loss, aux_outputs, hoi_targets, hoi_indices, num_boxes, self.use_consis, **kwargs)
                        l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                        losses.update(l_dict)
                        # if self.dataset_file == 'hico-det': losses[f'loss_oidx_{i}'] += losses[f'loss_tgt_{i}']

        return losses
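The hand-rolled modulation in loss_pair_actions (alpha_t * (1 - p_t)**2 * BCE with alpha = 0.25) is exactly sigmoid focal loss with gamma = 2. A quick numerical check against torchvision's reference implementation (torchvision >= 0.8 assumed):

import torch
import torch.nn.functional as F
from torchvision.ops import sigmoid_focal_loss

logits = torch.randn(4, 10)
targets = torch.randint(0, 2, (4, 10)).float()

# the same computation as loss_pair_actions, done on probabilities
p = logits.sigmoid()
bce = F.binary_cross_entropy(p, targets, reduction='none')
p_t = p * targets + (1 - p) * (1 - targets)
alpha_t = 0.25 * targets + 0.75 * (1 - targets)
manual = alpha_t * (1 - p_t) ** 2 * bce

reference = sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0, reduction='none')
print(torch.allclose(manual, reference, atol=1e-6))  # True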
hotr/models/detr.py
ADDED
@@ -0,0 +1,187 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/models/detr.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
"""
DETR & HOTR model and criterion classes.
"""
import torch
import torch.nn.functional as F
from torch import nn

from hotr.util.misc import (NestedTensor, nested_tensor_from_tensor_list)

from .backbone import build_backbone
from .detr_matcher import build_matcher
from .hotr_matcher import build_hoi_matcher
from .transformer import build_transformer, build_hoi_transformer
from .criterion import SetCriterion
from .post_process import PostProcess
from .feed_forward import MLP

from .hotr import HOTR
from .hotr_v1 import HOTR_V1

class DETR(nn.Module):
    """ This is the DETR module that performs object detection """
    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss

    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]


def build(args):
    device = torch.device(args.device)

    backbone = build_backbone(args)

    transformer = build_transformer(args)

    model = DETR(
        backbone,
        transformer,
        num_classes=args.num_classes,
        num_queries=args.num_queries,
        aux_loss=args.aux_loss,
    )

    matcher = build_matcher(args)
    weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef}
    weight_dict['loss_giou'] = args.giou_loss_coef

    # TODO this is a hack
    if args.aux_loss:
        aux_weight_dict = {}
        for i in range(args.dec_layers - 1):
            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
        weight_dict.update(aux_weight_dict)

    losses = ['labels', 'boxes', 'cardinality'] if args.frozen_weights is None else []
    if args.HOIDet:
        hoi_matcher = build_hoi_matcher(args)
        hoi_losses = []
        hoi_losses.append('pair_labels')
        hoi_losses.append('pair_actions')
        if args.dataset_file == 'hico-det': hoi_losses.append('pair_targets')

        hoi_weight_dict = {}
        hoi_weight_dict['loss_hidx'] = args.hoi_idx_loss_coef
        hoi_weight_dict['loss_oidx'] = args.hoi_idx_loss_coef
        hoi_weight_dict['loss_h_consistency'] = args.hoi_idx_consistency_loss_coef
        hoi_weight_dict['loss_o_consistency'] = args.hoi_idx_consistency_loss_coef
        hoi_weight_dict['loss_act'] = args.hoi_act_loss_coef
        hoi_weight_dict['loss_act_consistency'] = args.hoi_act_consistency_loss_coef
        if args.dataset_file == 'hico-det':
            hoi_weight_dict['loss_tgt'] = args.hoi_tgt_loss_coef
            hoi_weight_dict['loss_tgt_consistency'] = args.hoi_tgt_consistency_loss_coef
        if args.hoi_aux_loss:
            hoi_aux_weight_dict = {}
            for i in range(args.hoi_dec_layers):
                hoi_aux_weight_dict.update({k + f'_{i}': v for k, v in hoi_weight_dict.items()})
            hoi_weight_dict.update(hoi_aux_weight_dict)

        criterion = SetCriterion(args.num_classes, matcher=matcher, weight_dict=hoi_weight_dict,
                                 eos_coef=args.eos_coef, losses=losses, num_actions=args.num_actions,
                                 HOI_losses=hoi_losses, HOI_matcher=hoi_matcher, args=args)

        interaction_transformer = build_hoi_transformer(args)  # if (args.share_enc and args.pretrained_dec) else None

        kwargs = {}
        if args.dataset_file == 'hico-det': kwargs['return_obj_class'] = args.valid_obj_ids
        if args.sep_enc_forward:
            model = HOTR_V1(
                detr=model,
                num_hoi_queries=args.num_hoi_queries,
                num_actions=args.num_actions,
                interaction_transformer=interaction_transformer,
                augpath_name=args.augpath_name,
                share_dec_param=args.share_dec_param,
                stop_grad_stage=args.stop_grad_stage,
                freeze_detr=(args.frozen_weights is not None),
                share_enc=args.share_enc,
                pretrained_dec=args.pretrained_dec,
                temperature=args.temperature,
                hoi_aux_loss=args.hoi_aux_loss,
                **kwargs  # only return verb class for HICO-DET dataset
            )
        else:
            model = HOTR(
                detr=model,
                num_hoi_queries=args.num_hoi_queries,
                num_actions=args.num_actions,
                interaction_transformer=interaction_transformer,
                augpath_name=args.augpath_name,
                share_dec_param=args.share_dec_param,
                stop_grad_stage=args.stop_grad_stage,
                freeze_detr=(args.frozen_weights is not None),
                share_enc=args.share_enc,
                pretrained_dec=args.pretrained_dec,
                temperature=args.temperature,
                hoi_aux_loss=args.hoi_aux_loss,
                **kwargs  # only return verb class for HICO-DET dataset
            )
        postprocessors = {'hoi': PostProcess(args.HOIDet)}
    else:
        criterion = SetCriterion(args.num_classes, matcher=matcher, weight_dict=weight_dict,
                                 eos_coef=args.eos_coef, losses=losses)
        postprocessors = {'bbox': PostProcess(args.HOIDet)}
    criterion.to(device)

    return model, criterion, postprocessors
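The aux-loss "hack" above just replicates every loss weight once per intermediate decoder layer, matching the _{i}-suffixed keys the criterion emits for 'aux_outputs' / 'hoi_aux_outputs'. A standalone illustration of that expansion, with made-up placeholder coefficients:

weight_dict = {'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2}  # placeholder values
dec_layers = 6
aux_weight_dict = {}
for i in range(dec_layers - 1):
    aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
print(sorted(weight_dict))  # loss_bbox, loss_bbox_0, ..., loss_giou_4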
hotr/models/detr_matcher.py
ADDED
@@ -0,0 +1,81 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
from scipy.optimize import linear_sum_assignment
from torch import nn

from hotr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network
    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
        """Creates the matcher
        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching
        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)               # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]


def build_matcher(args):
    return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou)
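A minimal sketch of calling the matcher on dummy tensors. The class count and cost weights here are illustrative only; boxes are normalized (cx, cy, w, h), as the matcher expects:

import torch
from hotr.models.detr_matcher import HungarianMatcher

matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
outputs = {
    'pred_logits': torch.randn(2, 100, 92),  # 2 images, 100 queries, 91 classes + no-object
    'pred_boxes': torch.rand(2, 100, 4),     # normalized (cx, cy, w, h)
}
targets = [
    {'labels': torch.tensor([3, 17]), 'boxes': torch.rand(2, 4)},
    {'labels': torch.tensor([5]), 'boxes': torch.rand(1, 4)},
]
indices = matcher(outputs, targets)
# one (pred_idx, tgt_idx) pair of index tensors per image
print([(i.tolist(), j.tolist()) for i, j in indices])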
hotr/models/feed_forward.py
ADDED
@@ -0,0 +1,16 @@
import torch.nn.functional as F
from torch import nn

class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
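A quick shape check: HOTR builds its 3-layer pointer and box heads from this MLP, applied over (batch, queries, hidden) tensors:

import torch
from hotr.models.feed_forward import MLP

bbox_head = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
queries = torch.randn(2, 100, 256)   # (batch, num_queries, hidden_dim)
print(bbox_head(queries).shape)      # torch.Size([2, 100, 4])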
hotr/models/hotr.py
ADDED
@@ -0,0 +1,241 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/models/hotr.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
import time
import datetime

from hotr.util.misc import NestedTensor, nested_tensor_from_tensor_list
from .feed_forward import MLP

class HOTR(nn.Module):
    def __init__(self, detr,
                 num_hoi_queries,
                 num_actions,
                 interaction_transformer,
                 augpath_name,
                 share_dec_param,
                 stop_grad_stage,
                 freeze_detr,
                 share_enc,
                 pretrained_dec,
                 temperature,
                 hoi_aux_loss,
                 return_obj_class=None):
        super().__init__()

        # * Instance Transformer ---------------
        self.detr = detr
        if freeze_detr:
            # if this flag is given, freeze the object detection related parameters of DETR
            for p in self.parameters():
                p.requires_grad_(False)
        hidden_dim = detr.transformer.d_model
        # --------------------------------------

        # * Interaction Transformer -----------------------------------------
        self.num_queries = num_hoi_queries
        self.query_embed = nn.Embedding(self.num_queries, hidden_dim)
        self.H_Pointer_embed = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
        self.O_Pointer_embed = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
        self.action_embed = nn.Linear(hidden_dim, num_actions + 1)
        # --------------------------------------------------------------------

        # * HICO-DET FFN heads -----------------------------------------------
        self.return_obj_class = (return_obj_class is not None)
        if return_obj_class: self._valid_obj_ids = return_obj_class + [return_obj_class[-1] + 1]
        # --------------------------------------------------------------------

        # * Transformer Options ----------------------------------------------
        self.interaction_transformer = interaction_transformer

        if share_enc: # share encoder
            self.interaction_transformer.encoder = detr.transformer.encoder

        if pretrained_dec: # free variables for interaction decoder
            self.interaction_transformer.decoder = copy.deepcopy(detr.transformer.decoder)
            for p in self.interaction_transformer.decoder.parameters():
                p.requires_grad_(True)
        # --------------------------------------------------------------------

        # * Augmented paths ----------------------------------------------------
        self.aug_paths = augpath_name

        if 'p2' in augpath_name:
            if not share_dec_param:
                self.xtoHO_interaction_decoder = copy.deepcopy(self.interaction_transformer.decoder)
                self.HOtoI_interaction_decoder = copy.deepcopy(self.interaction_transformer.decoder)
            else:
                self.xtoHO_interaction_decoder = self.interaction_transformer.decoder
                self.HOtoI_interaction_decoder = self.interaction_transformer.decoder

            self.query_embed_HOtoI = nn.Embedding(self.num_queries, hidden_dim)
            self.query_embed_HOtoI2 = nn.Embedding(self.num_queries, hidden_dim)
            self.H_Pointer_embed_HOtoI = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
            self.O_Pointer_embed_HOtoI = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
            self.action_embed_HOtoI = nn.Linear(hidden_dim, num_actions + 1)

        if 'p3' in augpath_name:
            if not share_dec_param:
                self.xtoHI_interaction_decoder = copy.deepcopy(self.interaction_transformer.decoder)
                self.HItoO_interaction_decoder = copy.deepcopy(self.interaction_transformer.decoder)
            else:
                self.xtoHI_interaction_decoder = self.interaction_transformer.decoder
                self.HItoO_interaction_decoder = self.interaction_transformer.decoder

            self.query_embed_HItoO = nn.Embedding(self.num_queries, hidden_dim)
            self.query_embed_HItoO2 = nn.Embedding(self.num_queries, hidden_dim)
            self.H_Pointer_embed_HItoO = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
            self.O_Pointer_embed_HItoO = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
            self.action_embed_HItoO = nn.Linear(hidden_dim, num_actions + 1)

        if 'p4' in augpath_name:
            if not share_dec_param:
                self.xtoOI_interaction_decoder = copy.deepcopy(self.interaction_transformer.decoder)
                self.OItoH_interaction_decoder = copy.deepcopy(self.interaction_transformer.decoder)
            else:
                self.xtoOI_interaction_decoder = self.interaction_transformer.decoder
                self.OItoH_interaction_decoder = self.interaction_transformer.decoder

            self.query_embed_OItoH = nn.Embedding(self.num_queries, hidden_dim)
            self.query_embed_OItoH2 = nn.Embedding(self.num_queries, hidden_dim)
            self.H_Pointer_embed_OItoH = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
            self.O_Pointer_embed_OItoH = MLP(hidden_dim, hidden_dim, hidden_dim, 3)
            self.action_embed_OItoH = nn.Linear(hidden_dim, num_actions + 1)

        self.stop_grad_stage = stop_grad_stage

        # * Loss Options -------------------
        self.tau = temperature
        self.hoi_aux_loss = hoi_aux_loss
        # ----------------------------------

    def forward(self, samples: NestedTensor):
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)

        # >>>>>>>>>>>> BACKBONE LAYERS <<<<<<<<<<<<<<<
        features, pos = self.detr.backbone(samples)
        bs = features[-1].tensors.shape[0]
        src, mask = features[-1].decompose()
        assert mask is not None
        # ----------------------------------------------

        # >>>>>>>>>>>> OBJECT DETECTION LAYERS <<<<<<<<<<
        start_time = time.time()
        hs, memory = self.detr.transformer(self.detr.input_proj(src), mask, self.detr.query_embed.weight, pos[-1])
        inst_repr = F.normalize(hs[-1], p=2, dim=2) # instance representations

        # Prediction Heads for Object Detection
        outputs_class = self.detr.class_embed(hs)
        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
        object_detection_time = time.time() - start_time
        # -----------------------------------------------

        # >>>>>>>>>>>> HOI DETECTION LAYERS <<<<<<<<<<<<<<<
        start_time = time.time()
        assert hasattr(self, 'interaction_transformer'), "Missing Interaction Transformer."
        H_Pointer_reprs_bag, O_Pointer_reprs_bag, outputs_action = [], [], []
        # main path P1
        interaction_hs = self.interaction_transformer(self.detr.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] # interaction representations
        H_Pointer_reprs_bag.append(F.normalize(self.H_Pointer_embed(interaction_hs), p=2, dim=-1))
        O_Pointer_reprs_bag.append(F.normalize(self.O_Pointer_embed(interaction_hs), p=2, dim=-1))
        outputs_action.append(self.action_embed(interaction_hs))

        if len(self.aug_paths) != 0:
            pos_aug = pos[-1].flatten(2).permute(2, 0, 1)
            mask_aug = mask.flatten(1)

            # P2 (x->HO->I)
            if 'p2' in self.aug_paths:
                tgt_2 = torch.zeros_like(self.query_embed_HOtoI.weight.unsqueeze(1).repeat(1, bs, 1))
                hs_HOtoI = self.xtoHO_interaction_decoder(tgt_2, memory, memory_key_padding_mask=mask_aug, pos=pos_aug, query_pos=self.query_embed_HOtoI.weight.unsqueeze(1).repeat(1, bs, 1)).transpose(1, 2)
                tgt_HOtoI = hs_HOtoI.transpose(1, 2)[-1] if not self.stop_grad_stage else hs_HOtoI.clone().detach().transpose(1, 2)[-1]
                hs2_HOtoI = self.HOtoI_interaction_decoder(tgt_HOtoI, memory, memory_key_padding_mask=mask_aug, pos=pos_aug, query_pos=self.query_embed_HOtoI2.weight.unsqueeze(1).repeat(1, bs, 1)).transpose(1, 2)
                H_Pointer_reprs_bag.append(F.normalize(self.H_Pointer_embed_HOtoI(hs_HOtoI), p=2, dim=-1))
                O_Pointer_reprs_bag.append(F.normalize(self.O_Pointer_embed_HOtoI(hs_HOtoI), p=2, dim=-1))
                outputs_action.append(self.action_embed_HOtoI(hs2_HOtoI))
            # P3 (x->HI->O)
            if 'p3' in self.aug_paths:
                tgt_3 = torch.zeros_like(self.query_embed_HItoO.weight.unsqueeze(1).repeat(1, bs, 1))
                hs_HItoO = self.xtoHI_interaction_decoder(tgt_3, memory, memory_key_padding_mask=mask_aug, pos=pos_aug, query_pos=self.query_embed_HItoO.weight.unsqueeze(1).repeat(1, bs, 1)).transpose(1, 2)
                tgt_HItoO = hs_HItoO.transpose(1, 2)[-1] if not self.stop_grad_stage else hs_HItoO.clone().detach().transpose(1, 2)[-1]
                hs2_HItoO = self.HItoO_interaction_decoder(tgt_HItoO, memory, memory_key_padding_mask=mask_aug, pos=pos_aug, query_pos=self.query_embed_HItoO2.weight.unsqueeze(1).repeat(1, bs, 1)).transpose(1, 2)
                H_Pointer_reprs_bag.append(F.normalize(self.H_Pointer_embed_HItoO(hs_HItoO), p=2, dim=-1))
                O_Pointer_reprs_bag.append(F.normalize(self.O_Pointer_embed_HItoO(hs2_HItoO), p=2, dim=-1))
                outputs_action.append(self.action_embed_HItoO(hs_HItoO))
            # P4 (x->OI->H)
            if 'p4' in self.aug_paths:
                tgt_4 = torch.zeros_like(self.query_embed_OItoH.weight.unsqueeze(1).repeat(1, bs, 1))
                hs_OItoH = self.xtoOI_interaction_decoder(tgt_4, memory, memory_key_padding_mask=mask_aug, pos=pos_aug, query_pos=self.query_embed_OItoH.weight.unsqueeze(1).repeat(1, bs, 1)).transpose(1, 2)
                tgt_OItoH = hs_OItoH.transpose(1, 2)[-1] if not self.stop_grad_stage else hs_OItoH.clone().detach().transpose(1, 2)[-1]
                hs2_OItoH = self.OItoH_interaction_decoder(tgt_OItoH, memory, memory_key_padding_mask=mask_aug, pos=pos_aug, query_pos=self.query_embed_OItoH2.weight.unsqueeze(1).repeat(1, bs, 1)).transpose(1, 2)
                H_Pointer_reprs_bag.append(F.normalize(self.H_Pointer_embed_OItoH(hs2_OItoH), p=2, dim=-1))
                O_Pointer_reprs_bag.append(F.normalize(self.O_Pointer_embed_OItoH(hs_OItoH), p=2, dim=-1))
                outputs_action.append(self.action_embed_OItoH(hs_OItoH))

        inst_repr_all = inst_repr.transpose(1, 2).repeat(1 + len(self.aug_paths), 1, 1)

        H_Pointer_reprs_bag = torch.cat(H_Pointer_reprs_bag, 1)
        O_Pointer_reprs_bag = torch.cat(O_Pointer_reprs_bag, 1)

        outputs_hidx = [(torch.bmm(H_Pointer_repr, inst_repr_all)) / self.tau for H_Pointer_repr in H_Pointer_reprs_bag] # (dec_layer, (1+len(aug))*bs, dec_q, hidden_dim)
        outputs_oidx = [(torch.bmm(O_Pointer_repr, inst_repr_all)) / self.tau for O_Pointer_repr in O_Pointer_reprs_bag]

        outputs_action = torch.stack(outputs_action, dim=2) # (dec_layer, bs, 1+#aug, dec_q, #action)

        # --------------------------------------------------
        hoi_detection_time = time.time() - start_time
        hoi_recognition_time = max(hoi_detection_time - object_detection_time, 0)
        # -------------------------------------------------------------------

        # [Target Classification]
        if self.return_obj_class:
            detr_logits = outputs_class[-1, ..., self._valid_obj_ids]
            o_indices = [output_oidx.max(-1)[-1].view(1 + len(self.aug_paths), bs, self.num_queries).transpose(0, 1) for output_oidx in outputs_oidx]
            obj_logit_stack = [torch.stack([detr_logits[batch_, o_idx, :] for batch_, o_idc in enumerate(o_indice) for o_idx in o_idc], 0) for o_indice in o_indices]
            outputs_obj_class = obj_logit_stack

        out = {
            "pred_logits": outputs_class[-1],
            "pred_boxes": outputs_coord[-1],
            "pred_hidx": outputs_hidx[-1],
            "pred_oidx": outputs_oidx[-1],
            "pred_actions": outputs_action[-1],
            "hoi_recognition_time": hoi_recognition_time,
        }

        if self.return_obj_class: out["pred_obj_logits"] = outputs_obj_class[-1]

        if self.hoi_aux_loss: # auxiliary loss
            out['hoi_aux_outputs'] = \
                self._set_aux_loss_with_tgt(outputs_class, outputs_coord, outputs_hidx, outputs_oidx, outputs_action, outputs_obj_class) \
                if self.return_obj_class else \
                self._set_aux_loss(outputs_class, outputs_coord, outputs_hidx, outputs_oidx, outputs_action)

        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord, outputs_hidx, outputs_oidx, outputs_action):
        return [{'pred_logits': a, 'pred_boxes': b, 'pred_hidx': c, 'pred_oidx': d, 'pred_actions': e}
                for a, b, c, d, e in zip(
                    outputs_class[-1:].repeat((outputs_action.shape[0], 1, 1, 1)),
                    outputs_coord[-1:].repeat((outputs_action.shape[0], 1, 1, 1)),
                    outputs_hidx[:-1],
                    outputs_oidx[:-1],
                    outputs_action[:-1])]

    @torch.jit.unused
    def _set_aux_loss_with_tgt(self, outputs_class, outputs_coord, outputs_hidx, outputs_oidx, outputs_action, outputs_tgt):
        return [{'pred_logits': a, 'pred_boxes': b, 'pred_hidx': c, 'pred_oidx': d, 'pred_actions': e, 'pred_obj_logits': f}
                for a, b, c, d, e, f in zip(
                    outputs_class[-1:].repeat((outputs_action.shape[0], 1, 1, 1)),
                    outputs_coord[-1:].repeat((outputs_action.shape[0], 1, 1, 1)),
                    outputs_hidx[:-1],
                    outputs_oidx[:-1],
                    outputs_action[:-1],
                    outputs_tgt[:-1])]
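For reference, a minimal standalone sketch of the pointer mechanism used in HOTR.forward above: each HOI query emits H/O pointer vectors, and the index logits are the scaled dot products against the L2-normalized DETR instance representations, mirroring `torch.bmm(H_Pointer_repr, inst_repr_all) / self.tau`. All sizes below are toy placeholders, not values from this config.

import torch
import torch.nn.functional as F

bs, num_inst_queries, num_hoi_queries, hidden_dim, tau = 2, 100, 16, 256, 0.05

# L2-normalized instance representations (stand-in for DETR's last decoder layer)
inst_repr = F.normalize(torch.randn(bs, num_inst_queries, hidden_dim), p=2, dim=-1)
# L2-normalized human-pointer vectors (stand-in for H_Pointer_embed output)
h_pointer = F.normalize(torch.randn(bs, num_hoi_queries, hidden_dim), p=2, dim=-1)

# (bs, num_hoi_queries, num_inst_queries): similarity of each HOI query to each instance
h_idx_logits = torch.bmm(h_pointer, inst_repr.transpose(1, 2)) / tau
h_indices = h_idx_logits.softmax(-1).argmax(-1)  # which instance each HOI query points to
print(h_indices.shape)  # torch.Size([2, 16])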
hotr/models/hotr_matcher.py
ADDED
@@ -0,0 +1,216 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/models/hotr_matcher.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import torch
from scipy.optimize import linear_sum_assignment
from torch import nn

from hotr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou

import hotr.util.misc as utils
import wandb

class HungarianPairMatcher(nn.Module):
    def __init__(self, args):
        """Creates the matcher
        Params:
            cost_action: This is the relative weight of the multi-label action classification error in the matching cost
            cost_hbox: This is the relative weight of the classification error for human idx in the matching cost
            cost_obox: This is the relative weight of the classification error for object idx in the matching cost
        """
        super().__init__()
        self.cost_action = args.set_cost_act
        self.cost_hbox = self.cost_obox = args.set_cost_idx
        self.cost_target = args.set_cost_tgt
        self.log_printer = args.wandb
        self.is_vcoco = (args.dataset_file == 'vcoco')
        self.is_hico = (args.dataset_file == 'hico-det')
        if self.is_vcoco:
            self.valid_ids = args.valid_ids
            self.invalid_ids = args.invalid_ids
        assert self.cost_action != 0 or self.cost_hbox != 0 or self.cost_obox != 0, "all costs can't be 0"

    def reduce_redundant_gt_box(self, tgt_bbox, indices):
        """Filters redundant Ground-Truth Bounding Boxes
        Due to random crop augmentation, there are cases with multiple
        redundant labels for the exact same bounding box and object class.
        This function deals with the redundant labels for smoother HOTR training.
        """
        tgt_bbox_unique, map_idx, idx_cnt = torch.unique(tgt_bbox, dim=0, return_inverse=True, return_counts=True)

        k_idx, bbox_idx = indices

        triggered = False
        if (len(tgt_bbox) != len(tgt_bbox_unique)):
            map_dict = {k: v for k, v in enumerate(map_idx)}
            map_bbox2kidx = {int(bbox_id): k_id for bbox_id, k_id in zip(bbox_idx, k_idx)}

            bbox_lst, k_lst = [], []
            for bbox_id in bbox_idx:
                if map_dict[int(bbox_id)] not in bbox_lst:
                    bbox_lst.append(map_dict[int(bbox_id)])
                    k_lst.append(map_bbox2kidx[int(bbox_id)])
            bbox_idx = torch.tensor(bbox_lst)
            k_idx = torch.tensor(k_lst)
            tgt_bbox_res = tgt_bbox_unique
        else:
            tgt_bbox_res = tgt_bbox

        bbox_idx = bbox_idx.to(tgt_bbox.device)

        return tgt_bbox_res, k_idx, bbox_idx

    @torch.no_grad()
    def forward(self, outputs, targets, indices, log=False):
        assert "pred_actions" in outputs, "There is no action output for pair matching"
        num_obj_queries = outputs["pred_boxes"].shape[1]
        bs, num_path, num_queries = outputs["pred_actions"].shape[:3]
        detr_query_num = outputs["pred_logits"].shape[1] \
            if (outputs["pred_oidx"].shape[-1] == (outputs["pred_logits"].shape[1] + 1)) else -1

        return_list = []
        if self.log_printer and log:
            log_dict = {'h_cost': [], 'o_cost': [], 'act_cost': []}
            if self.is_hico: log_dict['tgt_cost'] = []

        for batch_idx in range(bs):
            tgt_bbox = targets[batch_idx]["boxes"] # (num_boxes, 4)
            tgt_cls = targets[batch_idx]["labels"] # (num_boxes)

            if self.is_vcoco:
                targets[batch_idx]["pair_actions"][:, self.invalid_ids] = 0
                keep_idx = (targets[batch_idx]["pair_actions"].sum(dim=-1) != 0)
                targets[batch_idx]["pair_boxes"] = targets[batch_idx]["pair_boxes"][keep_idx]
                targets[batch_idx]["pair_actions"] = targets[batch_idx]["pair_actions"][keep_idx]
                targets[batch_idx]["pair_targets"] = targets[batch_idx]["pair_targets"][keep_idx]

                tgt_pbox = targets[batch_idx]["pair_boxes"] # (num_pair_boxes, 8)
                tgt_act = targets[batch_idx]["pair_actions"] # (num_pair_boxes, 29)
                tgt_tgt = targets[batch_idx]["pair_targets"] # (num_pair_boxes)

                tgt_hbox = tgt_pbox[:, :4] # (num_pair_boxes, 4)
                tgt_obox = tgt_pbox[:, 4:] # (num_pair_boxes, 4)
            elif self.is_hico:
                tgt_act = targets[batch_idx]["pair_actions"] # (num_pair_boxes, 117)
                tgt_tgt = targets[batch_idx]["pair_targets"] # (num_pair_boxes)

                tgt_hbox = targets[batch_idx]["sub_boxes"] # (num_pair_boxes, 4)
                tgt_obox = targets[batch_idx]["obj_boxes"] # (num_pair_boxes, 4)

            # find which gt boxes match the h, o boxes in the pair
            if self.is_vcoco:
                hbox_with_cls = torch.cat([tgt_hbox, torch.ones((tgt_hbox.shape[0], 1)).to(tgt_hbox.device)], dim=1)
            elif self.is_hico:
                hbox_with_cls = torch.cat([tgt_hbox, torch.zeros((tgt_hbox.shape[0], 1)).to(tgt_hbox.device)], dim=1)
            obox_with_cls = torch.cat([tgt_obox, tgt_tgt.unsqueeze(-1)], dim=1)
            obox_with_cls[obox_with_cls[:, :4].sum(dim=1) == -4, -1] = -1 # turn the class of occluded objects to -1

            bbox_with_cls = torch.cat([tgt_bbox, tgt_cls.unsqueeze(-1)], dim=1)
            bbox_with_cls, k_idx, bbox_idx = self.reduce_redundant_gt_box(bbox_with_cls, indices[batch_idx])
            bbox_with_cls = torch.cat((bbox_with_cls, torch.as_tensor([-1.] * 5).unsqueeze(0).to(tgt_cls.device)), dim=0)

            cost_hbox = torch.cdist(hbox_with_cls, bbox_with_cls, p=1)
            cost_obox = torch.cdist(obox_with_cls, bbox_with_cls, p=1)

            # find which gt boxes matches which prediction in K
            h_match_indices = torch.nonzero(cost_hbox == 0, as_tuple=False) # (num_hbox, num_boxes)
            o_match_indices = torch.nonzero(cost_obox == 0, as_tuple=False) # (num_obox, num_boxes)

            tgt_hids, tgt_oids = [], []

            # obtain ground truth indices for h
            assert len(h_match_indices) == len(o_match_indices), \
                "number of human and object box matches must be equal"

            for h_match_idx, o_match_idx in zip(h_match_indices, o_match_indices):
                hbox_idx, H_bbox_idx = h_match_idx
                obox_idx, O_bbox_idx = o_match_idx
                if O_bbox_idx == (len(bbox_with_cls) - 1): # if the object class is -1
                    O_bbox_idx = H_bbox_idx # happens in V-COCO, the target object may not appear

                GT_idx_for_H = (bbox_idx == H_bbox_idx).nonzero(as_tuple=False).squeeze(-1)
                query_idx_for_H = k_idx[GT_idx_for_H]
                tgt_hids.append(query_idx_for_H)

                GT_idx_for_O = (bbox_idx == O_bbox_idx).nonzero(as_tuple=False).squeeze(-1)
                query_idx_for_O = k_idx[GT_idx_for_O]
                tgt_oids.append(query_idx_for_O)

            # check if empty
            if len(tgt_hids) == 0: tgt_hids.append(torch.as_tensor([-1])) # we later ignore the label -1
            if len(tgt_oids) == 0: tgt_oids.append(torch.as_tensor([-1])) # we later ignore the label -1

            tgt_sum = (tgt_act.sum(dim=-1)).unsqueeze(0)
            flag = False
            if tgt_act.shape[0] == 0:
                tgt_act = torch.zeros((1, tgt_act.shape[1])).to(targets[batch_idx]["pair_actions"].device)
                targets[batch_idx]["pair_actions"] = torch.zeros((1, targets[batch_idx]["pair_actions"].shape[1])).to(targets[batch_idx]["pair_actions"].device)
                if self.is_hico:
                    pad_tgt = -1 # outputs["pred_obj_logits"].shape[-1]-1
                    tgt_tgt = torch.tensor([pad_tgt]).to(targets[batch_idx]["pair_targets"])
                    targets[batch_idx]["pair_targets"] = torch.tensor([pad_tgt]).to(targets[batch_idx]["pair_targets"].device)
                tgt_sum = (tgt_act.sum(dim=-1) + 1).unsqueeze(0)

            # Concat target label
            tgt_hids = torch.cat(tgt_hids).repeat(num_path)
            tgt_oids = torch.cat(tgt_oids).repeat(num_path)

            outputs_hidx = outputs["pred_hidx"].view(num_path, bs, num_queries, -1).transpose(0, 1).flatten(1, 2)
            outputs_oidx = outputs["pred_oidx"].view(num_path, bs, num_queries, -1).transpose(0, 1).flatten(1, 2)

            outputs_action = outputs["pred_actions"].view(bs, num_path * num_queries, -1)
            out_hprob = outputs_hidx[batch_idx].softmax(-1)
            out_oprob = outputs_oidx[batch_idx].softmax(-1)
            out_act = outputs_action[batch_idx].clone()
            if self.is_vcoco: out_act[..., self.invalid_ids] = 0
            if self.is_hico:
                outputs_obj_logits = outputs["pred_obj_logits"].view(bs, num_path, num_queries, -1).view(bs, num_path * num_queries, -1)
                out_tgt = outputs_obj_logits[batch_idx].softmax(-1)
                out_tgt[..., -1] = 0 # don't get cost for no-object

            tgt_act = torch.cat([tgt_act, torch.zeros(tgt_act.shape[0]).unsqueeze(-1).to(tgt_act.device)], dim=-1).repeat(num_path, 1)

            cost_hclass = -out_hprob[:, tgt_hids] # [batch_size * num_queries, detr.num_queries+1]
            cost_oclass = -out_oprob[:, tgt_oids] # [batch_size * num_queries, detr.num_queries+1]

            cost_pos_act = (-torch.matmul(out_act, tgt_act.t().float())) / tgt_sum.repeat(1, num_path)
            cost_neg_act = (torch.matmul(out_act, (~tgt_act.bool()).type(torch.int64).t().float())) / (~tgt_act.bool()).type(torch.int64).sum(dim=-1).unsqueeze(0)
            cost_action = cost_pos_act + cost_neg_act

            h_cost = self.cost_hbox * cost_hclass
            o_cost = self.cost_obox * cost_oclass

            act_cost = self.cost_action * cost_action

            C = h_cost + o_cost + act_cost

            if self.is_hico:
                cost_target = -out_tgt[:, tgt_tgt.repeat(num_path)]
                tgt_cost = self.cost_target * cost_target
                C += tgt_cost
            C = C.view(num_path, num_queries, -1).cpu()

            sizes = [len(tgt_hids) // num_path] * num_path
            hoi_indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
            return_list.append([(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in hoi_indices])

            targets[batch_idx]["h_labels"] = tgt_hids.to(tgt_hbox.device)
            targets[batch_idx]["o_labels"] = tgt_oids.to(tgt_obox.device)
            log_act_cost = torch.zeros([1]).to(tgt_act.device) if tgt_act.shape[0] == 0 else act_cost.min(dim=0)[0].mean()

            if self.log_printer and log:
                log_dict['h_cost'].append(h_cost[:num_queries].min(dim=0)[0].mean())
                log_dict['o_cost'].append(o_cost[:num_queries].min(dim=0)[0].mean())
                log_dict['act_cost'].append(act_cost[:num_queries].min(dim=0)[0].mean())
                if self.is_hico: log_dict['tgt_cost'].append(tgt_cost[:num_queries].min(dim=0)[0].mean())
        if self.log_printer and log:
            log_dict['h_cost'] = torch.stack(log_dict['h_cost']).mean()
            log_dict['o_cost'] = torch.stack(log_dict['o_cost']).mean()
            log_dict['act_cost'] = torch.stack(log_dict['act_cost']).mean()
            if self.is_hico: log_dict['tgt_cost'] = torch.stack(log_dict['tgt_cost']).mean()
            if utils.get_rank() == 0: wandb.log(log_dict)
        return return_list, targets

def build_hoi_matcher(args):
    return HungarianPairMatcher(args)
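A toy illustration of the Hungarian step used above: build a (num_queries x num_gt) cost matrix from the individual cost terms, then let SciPy pick the minimum-cost one-to-one assignment, as in `C = h_cost + o_cost + act_cost` followed by `linear_sum_assignment`. All tensors here are synthetic placeholders.

import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_gt = 4, 2
h_cost = torch.rand(num_queries, num_gt)    # human-pointer classification cost
o_cost = torch.rand(num_queries, num_gt)    # object-pointer classification cost
act_cost = torch.rand(num_queries, num_gt)  # multi-label action cost

C = (h_cost + o_cost + act_cost).cpu()
row_ind, col_ind = linear_sum_assignment(C.numpy())  # query indices, GT-pair indices
print(list(zip(row_ind, col_ind)))  # e.g. [(0, 1), (3, 0)]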
hotr/models/position_encoding.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn

from hotr.util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
    if args.position_embedding in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
    elif args.position_embedding in ('v3', 'learned'):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding
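A quick shape check for PositionEmbeddingSine: with hidden_dim=256 the builder passes N_steps=128 per axis, so the output has 2*128=256 channels. This sketch assumes the repo is on PYTHONPATH so the imports below resolve; the tensor sizes are arbitrary.

import torch
from hotr.models.position_encoding import PositionEmbeddingSine
from hotr.util.misc import NestedTensor

pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
x = torch.randn(2, 256, 25, 38)                   # (batch, channels, H, W) feature map
mask = torch.zeros(2, 25, 38, dtype=torch.bool)   # no padded pixels in this toy case
pos = pe(NestedTensor(x, mask))
print(pos.shape)  # torch.Size([2, 256, 25, 38]): y-encoding and x-encoding concatenated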
hotr/models/post_process.py
ADDED
@@ -0,0 +1,162 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/models/post_process.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import time
import copy
import torch
import torch.nn.functional as F
from torch import nn
from hotr.util import box_ops

class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""
    def __init__(self, HOIDet):
        super().__init__()
        self.HOIDet = HOIDet

    @torch.no_grad()
    def forward(self, outputs, target_sizes, threshold=0, dataset='coco', args=None):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        """
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        num_path = 1 + len(args.augpath_name)
        path_id = args.path_id
        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = F.softmax(out_logits, -1)
        scores, labels = prob[..., :-1].max(-1)

        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        # Prediction Branch for HOI detection
        if self.HOIDet:
            if dataset == 'vcoco':
                """ Compute HOI triplet prediction score for V-COCO.
                Our scoring function follows the implementation details of UnionDet.
                """
                out_time = outputs['hoi_recognition_time']
                bss, q, hd = outputs['pred_hidx'].shape
                start_time = time.time()
                pair_actions = torch.sigmoid(outputs['pred_actions'][:, path_id, ...])
                h_prob = F.softmax(outputs['pred_hidx'].view(num_path, bss // num_path, q, hd)[path_id], -1)
                h_idx_score, h_indices = h_prob.max(-1)

                o_prob = F.softmax(outputs['pred_oidx'].view(num_path, bss // num_path, q, hd)[path_id], -1)
                o_idx_score, o_indices = o_prob.max(-1)
                hoi_recognition_time = (time.time() - start_time) + out_time

                results = []
                # iterate for batch size
                for batch_idx, (s, l, b) in enumerate(zip(scores, labels, boxes)):
                    h_inds = (l == 1) & (s > threshold)
                    o_inds = (s > threshold)

                    h_box, h_cat = b[h_inds], s[h_inds]
                    o_box, o_cat = b[o_inds], s[o_inds]

                    # for scenario 1 in v-coco dataset
                    o_inds = torch.cat((o_inds, torch.ones(1).type(torch.bool).to(o_inds.device)))
                    o_box = torch.cat((o_box, torch.Tensor([0, 0, 0, 0]).unsqueeze(0).to(o_box.device)))

                    result_dict = {
                        'h_box': h_box, 'h_cat': h_cat,
                        'o_box': o_box, 'o_cat': o_cat,
                        'scores': s, 'labels': l, 'boxes': b
                    }

                    h_inds_lst = h_inds.nonzero(as_tuple=False).squeeze(-1)
                    o_inds_lst = o_inds.nonzero(as_tuple=False).squeeze(-1)

                    K = boxes.shape[1]
                    n_act = pair_actions[batch_idx][:, :-1].shape[-1]
                    score = torch.zeros((n_act, K, K + 1)).to(pair_actions[batch_idx].device)
                    sorted_score = torch.zeros((n_act, K, K + 1)).to(pair_actions[batch_idx].device)
                    id_score = torch.zeros((K, K + 1)).to(pair_actions[batch_idx].device)

                    # Score function
                    for hs, h_idx, os, o_idx, pair_action in zip(h_idx_score[batch_idx], h_indices[batch_idx], o_idx_score[batch_idx], o_indices[batch_idx], pair_actions[batch_idx]):
                        matching_score = (1 - pair_action[-1]) # no-interaction score
                        if h_idx == o_idx: o_idx = -1
                        if matching_score > id_score[h_idx, o_idx]:
                            id_score[h_idx, o_idx] = matching_score
                            sorted_score[:, h_idx, o_idx] = matching_score * pair_action[:-1]
                        score[:, h_idx, o_idx] += matching_score * pair_action[:-1]

                    score += sorted_score
                    score = score[:, h_inds, :]
                    score = score[:, :, o_inds]

                    result_dict.update({
                        'pair_score': score,
                        'hoi_recognition_time': hoi_recognition_time,
                    })

                    results.append(result_dict)

            elif dataset == 'hico-det':
                """ Compute HOI triplet prediction score for HICO-DET.
                For HICO-DET, we follow the same scoring function but do not accumulate the results.
                """
                bss, q, hd = outputs['pred_hidx'].shape
                out_time = outputs['hoi_recognition_time']
                a, b, c = outputs['pred_obj_logits'].shape
                start_time = time.time()
                out_obj_logits = outputs['pred_obj_logits'].view(-1, num_path, b, c)[:, path_id, ...]
                out_verb_logits = outputs['pred_actions'][:, path_id, ...]

                # actions
                matching_scores = (1 - out_verb_logits.sigmoid()[..., -1:])
                verb_scores = out_verb_logits.sigmoid()[..., :-1] * matching_scores

                # hbox, obox
                outputs_hrepr, outputs_orepr = outputs['pred_hidx'].view(num_path, bss // num_path, q, hd)[path_id], outputs['pred_oidx'].view(num_path, bss // num_path, q, hd)[path_id]
                obj_scores, obj_labels = F.softmax(out_obj_logits, -1)[..., :-1].max(-1)

                h_prob = F.softmax(outputs_hrepr, -1)
                h_idx_score, h_indices = h_prob.max(-1)

                # targets
                o_prob = F.softmax(outputs_orepr, -1)
                o_idx_score, o_indices = o_prob.max(-1)
                hoi_recognition_time = (time.time() - start_time) + out_time

                # hidx, oidx
                sub_boxes, obj_boxes = [], []
                for batch_id, (box, h_idx, o_idx) in enumerate(zip(boxes, h_indices, o_indices)):
                    sub_boxes.append(box[h_idx, :])
                    obj_boxes.append(box[o_idx, :])
                sub_boxes = torch.stack(sub_boxes, dim=0)
                obj_boxes = torch.stack(obj_boxes, dim=0)

                # accumulate results (iterate through interaction queries)
                results = []
                for os, ol, vs, ms, sb, ob in zip(obj_scores, obj_labels, verb_scores, matching_scores, sub_boxes, obj_boxes):
                    sl = torch.full_like(ol, 0) # self.subject_category_id = 0 in HICO-DET
                    l = torch.cat((sl, ol))
                    b = torch.cat((sb, ob))
                    results.append({'labels': l.to('cpu'), 'boxes': b.to('cpu')})
                    vs = vs * os.unsqueeze(1)
                    ids = torch.arange(b.shape[0])
                    res_dict = {
                        'verb_scores': vs.to('cpu'),
                        'sub_ids': ids[:ids.shape[0] // 2],
                        'obj_ids': ids[ids.shape[0] // 2:],
                        'hoi_recognition_time': hoi_recognition_time
                    }
                    results[-1].update(res_dict)
        else:
            results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]

        return results
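A toy illustration of the HICO-DET scoring above: the last action channel acts as a "no interaction" logit, so each verb score is gated by (1 - sigmoid(no_interaction)) and then by the matched object's class score. The 117-verb width follows the pair_actions comment in the matcher; the values below are synthetic.

import torch

out_verb_logits = torch.randn(3, 118)  # 117 verbs + 1 no-interaction channel (synthetic)
obj_scores = torch.rand(3)             # class score of the object each query points to

matching_scores = 1 - out_verb_logits.sigmoid()[..., -1:]          # (3, 1) interactiveness gate
verb_scores = out_verb_logits.sigmoid()[..., :-1] * matching_scores
verb_scores = verb_scores * obj_scores.unsqueeze(1)                 # final (3, 117) triplet scores
print(verb_scores.shape)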
hotr/models/transformer.py
ADDED
@@ -0,0 +1,320 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/models/transformer.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
"""
DETR & HOTR Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional, List

import torch
import torch.nn.functional as F
from torch import nn, Tensor


class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)

        self._reset_parameters()
        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed, query_obj=None, return_decoder_input=False):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)

        if query_embed.dim() == 2:
            query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)

        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        if query_obj is None:
            hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed)
        else:
            hs = self.decoder(query_obj, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed)

        return hs.transpose(1, 2), memory

class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):

        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def build_transformer(args):
    return Transformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
    )


def build_hoi_transformer(args):
    return Transformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.hoi_nheads,
        dim_feedforward=args.hoi_dim_feedforward,
        num_encoder_layers=args.hoi_enc_layers,
        num_decoder_layers=args.hoi_dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
    )


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
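A smoke test for the Transformer above, on a toy feature map. Shapes follow forward(): src is (B, C, H, W), the mask marks padded pixels, and the returned hs is (num_layers, B, num_queries, d_model) because return_intermediate_dec stacks every decoder layer's output. All sizes are arbitrary; the import assumes the repo is on PYTHONPATH.

import torch
from hotr.models.transformer import Transformer

model = Transformer(d_model=64, nhead=4, num_encoder_layers=2,
                    num_decoder_layers=2, dim_feedforward=128,
                    return_intermediate_dec=True)
src = torch.randn(2, 64, 10, 12)                 # projected backbone features
mask = torch.zeros(2, 10, 12, dtype=torch.bool)  # no padding in this toy batch
query_embed = torch.randn(20, 64)                # 20 learned query embeddings
pos_embed = torch.randn(2, 64, 10, 12)           # positional encoding for the feature map

hs, memory = model(src, mask, query_embed, pos_embed)
print(hs.shape, memory.shape)  # torch.Size([2, 2, 20, 64]) torch.Size([120, 2, 64])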
hotr/util/__init__.py
ADDED
File without changes
hotr/util/box_ops.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Utilities for bounding box manipulation and GIoU.
"""
import torch
from torchvision.ops.boxes import box_area


def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)


def box_xyxy_to_cxcywh(x):
    x0, y0, x1, y1 = x.unbind(-1)
    b = [(x0 + x1) / 2, (y0 + y1) / 2,
         (x1 - x0), (y1 - y0)]
    return torch.stack(b, dim=-1)


# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/
    The boxes should be in [x0, y0, x1, y1] format
    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes give inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou(boxes1, boxes2)

    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    area = wh[:, :, 0] * wh[:, :, 1]

    return iou - (area - union) / area


def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks
    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
    Returns a [N, 4] tensor, with the boxes in xyxy format
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    y = torch.arange(0, h, dtype=torch.float)
    x = torch.arange(0, w, dtype=torch.float)
    y, x = torch.meshgrid(y, x)

    x_mask = (masks * x.unsqueeze(0))
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    y_mask = (masks * y.unsqueeze(0))
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)


def rescale_bboxes(out_bbox, size):
    img_h, img_w = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32).to(out_bbox.get_device())
    return b


def rescale_pairs(out_pairs, size):
    img_h, img_w = size
    h_bbox = out_pairs[:, :4]
    o_bbox = out_pairs[:, 4:]

    h = box_cxcywh_to_xyxy(h_bbox)
    h = h * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32).to(h_bbox.get_device())

    obj_mask = (o_bbox[:, 0] != -1)
    if obj_mask.sum() != 0:
        o = box_cxcywh_to_xyxy(o_bbox)
        o = o * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32).to(o_bbox.get_device())
        o_bbox[obj_mask] = o[obj_mask]
    o = o_bbox
    p = torch.cat([h, o], dim=-1)

    return p
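
A minimal sketch of how these helpers compose (illustration only, not part of this commit; toy CPU tensors, avoiding the CUDA-only `rescale_*` helpers):

    import torch
    from hotr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou

    # two boxes in (cx, cy, w, h), normalized coordinates
    boxes = torch.tensor([[0.5, 0.5, 0.4, 0.4],
                          [0.3, 0.3, 0.2, 0.2]])
    xyxy = box_cxcywh_to_xyxy(boxes)        # -> (x0, y0, x1, y1)
    giou = generalized_box_iou(xyxy, xyxy)  # [2, 2] pairwise matrix
    print(giou.diag())                      # identical boxes on the diagonal -> 1.0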
hotr/util/logger.py
ADDED
@@ -0,0 +1,145 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/util/logger.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import torch
import time
import datetime
import sys
from time import sleep
from collections import defaultdict

from hotr.util.misc import SmoothedValue

def print_params(model):
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\n[Logger] Number of params: ', n_parameters)
    return n_parameters

def print_args(args):
    print('\n[Logger] DETR Arguments:')
    for k, v in vars(args).items():
        if k in [
            'lr', 'lr_backbone', 'lr_drop',
            'frozen_weights',
            'backbone', 'dilation',
            'position_embedding', 'enc_layers', 'dec_layers', 'num_queries',
            'dataset_file']:
            print(f'\t{k}: {v}')

    if args.HOIDet:
        print('\n[Logger] DETR_HOI Arguments:')
        for k, v in vars(args).items():
            if k in [
                'freeze_enc',
                'query_flag',
                'hoi_nheads',
                'hoi_dim_feedforward',
                'hoi_dec_layers',
                'hoi_idx_loss_coef',
                'hoi_act_loss_coef',
                'hoi_eos_coef',
                'object_threshold']:
                print(f'\t{k}: {v}')

class MetricLogger(object):
    def __init__(self, mode="test", delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter
        self.mode = mode

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        if torch.cuda.is_available():
            log_msg = self.delimiter.join([
                header,
                '[{0' + space_fmt + '}/{1}]',
                'eta: {eta}',
                '{meters}',
                'time: {time}',
                'data: {data}',
                'max mem: {memory:.0f}'
            ])
        else:
            log_msg = self.delimiter.join([
                header,
                '[{0' + space_fmt + '}/{1}]',
                'eta: {eta}',
                '{meters}',
                'time: {time}',
                'data: {data}'
            ])
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)

            if (i % print_freq == 0 and i != 0) or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print(log_msg.format(
                        i+1, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB),
                        flush=(self.mode=='test'), end=("\r" if self.mode=='test' else "\n"))
                else:
                    print(log_msg.format(
                        i+1, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)),
                        flush=(self.mode=='test'), end=("\r" if self.mode=='test' else "\n"))
            else:
                log_interval = self.delimiter.join([header, '[{0' + space_fmt + '}/{1}]'])
                if torch.cuda.is_available(): print(log_interval.format(i+1, len(iterable)), flush=True, end="\r")
                else: print(log_interval.format(i+1, len(iterable)), flush=True, end="\r")

            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        if self.mode=='test': print("")
        print('[stats] Total Time ({}) : {} ({:.4f} s / it)'.format(
            self.mode, total_time_str, total_time / len(iterable)))
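
A minimal `MetricLogger` sketch (illustration only, not part of this commit):

    import torch
    from hotr.util.logger import MetricLogger

    meter = MetricLogger(mode="train", delimiter="  ")
    for step in meter.log_every(range(100), print_freq=10, header="Epoch[0]"):
        loss = torch.rand(1).item()   # stand-in for a real training loss
        meter.update(loss=loss)
    # each tracked value is a SmoothedValue, e.g. meter.loss.global_avg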
hotr/util/misc.py
ADDED
@@ -0,0 +1,401 @@
# ------------------------------------------------------------------------
# HOTR official code : hotr/util/misc.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
"""
Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
import os
import argparse  # needed by arg_as_list below
import subprocess
import time      # needed by _maybe_gethostbyname below
from collections import deque
import pickle
import socket
from typing import Optional, List
import ast
import torch
import torch.distributed as dist
from torch import Tensor

# needed due to empty tensor bug in pytorch and torchvision 0.5
import torchvision
# compare (major, minor) tuples; slicing the version string breaks for e.g. "0.10"
_TORCHVISION_VERSION = tuple(int(v) for v in torchvision.__version__.split('.')[:2])
if _TORCHVISION_VERSION < (0, 7):
    from torchvision.ops import _new_empty_tensor
    from torchvision.ops.misc import _output_size

os.environ['MASTER_PORT'] = '8993'
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)


def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list


def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict


def get_sha():
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
    sha = 'N/A'
    diff = "clean"
    branch = 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        diff = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommitted changes" if diff else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        pass
    message = f"sha: {sha}, status: {diff}, branch: {branch}"
    return message


def collate_fn(batch):
    batch = list(zip(*batch))
    batch[0] = nested_tensor_from_tensor_list(batch[0])
    return tuple(batch)


def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    maxes = the_list[0]
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes


def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    # TODO make this more general
    if tensor_list[0].ndim == 3:
        # TODO make it support different-sized images
        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
        batch_shape = [len(tensor_list)] + max_size
        b, c, h, w = batch_shape
        dtype = tensor_list[0].dtype
        device = tensor_list[0].device
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
            m[: img.shape[1], :img.shape[2]] = False
    else:
        raise ValueError('not supported')
    return NestedTensor(tensor, mask)


class NestedTensor(object):
    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor  # noqa
        cast_tensor = self.tensors.to(device)
        mask = self.mask
        if mask is not None:
            assert mask is not None
            cast_mask = mask.to(device)
        else:
            cast_mask = None
        return NestedTensor(cast_tensor, cast_mask)

    def decompose(self):
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)


def _check_if_valid_ip(ip):
    try:
        socket.inet_aton(ip)
        # legal
    except socket.error:
        # Not legal
        return False
    return True

def arg_as_list(s):
    v = ast.literal_eval(s)
    if type(v) is not list:
        raise argparse.ArgumentTypeError("List should be given.")
    return v

def _maybe_gethostbyname(addr):
    """to be compatible with Braincloud on which one can access the nodes by their task names.
    Each node has to wait until all the tasks in the group are up on the cloud."""
    if _check_if_valid_ip(addr):
        # If an IP address is given, do nothing
        return addr

    # Otherwise, find the IP address by hostname
    done = False
    retry = 0
    print(f"Get URL by the given hostname '{addr}' in Braincloud..")
    while not done:
        except_count = retry  # noqa: F841 -- keep loop state readable
        try:
            addr = socket.gethostbyname(addr)
            done = True
        except OSError:  # narrowed from a bare except; DNS failures raise socket.gaierror (an OSError)
            retry += 1
            print(f"Retrying count: {retry}")
            time.sleep(3)
    print(f"Found the host by IP address: {addr}")
    return addr


def init_distributed_mode(args):

    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        os.environ["MASTER_ADDR"] = _maybe_gethostbyname(os.environ["MASTER_ADDR"])
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
        args.dist_url = 'env://'
        os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count())
    elif 'SLURM_PROCID' in os.environ:
        proc_id = int(os.environ['SLURM_PROCID'])
        ntasks = int(os.environ['SLURM_NTASKS'])
        node_list = os.environ['SLURM_NODELIST']
        num_gpus = torch.cuda.device_count()
        addr = subprocess.getoutput(
            'scontrol show hostname {} | head -n1'.format(node_list))
        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', '29500')
        os.environ['MASTER_ADDR'] = addr
        os.environ['WORLD_SIZE'] = str(ntasks)
        os.environ['RANK'] = str(proc_id)
        os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
        os.environ['LOCAL_SIZE'] = str(num_gpus)
        args.dist_url = 'env://'
        args.world_size = ntasks
        args.rank = proc_id
        args.gpu = proc_id % num_gpus
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    if target.numel() == 0:
        return [torch.zeros([], device=output.device)]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
    This will eventually be supported natively by PyTorch, and this
    class can go away.
    """
    if _TORCHVISION_VERSION < (0, 7):
        if input.numel() > 0:
            return torch.nn.functional.interpolate(
                input, size, scale_factor, mode, align_corners
            )

        output_shape = _output_size(2, input, size, scale_factor)
        output_shape = list(input.shape[:-2]) + list(output_shape)
        return _new_empty_tensor(input, output_shape)
    else:
        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
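
A minimal sketch of the padding/masking path above (illustration only, not part of this commit; CPU tensors):

    import torch
    from hotr.util.misc import nested_tensor_from_tensor_list

    imgs = [torch.rand(3, 480, 640), torch.rand(3, 512, 512)]  # mixed sizes
    nt = nested_tensor_from_tensor_list(imgs)
    tensors, mask = nt.decompose()
    print(tensors.shape)  # torch.Size([2, 3, 512, 640]) -- zero-padded to the max
    print(mask.shape)     # torch.Size([2, 512, 640])    -- True marks padding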
hotr/util/ramp.py
ADDED
@@ -0,0 +1,23 @@
# Copyright (c) 2018, Curious AI Ltd. All rights reserved.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

import numpy as np

def sigmoid_rampup(current, rampup_length, max_coef=1.):
    """Exponential rampup from https://arxiv.org/abs/1610.02242
    Modified version from https://github.com/vikasverma1077/GraphMix/blob/master/semisupervised/codes/ramps.py"""
    if rampup_length == 0:
        return max_coef
    else:
        current = np.clip(current, 0.0, rampup_length)
        phase = 1.0 - current / rampup_length
        return float(np.exp(-5.0 * phase * phase)) * max_coef

def cosine_rampdown(current, rampdown_length, max_coef=1.):
    """Cosine rampdown from https://arxiv.org/abs/1608.03983"""
    assert 0 <= current <= rampdown_length
    return float(.5 * (np.cos(np.pi * current / rampdown_length) + 1)) * max_coef
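
The two schedules at their endpoints (illustration only, not part of this commit):

    from hotr.util.ramp import sigmoid_rampup, cosine_rampdown

    print(sigmoid_rampup(0, 10))    # exp(-5) ~= 0.0067: start of the ramp-up
    print(sigmoid_rampup(10, 10))   # 1.0: max_coef reached at rampup_length
    print(cosine_rampdown(0, 10))   # 1.0: start of the ramp-down
    print(cosine_rampdown(10, 10))  # 0.0: fully decayed at rampdown_length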
imgs/mainfig.png
ADDED

main.py
ADDED
@@ -0,0 +1,240 @@
# ------------------------------------------------------------------------
# HOTR official code : main.py
# Copyright (c) Kakao Brain, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import argparse
import datetime
import json
import random
import time
import multiprocessing
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader, DistributedSampler

import hotr.data.datasets as datasets
import hotr.util.misc as utils
from hotr.engine.arg_parser import get_args_parser
from hotr.data.datasets import build_dataset, get_coco_api_from_dataset
from hotr.engine.trainer import train_one_epoch
from hotr.engine import hoi_evaluator, hoi_accumulator
from hotr.models import build_model
import wandb

from hotr.util.logger import print_params, print_args

def save_ckpt(args, model_without_ddp, optimizer, lr_scheduler, epoch, filename):
    # save_ckpt: function for saving checkpoints
    output_dir = Path(args.output_dir)
    if args.output_dir:
        checkpoint_path = output_dir / f'{filename}.pth'
        utils.save_on_master({
            'model': model_without_ddp.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch,
            'args': args,
        }, checkpoint_path)

def main(args):
    utils.init_distributed_mode(args)

    if args.frozen_weights is not None:
        print("Freeze weights for detector")

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Data Setup
    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val' if not args.eval else 'test', args=args)
    assert dataset_train.num_action() == dataset_val.num_action(), "Number of actions should be the same between splits"
    args.num_classes = dataset_train.num_category()
    args.num_actions = dataset_train.num_action()
    args.action_names = dataset_train.get_actions()
    if args.share_enc: args.hoi_enc_layers = args.enc_layers
    if args.pretrained_dec: args.hoi_dec_layers = args.dec_layers
    if args.dataset_file == 'vcoco':
        # Save V-COCO dataset statistics
        args.valid_ids = np.array(dataset_train.get_object_label_idx()).nonzero()[0]
        args.invalid_ids = np.argwhere(np.array(dataset_train.get_object_label_idx()) == 0).squeeze(1)
        args.human_actions = dataset_train.get_human_action()
        args.object_actions = dataset_train.get_object_action()
        args.num_human_act = dataset_train.num_human_act()
    elif args.dataset_file == 'hico-det':
        args.valid_obj_ids = dataset_train.get_valid_obj_ids()
    print_args(args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train, shuffle=True)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)

    # Model Setup
    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = print_params(model)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # Weight Setup
    if args.frozen_weights is not None:
        if args.frozen_weights.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.frozen_weights, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            # lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        # test only mode
        if args.HOIDet:
            if args.dataset_file == 'vcoco':
                total_res = hoi_evaluator(args, model, criterion, postprocessors, data_loader_val, device)
                sc1, sc2 = hoi_accumulator(args, total_res, True, False)
            elif args.dataset_file == 'hico-det':
                test_stats = hoi_evaluator(args, model, None, postprocessors, data_loader_val, device)
                print(f'| mAP (full)\t\t: {test_stats["mAP"]:.2f}')
                print(f'| mAP (rare)\t\t: {test_stats["mAP rare"]:.2f}')
                print(f'| mAP (non-rare)\t: {test_stats["mAP non-rare"]:.2f}')
            else: raise ValueError(f'dataset {args.dataset_file} is not supported.')
            return
        else:
            # plain COCO detection evaluation; `evaluate_coco` is expected to come
            # from hotr/engine/evaluator_coco.py. base_ds and output_dir were
            # undefined here; they are set up below, following the original DETR main.py.
            base_ds = get_coco_api_from_dataset(dataset_val)
            output_dir = Path(args.output_dir)
            test_stats, coco_evaluator = evaluate_coco(model, criterion, postprocessors,
                                                       data_loader_val, base_ds, device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
            return

    # stats
    scenario1, scenario2 = 0, 0
    best_mAP, best_rare, best_non_rare = 0, 0, 0

    if args.wandb and utils.get_rank() == 0:
        wandb.init(
            project=args.project_name,
            group=args.group_name,
            name=args.run_name,
            config=args
        )
        wandb.watch(model)

    # Training starts here!
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch, args.epochs,
            args.ramp_up_epoch, args.ramp_down_epoch, args.hoi_consistency_loss_coef,
            args.clip_max_norm, dataset_file=args.dataset_file, log=args.wandb)
        lr_scheduler.step()

        # Validation
        if args.validate:
            print('-'*100)
            if args.dataset_file == 'vcoco':
                total_res = hoi_evaluator(args, model, criterion, postprocessors, data_loader_val, device)
                if utils.get_rank() == 0:
                    sc1, sc2 = hoi_accumulator(args, total_res, False, args.wandb)
                    if sc1 > scenario1:
                        scenario1 = sc1
                        scenario2 = sc2
                        save_ckpt(args, model_without_ddp, optimizer, lr_scheduler, epoch, filename='best')
                    print(f'| Scenario #1 mAP : {sc1:.2f} ({scenario1:.2f})')
                    print(f'| Scenario #2 mAP : {sc2:.2f} ({scenario2:.2f})')
            elif args.dataset_file == 'hico-det':
                test_stats = hoi_evaluator(args, model, None, postprocessors, data_loader_val, device)
                if utils.get_rank() == 0:
                    if test_stats['mAP'] > best_mAP:
                        best_mAP = test_stats['mAP']
                        best_rare = test_stats['mAP rare']
                        best_non_rare = test_stats['mAP non-rare']
                        save_ckpt(args, model_without_ddp, optimizer, lr_scheduler, epoch, filename='best')
                    print(f'| mAP (full)\t\t: {test_stats["mAP"]:.2f} ({best_mAP:.2f})')
                    print(f'| mAP (rare)\t\t: {test_stats["mAP rare"]:.2f} ({best_rare:.2f})')
                    print(f'| mAP (non-rare)\t: {test_stats["mAP non-rare"]:.2f} ({best_non_rare:.2f})')
                    if args.wandb and utils.get_rank() == 0:
                        wandb.log({
                            'mAP': test_stats['mAP'],
                            'mAP rare': test_stats['mAP rare'],
                            'mAP non-rare': test_stats['mAP non-rare']
                        })
            print('-'*100)

        save_ckpt(args, model_without_ddp, optimizer, lr_scheduler, epoch, filename='checkpoint')
        if (epoch + 1) % args.lr_drop == 0:
            save_ckpt(args, model_without_ddp, optimizer, lr_scheduler, epoch, filename='checkpoint_'+str(epoch))
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
    if args.dataset_file == 'vcoco':
        print(f'| Scenario #1 mAP : {scenario1:.2f}')
        print(f'| Scenario #2 mAP : {scenario2:.2f}')
    elif args.dataset_file == 'hico-det':
        print(f'| mAP (full)\t\t: {best_mAP:.2f}')
        print(f'| mAP (rare)\t\t: {best_rare:.2f}')
        print(f'| mAP (non-rare)\t: {best_non_rare:.2f}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        'End-to-End Human Object Interaction training and evaluation script',
        parents=[get_args_parser()]
    )
    args = parser.parse_args()
    if args.output_dir:
        args.output_dir += f"/{args.group_name}/{args.run_name}/"
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    main(args)
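
`save_ckpt` above stores five keys per checkpoint; a sketch of inspecting one (illustration only, not part of this commit; the path below is hypothetical):

    import torch

    ckpt = torch.load("checkpoints/best.pth", map_location="cpu")  # hypothetical path
    print(sorted(ckpt.keys()))  # ['args', 'epoch', 'lr_scheduler', 'model', 'optimizer']
    print(ckpt["epoch"])        # epoch at which this checkpoint was written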
tools/launch.py
ADDED
@@ -0,0 +1,192 @@
# --------------------------------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# --------------------------------------------------------------------------------------------------------------------------
# Modified from https://github.com/pytorch/pytorch/blob/173f224570017b4b1a3a1a13d0bff280a54d9cd9/torch/distributed/launch.py
# --------------------------------------------------------------------------------------------------------------------------

r"""
`torch.distributed.launch` is a module that spawns up multiple distributed
training processes on each of the training nodes.
The utility can be used for single-node distributed training, in which one or
more processes per node will be spawned. The utility can be used for either
CPU training or GPU training. If the utility is used for GPU training,
each distributed process will be operating on a single GPU. This can achieve
well-improved single-node training performance. It can also be used in
multi-node distributed training, by spawning up multiple processes on each node
for well-improved multi-node distributed training performance as well.
This will especially be beneficial for systems with multiple Infiniband
interfaces that have direct-GPU support, since all of them can be utilized for
aggregated communication bandwidth.
In both cases of single-node distributed training or multi-node distributed
training, this utility will launch the given number of processes per node
(``--nproc_per_node``). If used for GPU training, this number needs to be less
than or equal to the number of GPUs on the current system (``nproc_per_node``),
and each process will be operating on a single GPU from *GPU 0 to
GPU (nproc_per_node - 1)*.
**How to use this module:**
1. Single-Node multi-process distributed training
::
    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
               arguments of your training script)
2. Multi-Node multi-process distributed training: (e.g. two nodes)
Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*
::
    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)
Node 2:
::
    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)
3. To look up what optional arguments this module offers:
::
    >>> python -m torch.distributed.launch --help
**Important Notices:**
1. This utility and multi-process distributed (single-node or
multi-node) GPU training currently only achieves the best performance using
the NCCL distributed backend. Thus NCCL backend is the recommended backend to
use for GPU training.
2. In your training program, you must parse the command-line argument:
``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
If your training program uses GPUs, you should ensure that your code only
runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
Parsing the local_rank argument
::
    >>> import argparse
    >>> parser = argparse.ArgumentParser()
    >>> parser.add_argument("--local_rank", type=int)
    >>> args = parser.parse_args()
Set your device to local rank using either
::
    >>> torch.cuda.set_device(arg.local_rank)  # before your code runs
or
::
    >>> with torch.cuda.device(arg.local_rank):
    >>>    # your code to run
3. In your training program, you are supposed to call the following function
at the beginning to start the distributed backend. You need to make sure that
the init_method uses ``env://``, which is the only supported ``init_method``
by this module.
::
    torch.distributed.init_process_group(backend='YOUR BACKEND',
                                         init_method='env://')
4. In your training program, you can either use regular distributed functions
or use :func:`torch.nn.parallel.DistributedDataParallel` module. If your
training program uses GPUs for training and you would like to use
:func:`torch.nn.parallel.DistributedDataParallel` module,
here is how to configure it.
::
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[arg.local_rank],
                                                      output_device=arg.local_rank)
Please ensure that ``device_ids`` argument is set to be the only GPU device id
that your code will be operating on. This is generally the local rank of the
process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``,
and ``output_device`` needs to be ``args.local_rank`` in order to use this
utility
5. Another way to pass ``local_rank`` to the subprocesses is via the environment
variable ``LOCAL_RANK``. This behavior is enabled when you launch the script
with ``--use_env=True``. You must adjust the subprocess example above to replace
``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher
will not pass ``--local_rank`` when you specify this flag.
.. warning::
    ``local_rank`` is NOT globally unique: it is only unique per process
    on a machine. Thus, don't use it to decide if you should, e.g.,
    write to a networked filesystem. See
    https://github.com/pytorch/pytorch/issues/12042 for an example of
    how things can go wrong if you don't do this correctly.
"""


import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER

import torch


def parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    # Optional arguments for the launch helper
    parser.add_argument("--nnodes", type=int, default=1,
                        help="The number of nodes to use for distributed "
                             "training")
    parser.add_argument("--node_rank", type=int, default=0,
                        help="The rank of the node for multi-node distributed "
                             "training")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for GPU training, this is recommended to be set "
                             "to the number of GPUs in your system so that "
                             "each process can be bound to a single GPU.")
    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
                        help="Master node (rank 0)'s address, should be either "
                             "the IP address or the hostname of node 0, for "
                             "single node multi-proc training, the "
                             "--master_addr can simply be 127.0.0.1")
    parser.add_argument("--master_port", default=29500, type=int,
                        help="Master node (rank 0)'s free port that needs to "
                             "be used for communication during distributed "
                             "training")

    # positional
    parser.add_argument("training_script", type=str,
                        help="The full path to the single GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environmental variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        cmd = [args.training_script] + args.training_script_args

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)


if __name__ == "__main__":
    main()
tools/run_dist_launch.sh
ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

set -x

GPUS=$1
RUN_COMMAND=${@:2}
if [ $GPUS -lt 8 ]; then
    GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS}
else
    GPUS_PER_NODE=${GPUS_PER_NODE:-8}
fi
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
MASTER_PORT=${MASTER_PORT:-"29500"}
NODE_RANK=${NODE_RANK:-0}

let "NNODES=GPUS/GPUS_PER_NODE"

python ./tools/launch.py \
    --nnodes ${NNODES} \
    --node_rank ${NODE_RANK} \
    --master_addr ${MASTER_ADDR} \
    --master_port ${MASTER_PORT} \
    --nproc_per_node ${GPUS_PER_NODE} \
    ${RUN_COMMAND}
tools/run_dist_slurm.sh
ADDED
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# --------------------------------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# --------------------------------------------------------------------------------------------------------------------------
# Modified from https://github.com/open-mmlab/mmdetection/blob/3b53fe15d87860c6941f3dda63c0f27422da6266/tools/slurm_train.sh
# --------------------------------------------------------------------------------------------------------------------------

set -x

PARTITION=$1
JOB_NAME=$2
GPUS=$3
RUN_COMMAND=${@:4}
if [ $GPUS -lt 8 ]; then
    GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS}
else
    GPUS_PER_NODE=${GPUS_PER_NODE:-8}
fi
CPUS_PER_TASK=${CPUS_PER_TASK:-4}
SRUN_ARGS=${SRUN_ARGS:-""}

srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    ${RUN_COMMAND}
v-coco
ADDED
@@ -0,0 +1 @@
/data/public/rw/datasets/v-coco/