haiphamcse committed on Apr 16

Commit

9855f47

verified ·

1 Parent(s): 3aee1e1

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +14 -0
perception_models/.gitignore +10 -0
perception_models/CODE_OF_CONDUCT.md +80 -0
perception_models/CONTRIBUTING.md +31 -0
perception_models/LEGRAD_PE_USAGE.md +72 -0
perception_models/LICENSE.PE +201 -0
perception_models/LICENSE.PLM +124 -0
perception_models/README.md +408 -0
perception_models/__pycache__/legrad_pe_audio.cpython-310.pyc +0 -0
perception_models/__pycache__/legrad_pe_audio.cpython-313.pyc +0 -0
perception_models/__pycache__/legrad_pe_image.cpython-312.pyc +0 -0
perception_models/__pycache__/legrad_pe_image.cpython-313.pyc +0 -0
perception_models/apps/detection/DETA_pe/README.md +53 -0
perception_models/apps/detection/DETA_pe/datasets/__init__.py +37 -0
perception_models/apps/detection/DETA_pe/datasets/coco.py +345 -0
perception_models/apps/detection/DETA_pe/datasets/coco_eval.py +265 -0
perception_models/apps/detection/DETA_pe/datasets/coco_panoptic.py +107 -0
perception_models/apps/detection/DETA_pe/datasets/data_prefetcher.py +70 -0
perception_models/apps/detection/DETA_pe/datasets/objects365.py +54 -0
perception_models/apps/detection/DETA_pe/datasets/panoptic_eval.py +52 -0
perception_models/apps/detection/DETA_pe/datasets/samplers.py +348 -0
perception_models/apps/detection/DETA_pe/datasets/torchvision_datasets/__init__.py +7 -0
perception_models/apps/detection/DETA_pe/datasets/torchvision_datasets/coco.py +84 -0
perception_models/apps/detection/DETA_pe/datasets/transforms.py +327 -0
perception_models/apps/detection/DETA_pe/engine.py +303 -0
perception_models/apps/detection/DETA_pe/engine_tta.py +239 -0
perception_models/apps/detection/DETA_pe/main.py +754 -0
perception_models/apps/detection/DETA_pe/models/__init__.py +15 -0
perception_models/apps/detection/DETA_pe/models/assigner.py +378 -0
perception_models/apps/detection/DETA_pe/models/backbone.py +235 -0
perception_models/apps/detection/DETA_pe/models/deformable_detr.py +776 -0
perception_models/apps/detection/DETA_pe/models/deformable_transformer.py +451 -0
perception_models/apps/detection/DETA_pe/models/matcher.py +102 -0
perception_models/apps/detection/DETA_pe/models/ops/functions/__init__.py +9 -0
perception_models/apps/detection/DETA_pe/models/ops/functions/ms_deform_attn_func.py +106 -0
perception_models/apps/detection/DETA_pe/models/ops/make.sh +10 -0
perception_models/apps/detection/DETA_pe/models/ops/modules/__init__.py +9 -0
perception_models/apps/detection/DETA_pe/models/ops/modules/ms_deform_attn.py +161 -0
perception_models/apps/detection/DETA_pe/models/ops/setup.py +71 -0
perception_models/apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.cpp +41 -0
perception_models/apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.h +33 -0
perception_models/apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_attn_cuda.cu +153 -0
perception_models/apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_attn_cuda.h +30 -0
perception_models/apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_im2col_cuda.cuh +1327 -0
perception_models/apps/detection/DETA_pe/models/ops/src/ms_deform_attn.h +62 -0
perception_models/apps/detection/DETA_pe/models/ops/src/vision.cpp +16 -0
perception_models/apps/detection/DETA_pe/models/ops/test.py +89 -0
perception_models/apps/detection/DETA_pe/models/pev1.py +686 -0
perception_models/apps/detection/DETA_pe/models/position_encoding.py +97 -0
perception_models/apps/detection/DETA_pe/models/segmentation.py +369 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/dog.mp4 filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/dog.png filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/office.mp4 filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/office.wav filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/pikachu.webp filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/shark.png filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/spatial_correspondence.png filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/spatial_features.png filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/train.mp4 filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/pe/docs/assets/train.wav filter=lfs diff=lfs merge=lfs -text
+perception_models/apps/plm/docs/plm_main_fig.png filter=lfs diff=lfs merge=lfs -text
+perception_models/core/tests/Rock-climbing-Canada-1920x1147.jpg filter=lfs diff=lfs merge=lfs -text
+perception_models/core/tests/selfie_cathedral_peak.jpg filter=lfs diff=lfs merge=lfs -text

perception_models/.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+*.pyc
+.vscode
+*.ipynb
+slurm-*.out
+wandb
+data/*
+data-gym-cache/*
+torchinductor_*/*
+tmp*/*
+apps/plm/dummy_datasets

perception_models/CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,80 @@

+# Code of Conduct
+## Our Pledge
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+## Our Standards
+Examples of behavior that contributes to creating a positive environment
+include:
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Our Responsibilities
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+## Scope
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+This Code of Conduct also applies outside the project spaces when there is a
+reasonable belief that an individual's behavior may have a negative impact on
+the project or its community.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <opensource-conduct@fb.com>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+[homepage]: https://www.contributor-covenant.org
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq

perception_models/CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# Contributing to Perception Models
+We want to make contributing to this project as easy and transparent as
+possible.
+## Pull Requests
+We actively welcome your pull requests.
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+Complete your CLA here: <https://code.facebook.com/cla>
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+## License
+By contributing to Perception Models, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.

perception_models/LEGRAD_PE_USAGE.md ADDED Viewed

	@@ -0,0 +1,72 @@

+# LeGrad + PE Perception Encoder Notebook Usage
+This repository includes a notebook `legrad_perception_encoder.ipynb` that demonstrates how to run **LeGrad** explanations on the PE CoCa-style vision encoder.
+## 1. Environment and installation
+- **Install this repo** (from the repo root):
+```bash
+pip install -e .
+```
+- **Install LeGrad** (if not already installed):
+```bash
+pip install legrad
+```
+Make sure you have a working CUDA‑enabled PyTorch environment.
+## 2. Open the notebook
+Change to the directory that contains the notebook (adjust the path below to match your checkout) and launch Jupyter:
+```bash
+cd xai/perception_models
+jupyter lab legrad_perception_encoder.ipynb
+```
+## 3. What the notebook does
+The notebook shows how to:
+1. Load a PE CoCa‑style vision encoder:
+   - Uses `pe.CLIP.from_config("PE-Core-B16-224", pretrained=True)` and moves the model to CUDA.
+2. Wrap the model with LeGrad:
+   - `LeWrapper` lives in `core/legrad_pe.py`.
+   - It hooks PE residual blocks and attention pooling so gradients can be used to build visual explanations.
+3. Prepare inputs:
+   - Build an image transform with `transforms.get_image_transform(model.image_size)`.
+   - Tokenize text prompts with `transforms.get_text_tokenizer(model.context_length)`.
+4. Run LeGrad:
+   - **Multi‑layer explanation**:
+     - `heatmap = wrapped_model.compute_legrad_coca(text_emb, image=image_tensor)`
+   - **Single‑layer explanation**:
+     - `heatmap = wrapped_model.compute_legrad_coca_one_layer(text_emb, image=image_tensor, layer_idx=-1)`
+5. Visualize:
+   - Convert the `heatmap` to numpy and use `legrad.visualize` (or standard plotting) to overlay it on the image.
+## 4. Minimal code sketch (inside the notebook)
+The core usage pattern is:
+```python
+import core.vision_encoder.pe as pe
+import core.vision_encoder.transforms as transforms
+from core.legrad_pe import LeWrapper
+model = pe.CLIP.from_config("PE-Core-B16-224", pretrained=True).cuda()
+preprocess = transforms.get_image_transform(model.image_size)
+tokenizer = transforms.get_text_tokenizer(model.context_length)
+wrapped_model = LeWrapper(model, layer_index=-2)
+```
+You can then:
+- Preprocess an input image with `preprocess`,
+- Tokenize prompts with `tokenizer`,
+- Encode text/image, and
+- Call one of the `compute_legrad_*` methods to obtain a heatmap for visualization.

perception_models/LICENSE.PE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

perception_models/LICENSE.PLM ADDED Viewed

	@@ -0,0 +1,124 @@

+FAIR Noncommercial Research License
+Last Updated: 17 April 2025
+“Acceptable Use Policy” means the FAIR Acceptable Use Policy, applicable to Research Materials, that is incorporated into this Agreement.
+“Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Research Materials set forth herein.
+“Documentation” means the specifications, manuals and documentation accompanying
+Research Materials distributed by Meta.
+“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+“Noncommercial Research Uses” means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.
+“Research Materials” means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta and made available under this Agreement.
+By clicking “I Accept” below or by using or distributing any portion or element of the Research Materials, you agree to be bound by this Agreement.
+1. License Rights and Redistribution.
+a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Research Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Research Materials.
+b. Redistribution and Use.
+ i. You will not use the Research Materials or any outputs or results of the Research Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
+ii. Distribution of Research Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Research Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
+iii.  If you submit for publication the results of research you perform on, using, or otherwise in connection with Research Materials, you must acknowledge the use of Research Materials in your publication.
+iv. Your use of the Research Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the FAIR Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
+2. User Support. Your Noncommercial Research Use of the Research Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use.  Meta is under no obligation to provide any support services for the Research Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
+3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE RESEARCH MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS.
+4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+5. Intellectual Property.
+a. Subject to Meta’s ownership of Research Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Research Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Research Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Research Materials.
+6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Research Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Research Materials. Sections 5, 6 and 9 shall survive the termination of this Agreement.
+7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
+8. Modifications and Amendments. Meta may modify this Agreement from time to time by posting a revised version at [INSERT URL]; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Research Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
+FAIR Acceptable Use Policy
+The Fundamental AI Research (FAIR) team at Meta seeks to further understanding of new and existing research domains with the mission of advancing the state-of-the-art in artificial intelligence through open research for the benefit of all.
+As part of this mission, Meta makes certain research materials available for noncommercial research use. Meta is committed to promoting the safe and responsible use of such research materials.
+Prohibited Uses
+You agree you will not use, or allow others to use, Research Materials to:
+ Violate the law or others’ rights, including to:
+Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+Violence or terrorism
+Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+Human trafficking, exploitation, and sexual violence
+The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+Sexual solicitation
+Any other criminal activity
+Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any technology using FAIR research materials
+Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of research artifacts related to the following:
+Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
+Guns and illegal weapons (including weapon development)
+Illegal drugs and regulated/controlled substances
+Operation of critical infrastructure, transportation technologies, or heavy machinery
+Self-harm or harm to others, including suicide, cutting, and eating disorders
+Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+3. Intentionally deceive or mislead others, including use of FAIR Research Materials related to the following:
+ Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+ Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+Generating, promoting, or further distributing spam
+ Impersonating another individual without consent, authorization, or legal right
+Representing that outputs of FAIR research materials or outputs from technology using FAIR research materials are human-generated
+Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+4. Fail to appropriately disclose to end users any known dangers of your Research Materials.
+Please report any violation of this Policy or other problems that could lead to a violation of this Policy by submitting a report here [https://docs.google.com/forms/d/e/1FAIpQLSeb11cryAopJ7LNrC4nxEUXrHY26hfkXQMf_uH-oFgA3WlYZQ/viewform].

perception_models/README.md ADDED Viewed

	@@ -0,0 +1,408 @@

+# Perception Models: Powerful Models for Image, Video, and Audio Perception
+[![Code License](https://img.shields.io/badge/Code_License-Apache_2.0-olive)](https://opensource.org/licenses/Apache-2.0)
+This repo is the home to the state-of-the-art for image and video _perception_: [**Perception Encoder (PE)**](https://arxiv.org/abs/2504.13181) for image, video, [audio](https://ai.meta.com/research/publications/pushing-the-frontier-of-audiovisual-perception-with-large-scale-multimodal-correspondence-learning/) encoding, and [**Perception Language Model (PLM)**](https://arxiv.org/abs/2504.13180) for decoding.
+> [!TIP]
+> Click to Navigate!
+>
+> [Perception Encoder and Perception Encoder Audio-Visual](#perception-encoder-pe)
+>
+> [Perception Language Model](#perception-language-model-plm)
+>
+> [Dataset Releases](#dataset-releases)
+## Updates
+* **[Dec-16-25]:** We have released the Perception Encoder Audio-Visual (PE-AV) and Perception Encoder Audio-Frame (PE-A-Frame) models: [[`Blog`](https://ai.meta.com/blog/sam-audio/)][[`paper`](https://ai.meta.com/research/publications/pushing-the-frontier-of-audiovisual-perception-with-large-scale-multimodal-correspondence-learning/)] :fire::fire:
+* **[Jul-14-25]:** PerceptionLM is now available in [Hugging Face transformers](https://huggingface.co/docs/transformers/main/en/model_doc/perception_lm). :fire::fire:
+* **[Jul-11-25]:** We have released 8 new checkpoints for [Perception Encoder](apps/pe/README.md): 2x small core models (T and S), 2x tiling-tuned lang models (G and L), and 4x smaller spatial models (L, B, S, T). Give them a try! :fire::fire::fire:
+* **[May-28-25]:** Perception Encoder has been integrated into [timm](https://github.com/huggingface/pytorch-image-models)! :fire::fire:
+* **[Apr-18-25]:** Perception Language Model (PLM) and PLM-VideoBench are added to lmms-eval. This makes it easy to reproduce PLM results and allows you to evaluate on the PLM-VideoBench. [[`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/638)] :fire::fire:
+* **[Apr-17-25]:** Perception Encoder (PE) and Perception Language Model (PLM) are released. [[`Blog`](https://ai.meta.com/blog/meta-fair-updates-perception-localization-reasoning)] :fire::fire:
+## Perception Encoder (PE)
+[![Data](https://img.shields.io/badge/Download-PE%20Data-ffcc00.svg)](https://huggingface.co/datasets/facebook/PE-Video)
+[![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Collection-blue)](https://huggingface.co/collections/facebook/perception-encoder-67f977c9a65ca5895a7f6ba1)
+[![Paper](https://img.shields.io/badge/Technical%20Report-Perception%20Encoder-b31b1b.svg)](https://ai.meta.com/research/publications/perception-encoder-the-best-visual-embeddings-are-not-at-the-output-of-the-network)
+[![Paper](https://img.shields.io/badge/Technical%20Report-Perception%20Encoder%20AV-b31b1b.svg)](https://ai.meta.com/research/publications/pushing-the-frontier-of-audiovisual-perception-with-large-scale-multimodal-correspondence-learning/)
+[![Paper](https://img.shields.io/badge/arXiv-2504.13181-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2504.13181)
+[![Colab Demo](https://img.shields.io/static/v1?label=Demo&message=Google%20Colab&logo=google&color=orange)](https://colab.research.google.com/github/facebookresearch/perception_models/blob/main/apps/pe/docs/pe_demo.ipynb)
+[![Model License](https://img.shields.io/badge/Model_License-Apache_2.0-olive)](https://opensource.org/licenses/Apache-2.0)
+[Perception Encoder (PE)](https://arxiv.org/abs/2504.13181) is a family of state-of-the-art vision and audio encoders for encoding images, video, and audio: PE core outperforms SigLIP2 on image and InternVideo2 on video benchmarks; PE lang can be used to outperform QwenVL2.5 and InternVL3 on vision language modeling; and PE spatial outperforms DINOv2 on dense prediction tasks. And all of this follows the same, easily scalable contrastive pretraining. Please see [README](apps/pe/README.md) for more details.
+<img src="apps/pe/docs/assets/teaser.png" style="width: 100%; margin: 0 auto; display: block;" />
+### Models
+PE has 4 types of checkpoints, each excelling in a different area of computer vision and audio understanding:
+ - [PE core](#vision-language-benchmarks): a CLIP model that excels in vision-language tasks such as zero-shot image and video classification and video retrieval.
+ - [PE lang](#multimodal-llm-benchmarks): an LLM-aligned PE that powers [PLM](https://arxiv.org/abs/2504.13180) to compete at the forefront of multimodal LLM benchmarks.
+ - [PE spatial](#vision-centric-benchmarks): a spatially tuned PE that outperforms the best spatial models for vision-centric tasks such as detection, depth estimation, and tracking.
+ - [PE audio-visual](#audio-visual-benchmarks): a CLIP Model that embeds audio, video, audio-video, and text into a joint embedding space.
+#### Vision-Language Benchmarks
+|    | Model | Checkpoint | IN-1k | IN-v2 | IN-A | ObjectNet | COCO-T2I | Kinetics-400 | VTT-T2V |
+|:--:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+|    | **T/16** 384px | [PE-Core-T16-384](https://huggingface.co/facebook/PE-Core-T16-384) | 62.1 | 54.7 | 21.1 | 43.9 | 33.0 | 41.5 | 28.8 |
+|    | **S/16** 384px | [PE-Core-S16-384](https://huggingface.co/facebook/PE-Core-S16-384) | 72.7 | 65.0 | 49.5 | 60.0 | 42.6 | 55.0 | 39.3 |
+|    | **B/16** 224px | [PE-Core-B16-224](https://huggingface.co/facebook/PE-Core-B16-224) | 78.4 | 71.7 | 62.4 | 71.9 | 50.9 | 65.6 | 47.6 |
+|    | **L/14** 336px | [PE-Core-L14-336](https://huggingface.co/facebook/PE-Core-L14-336) | 83.5 | 77.9 | 89.0 | 84.7 | 57.1 | 73.4 | 50.3 |
+|    | **G/14** 448px | [PE-Core-G14-448](https://huggingface.co/facebook/PE-Core-G14-448) | 85.4 | 80.2 | 92.6 | 88.2 | 58.1 | 76.9 | 51.2 |
+#### Multimodal LLM Benchmarks
+🔬 Controlled Setting:
+|    | Encoder | Checkpoint | Doc VQA (val) | InfoQA (val) | TextVQA | MVBench | PerceptionTest (val) | EgoSchema (val) |
+|:--:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+|    | **L/14** 448px | [PE-Lang-L14-448](https://huggingface.co/facebook/PE-Lang-L14-448) | 81.9 | 46.4 | 73.0 | 52.3 | 54.7 | 59.8 |
+|    | **G/14** 448px | [PE-Lang-G14-448](https://huggingface.co/facebook/PE-Lang-G14-448) | 84.4 | 48.3 | 75.2 | 52.4 | 56.0 | 62.0 |
+🔥 SotA Setting:
+|    | Model | Encoder | Doc VQA (test) | InfoQA (test) | TextVQA | MVBench | PerceptionTest (test) | EgoSchema (test) |
+|:--:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+|    | PLM-3B | [PE-Lang-L14-448-Tiling](https://huggingface.co/facebook/PE-Lang-L14-448-Tiling)* | 93.8 | 74.6 | 84.3 | 74.7 | 79.3 | 66.9 |
+|    | PLM-8B | [PE-Lang-G14-448-Tiling](https://huggingface.co/facebook/PE-Lang-G14-448-Tiling)* | 94.6 | 80.9 | 86.5 | 77.1 | 82.7 | 68.8 |
+\* These checkpoints were aligned with tiling. Use them if you use higher than 448 resolution with tiling in the LLM decoder.
+#### Vision-centric Benchmarks
+🦾 Main model:
+|    | Encoder | Checkpoint | ADE20k <br/> [Segmentation](https://github.com/open-mmlab/mmsegmentation)<br />Linear Probe mIoU | DAVIS<br /> [Tracking](https://github.com/facebookresearch/dino/blob/main/eval_video_segmentation.py) <br />Zero-Shot J&F  | LVIS <br /> [Mask R-CNN](../detection/detectron2_pe/) 1024px <br /> Box / Mask mAP | COCO <br/> [DETA](../detection/DETA_pe/) 1824px <br /> Box mAP |
+|:--:|:---:|:---:|:---:|:---:|:---:|:---:|
+|    | **G/14** 448px | [PE-Spatial-G14-448](https://huggingface.co/facebook/PE-Spatial-G14-448) | 49.3 | 61.5 | 54.2 / 49.3 | 66.0 |
+<div align="center">
+  <img src="apps/pe/docs/assets/spatial_correspondence.png" style="width: 80%; margin: 0 auto; padding-top: 20px; padding-bottom: 20px; display: block;" />
+  Visualization of PCA of non-masked visual tokens, mapped to RGB values.
+</div>
+⚗️ Distilled Models:
+|    | Encoder<br />(Distilled from G) | Checkpoint | ADE20k <br/> [Segmentation](https://github.com/open-mmlab/mmsegmentation)<br />Linear Probe mIoU | DAVIS<br /> [Tracking](https://github.com/facebookresearch/dino/blob/main/eval_video_segmentation.py) <br />Zero-Shot J&F  |
+|:--:|:---:|:---:|:---:|:---:|
+|    | **T/16** 512px | [PE-Spatial-T16-512](https://huggingface.co/facebook/PE-Spatial-T16-512) | 27.6 | 55.0 |
+|    | **S/16** 512px | [PE-Spatial-S16-512](https://huggingface.co/facebook/PE-Spatial-S16-512) | 37.5 | 57.5 |
+|    | **B/16** 512px | [PE-Spatial-B16-512](https://huggingface.co/facebook/PE-Spatial-B16-512) | 44.4 | 58.9 |
+|    | **L/14** 448px | [PE-Spatial-L14-448](https://huggingface.co/facebook/PE-Spatial-L14-448) | 48.1 | 60.6 |
+See paper for comparison to other models.
+#### Audio-Visual Benchmarks
+|    | Model | Checkpoint   | Avg Retrieval | AudioCaps T→A | AudioCaps T→V | AudioCaps V→A | Clotho T→A | Valor T→A | Valor T→V | VCTK A→T | VGGSound V→A | Internal V→A |
+|:--:|:-----:|--------------|---------------|---------------|---------------|---------------|------------|-----------|-----------|----------|---------------|---------------|
+| 🆕 | **AV S** 16 frames | [`pe-av-small-16-frame`](https://huggingface.co/facebook/pe-av-small-16-frame)  | 45.2          | 41.2          | 18.6          | 75.4          | 24.0       | 29.8      | 70.1      | 96.1     | 34.1          | 17.9          |
+| 🆕 | **AV B** 16 frames | [`pe-av-base-16-frame`](https://huggingface.co/facebook/pe-av-base-16-frame)   | 47.0          | 43.1          | 19.8          | 80.6          | 23.4       | 31.9      | 70.0      | 94.8     | 39.0          | 20.4          |
+| 🆕 | **AV L** 16 frames | [`pe-av-large-16-frame`](https://huggingface.co/facebook/pe-av-large-16-frame)  | 48.2          | 44.7          | 19.5          | 86.1          | 22.8       | 35.0      | 70.9      | 85.6     | 45.2          | 23.9          |
+| 🆕 | **AV S** all frames | [`pe-av-small`](https://huggingface.co/facebook/pe-av-small)           | 48.1          | 41.8          | 18.8          | 77.4          | 23.9       | 29.3      | 70.9      | 94.9     | 35.4          | 40.5          |
+| 🆕 | **AV B** all frames | [`pe-av-base`](https://huggingface.co/facebook/pe-av-base)            | 50.2          | 42.7          | 19.6          | 83.7          | 23.8       | 30.8      | 71.2      | 94.9     | 40.7          | 44.6          |
+| 🆕 | **AV L** all frames | [`pe-av-large`](https://huggingface.co/facebook/pe-av-large)           | 51.6          | 45.8          | 20.8          | 88.3          | 23.0       | 35.1      | 70.9      | 85.6     | 48.3          | 46.5          |
+#### Audio Event Localization Benchmarks
+|    | Model |     Checkpoint   | Internal Bench (AUROC)      | ASFX-SED (AUROC)        | AudioSet-Strong (AUROC) | DESED (AUROC) | UrbanSED (AUROC) |
+|:--:|:-----:|------------------|---------------------|------------------|-----------------------|-------------|-------------|
+| 🆕 | **A-Frame S** | [`pe-a-frame-small`](https://huggingface.co/facebook/pe-a-frame-small)| 0.91                | 0.83             | 0.96                  | 0.96        | 0.88        |
+| 🆕 | **A-Frame B** | [`pe-a-frame-base`](https://huggingface.co/facebook/pe-a-frame-base)| 0.92                | 0.83             | 0.96                  | 0.98        | 0.89        |
+| 🆕 | **A-Frame L** | [`pe-a-frame-large`](https://huggingface.co/facebook/pe-a-frame-large)| 0.91                | 0.83             | 0.96                  | 0.97        | 0.89        |
+### Getting Started with PE
+You can get started with the following example for image and text feature extraction or use our [Colab Demo](https://colab.research.google.com/github/facebookresearch/perception_models/blob/main/apps/pe/docs/pe_demo.ipynb)
+```python
+import torch
+from PIL import Image
+import core.vision_encoder.pe as pe
+import core.vision_encoder.transforms as transforms
+print("CLIP configs:", pe.CLIP.available_configs())
+# CLIP configs: ['PE-Core-G14-448', 'PE-Core-L14-336', 'PE-Core-B16-224', 'PE-Core-S16-384', 'PE-Core-T16-384']
+model = pe.CLIP.from_config("PE-Core-L14-336", pretrained=True)  # Downloads from HF
+model = model.cuda()
+preprocess = transforms.get_image_transform(model.image_size)
+tokenizer = transforms.get_text_tokenizer(model.context_length)
+image = preprocess(Image.open("docs/assets/cat.png")).unsqueeze(0).cuda()
+text = tokenizer(["a diagram", "a dog", "a cat"]).cuda()
+with torch.no_grad(), torch.autocast("cuda"):
+    image_features, text_features, logit_scale = model(image, text)
+    text_probs = (logit_scale * image_features @ text_features.T).softmax(dim=-1)
+print("Label probs:", text_probs)  # prints: [[0.0, 0.0, 1.0]]
+```
+> [!TIP]
+> See [`apps/pe/README.md`](apps/pe/README.md) for details and how to get started!
+### Getting Started with PE-AV
+```python
+import os
+from core.audio_visual_encoder import PEAudioVisual, PEAudioVisualTransform
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = PEAudioVisual.from_config("pe-av-large", pretrained=True).to(device)
+transform = PEAudioVisualTransform.from_config("pe-av-large")
+video_files = ["assets/train.mp4", "assets/office.mp4"]
+descriptions = [
+    "A person talking with sirens and a train in the background",
+    "Two people talking in an office, with sounds of workers typing on a keyboard"
+]
+def embed(videos=None, audio=None, text=None):
+    inputs = transform(videos=videos, audio=audio, text=text)
+    inputs = inputs.to(device)
+    with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
+        return model(**inputs)
+vt_outputs = embed(videos=video_files, text=descriptions)
+avt_outputs = embed(videos=video_files, audio=video_files, text=descriptions)
+at_outputs = embed(audio=video_files, text=descriptions)
+# Compute dot product between visual and text
+vt_dot_products = torch.einsum("ij,ij->i", vt_outputs.visual_embeds, vt_outputs.visual_text_embeds)
+# Compute dot product between audio_visual and text
+avt_dot_products = torch.einsum("ij,ij->i", avt_outputs.audio_visual_embeds, avt_outputs.audio_visual_text_embeds)
+# Compute dot product between audio and text
+at_dot_products = torch.einsum("ij,ij->i", at_outputs.audio_embeds, at_outputs.audio_text_embeds)
+# Compute dot product between audio and video
+av_dot_products = torch.einsum("ij,ij->i", avt_outputs.audio_embeds, avt_outputs.video_embeds)
+```
+### Getting Started with PE-A-Frame
+```python
+from core.audio_visual_encoder import (
+    PEAudioFrame,
+    PEAudioFrameTransform,
+)
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = PEAudioFrame.from_config("pe-a-frame-large", pretrained=True).to(device)
+transform = PEAudioFrameTransform.from_config("pe-a-frame-large")
+descriptions = ["a person talking"]
+inputs = transform(
+    audio=["assets/office.mp4"],
+    text=descriptions,
+).to(device)
+with torch.inference_mode():
+    outputs = model(**inputs)
+# Print the spans for each description (start and end timestamps for when they occur in the audio)
+for description, spans in zip(descriptions, outputs.spans):
+    span_str = ", ".join([f"({start:.2f}, {end:.2f})" for start, end in spans])
+    print(f'"{description}": [{span_str}]')
+```
+> [!TIP]
+> See [`apps/pe/README.md`](apps/pe/README.md) for additional details!
+## Perception Language Model (PLM)
+[![Data](https://img.shields.io/badge/Download-PLM%20Data-ffcc00.svg)](https://huggingface.co/datasets/facebook/PLM-Video-Human)
+[![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Collection-blue)](https://huggingface.co/collections/facebook/perception-lm-67f9783f171948c383ee7498)
+[![Paper](https://img.shields.io/badge/Technical%20Report-PerceptionLM-b31b1b.svg)](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding)
+[![Paper](https://img.shields.io/badge/arXiv-2504.13180-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2504.13180)
+[![Colab](https://img.shields.io/badge/Google%20Colab-Tutorials-red)](apps/plm/notebook_demos)
+[![ModelLicense](https://img.shields.io/badge/Model_License-FAIR_Research_License-lightgrey)](LICENSE.PLM)
+PerceptionLM (PLM) is a family of open and fully reproducible models to facilitate research in vision-language modeling (VLM). In conjunction with PE, it is powerful enough to compete with the latest state-of-the-art VLMs such as InternVL3 and QwenVL2.5, while using _fully open data_. We also release the largest spatiotemporally annotated video dense captioning and fine-grained human activity recognition datasets to ever exist.
+![Description of the image](apps/plm/docs/plm_main_fig.png)
+### Models
+PLM releases models in three different sizes (1B, 3B and 8B).
+* [Perception-LM-1B](https://huggingface.co/facebook/Perception-LM-1B): A PLM model trained using Llama-3.2-1B-Instruct base LLM.
+* [Perception-LM-3B](https://huggingface.co/facebook/Perception-LM-3B): A PLM model trained using Llama-3.2-3B-Instruct base LLM.
+* [Perception-LM-8B](https://huggingface.co/facebook/Perception-LM-8B): A PLM model trained using Llama-3.1-8B-Instruct base LLM.
+#### PLM Image Benchmark Results
+| Model  | DocVQA | ChartQA | TextVQA | InfoQA | AI2D  | OCRBench | COCO | Nocap | Flickr | MMMU | VQAv2 | OKVQA | VizWiz | MME | SEED | BLINK | CVBench | RealWorldQA | VSR | POPE |
+|:---------:|:--------:|:---------:|:---------:|:--------:|:------:|:----------:|:------------:|:-------------:|:--------------:|:------:|:-------:|:--------:|:--------:|:-----:|:------:|:-------:|:----------:|:-------------:|:-----:|:------:|
+| PLM1B  | 90.7   | 78.6    | 82.1    | 63.0   | 84.9 | 807      | 138.6      | 124.2       | 100.5        | 34.8 | 81.7  | 61.0   | 59.7   | 1603| 76.3 | 46.8  | 73.8     | 67.1        | 68.8| 88.4 |
+| PLM3B  | 93.8   | 84.3    | 84.3    | 74.6   | 90.9 | 830      | 144.9      | 126.5       | 98.0         | 41.2 | 84.3  | 66.8   | 64.0   | 1879| 78.5 | 55.4  | 81.4     | 72.4        | 80.4| 88.7 |
+| PLM8B  | 94.6   | 85.5    | 86.5    | 80.9   | 92.7 | 870      | 146.7      | 129.9       | 105.6        | 46.1 | 85.6  | 69.6   | 67.0   | 1989| 79.3 | 56.0  | 81.3     | 75.0        | 82.8| 89.9 |
+#### PLM Video Benchmark Results
+| Model  | VATEX                    | DREAM&nbsp;1K      | How2QA       | MVBench      | NExTQA      | PerceptionTest&nbsp;(test) | STAR       | TVQA       | VideoMME        | TVBench      | ActivityNetQA   | EgoSchema&nbsp;(test) | TemporalBench    | TOMATO     | MotionBench&nbsp;(dev) | TempCompass&nbsp;(MCQ) | CGBench&nbsp;(clue) | Charades&nbsp;STA   | VideoHallucer   | Halluc.&nbsp;EventHallusion |
+|:-------------:|:---------------------------:|:-----------------------:|:---------------------:|:-------------:|:-------------:|:--------------------------:|:----------:|:----------:|:----------------:|:-------------:|:--------------------:|:----------------------:|:---------------------:|:------------:|:------------------------:|:-----------------------:|:---------------------:|:-------------------:|:-------------------------------:|:--------------------------------:|
+| PLM1B  | 92.5 | 34.3 | 86.4 | 70.1 | 80.3 | 72.7 | 83.7 | 50.3 | 49.2 | 50.4 | 62.5 | 60.4 | 18.2 | 25.5 | 52.2 | 64.6 | 43.6 | 55.2 | 49.2 | 79.5 |
+| PLM3B  | 96.1 | 37.4 | 89.4 | 74.7 | 83.4 | 79.3 | 84.8 | 55.3 | 54.9 | 58.9 | 66.2 | 66.9 | 23.4 | 30.9 | 60.4 | 69.3 | 47.2 | 57.7 | 55.5 | 76.5 |
+| PLM8B  | 99.7 | 35.9 | 90.7 | 77.1 | 84.1 | 82.7 | 84.9 | 59.3 | 58.3 | 63.5 | 67.3 | 68.8 | 28.3 | 33.2 | 61.4 | 72.7 | 46.4 | 58.6 | 57.7 | 77.3 |
+### PLM Resources
+| Resource | Description | Documentation                                          |
+| --- | --- |--------------------------------------------------------|
+| **Evaluation** | Evaluation of PLM using lmms-eval | [`docs/evaluation.md`](apps/plm/docs/evaluation.md)    |
+| **Training / Finetuning** | Training and finetuning instructions for PLM | [`docs/training.md`](apps/plm/docs/training.md)                 |
+| **PLM-VideoBench** | Evaluation on PLM-VideoBench using lmms-eval | [`docs/plm_videobench.md`](apps/plm/docs/plm_videobench.md)     |
+| **End-to-End Finetuning Example** | End-to-end finetuning example on radiology images | [`docs/finetune_example.md`](apps/plm/docs/finetune_example.md) |
+| **Generating Response** | Generate responses using a trained model with `generate.py` | [`generate.py`](apps/plm/generate.py)                           |
+> [!TIP]
+> See [`apps/plm/README.md`](apps/plm/README.md) for details and how to get started!
+## Dataset Releases
+### 🎥 [PE-Video-Dataset (PVD)](https://huggingface.co/datasets/facebook/PE-Video)
+PVD comprises 1M high quality and diverse videos. Among them, 120K videos are accompanied by automated and human-verified annotations, and all videos are accompanied by video descriptions and keywords. The videos are motion-centered, covering both first-person and third-person views with a wide coverage of scenes.
+🔹 [**PVD**](https://huggingface.co/datasets/facebook/PE-Video) - 1M High-Quality Human Annotated Video Dataset
+<table>
+   <tr>
+    <td colspan="2" align="center"><strong>PVD</strong></td>
+  </tr>
+  <tr>
+    <td align="center">
+      <img src="https://github.com/user-attachments/assets/ead8a7ed-4d5b-465a-a396-68948683dfcf" alt="output_2" width="300"/><br>
+      A person's hands pruning a plant with green leaves.
+    </td>
+    <td align="center">
+      <img src="https://github.com/user-attachments/assets/9e509e49-f550-4c5c-9571-ed57c5118227" alt="output" width="300"/><br>
+      A detailed diorama of a rural landscape featuring a horse-drawn carriage moving along a dirt path
+    </td>
+  </tr>
+</table>
+---
+### 🎥 [PLM-Video-Human](https://huggingface.co/datasets/facebook/PLM-Video-Human)
+PLM-Video-Human is a collection of human-annotated resources for training Vision Language Models, focused on detailed video understanding. Training tasks include:
+🔹 [**FGQA**](https://huggingface.co/datasets/facebook/PLM-Video-Human#fine-grained-question-answering-fgqa) — Fine-Grained Question Answering
+🔹 [**RTLoc**](https://huggingface.co/datasets/facebook/PLM-Video-Human#region-temporal-localization-rtloc) — Region-Temporal Localization
+🔹 [**RCap**](https://huggingface.co/datasets/facebook/PLM-Video-Human#region-video-captioning-rcap) — Region Video Captioning
+🔹 [**RDCap**](https://huggingface.co/datasets/facebook/PLM-Video-Human#region-dense-temporal-captioning-rdcap) — Region Dense Temporal Captioning
+<table>
+  <tr>
+    <td colspan="2" align="center"><strong>FGQA</strong></td>
+  </tr>
+  <tr>
+    <td colspan="2" align="center">
+      <img src="https://github.com/user-attachments/assets/4f5c6c5e-687d-49df-9bf8-db9ec7f1f281" alt="fgqa" width="500"/>
+    </td>
+  </tr>
+  <tr>
+    <th>Question</th>
+    <th>Answer</th>
+  </tr>
+  <tr>
+    <td>In what direction do you move the tool while removing the shell?</td>
+    <td>Both clockwise and anticlockwise.</td>
+  </tr>
+</table>
+<table>
+   <tr>
+    <td colspan="2" align="center"><strong>STC</strong></td>
+  </tr>
+  <tr>
+    <td colspan="2" align="center">
+      <img src="https://github.com/user-attachments/assets/a2a129c7-c1e9-47b5-a3b4-fc96a237a9fb" alt="stc" width="500"/>
+    </td>
+  </tr>
+  <tr>
+    <th>Time (s) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</th>
+    <th>Description</th>
+  </tr>
+  <tr>
+    <td>[0, 4]</td>
+    <td>The masked subject is a young boy wearing a red jacket and gray pants. He is grasping a monkey bar–like activity in a playground.</td>
+  </tr>
+  <tr>
+    <td>[5, 14]</td>
+    <td>He lets go of his hands and runs to the right side of the frame.</td>
+  </tr>
+  <tr>
+    <td>[15, 30]</td>
+    <td>The subject is out of frame.</td>
+  </tr>
+  <tr>
+    <td>[31, 45]</td>
+    <td>The subject runs back into the frame toward the higher monkey bar in the playground.</td>
+  </tr>
+  <tr>
+    <td>[46, 74]</td>
+    <td>He jumps underneath the metal bar and looks up at it. A man wearing a white polo runs toward the subject.</td>
+  </tr>
+  <tr>
+    <td>[75, 116]</td>
+    <td>The man in the white polo lifts the subject upward so he can grasp the higher metal bar. The subject holds onto the bar and hangs from it.</td>
+  </tr>
+</table>
+---
+### 🤖 Auto-Generated Datasets
+Synthetic image/video captions and QAs used in PLM; please refer to the paper, Section 3 (PLM), for more details. The synthetic annotations cover: SA1B, Openimages, Objects365, ArxivQA, UCSF, PDFAcc, YT-1B, Ego4d with captions, YT-1B with MCQAs and Ego4d with QAs.
+🖼️ [**PLM-Image-Auto**](https://huggingface.co/datasets/facebook/PLM-Image-Auto) — Automatically generated image datasets
+📹 [**PLM-Video-Auto**](https://huggingface.co/datasets/facebook/PLM-Video-Auto) — Automatically generated video datasets
+---
+## Installation :wrench:
+```shell
+git clone https://github.com/facebookresearch/perception_models.git
+cd perception_models
+conda create --name perception_models python=3.12
+conda activate perception_models
+# Install PyTorch
+pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 xformers --index-url https://download.pytorch.org/whl/cu124
+# We use torchcodec for decoding videos into PyTorch tensors
+conda install ffmpeg -c conda-forge
+pip install torchcodec==0.1 --index-url=https://download.pytorch.org/whl/cu124
+pip install -e .
+```
+This will install an editable version of the repo, allowing you to make changes to the code without needing to reinstall the package every time.
+## 🙏 Acknowledgement
+We are thankful to [Meta Lingua](https://github.com/facebookresearch/lingua) for releasing their code as open-source contributions. The code structure and code implementation of the LLM is directly forked from [Meta Lingua](https://github.com/facebookresearch/lingua). We are also thankful to [Open_CLIP](https://github.com/mlfoundations/open_clip) for open-source contributions in CLIP training, and [CLIP_benchmark](https://github.com/LAION-AI/CLIP_benchmark) for CLIP model evaluation.
+## 📜 Citation
+```BibTeX
+@article{bolya2025PerceptionEncoder,
+  title={Perception Encoder: The best visual embeddings are not at the output of the network},
+  author={Daniel Bolya and Po-Yao Huang and Peize Sun and Jang Hyun Cho and Andrea Madotto and Chen Wei and Tengyu Ma and Jiale Zhi and Jathushan Rajasegaran and Hanoona Rasheed and Junke Wang and Marco Monteiro and Hu Xu and Shiyu Dong and Nikhila Ravi and Daniel Li and Piotr Doll{\'a}r and Christoph Feichtenhofer},
+  journal={arXiv:2504.13181},
+  year={2025}
+}
+@article{cho2025PerceptionLM,
+  title={PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding},
+  author={Jang Hyun Cho and Andrea Madotto and Effrosyni Mavroudi and Triantafyllos Afouras and Tushar Nagarajan and Muhammad Maaz and Yale Song and Tengyu Ma and Shuming Hu and Hanoona Rasheed and Peize Sun and Po-Yao Huang and Daniel Bolya and Suyog Jain and Miguel Martin and Huiyu Wang and Nikhila Ravi and Shashank Jain and Temmy Stark and Shane Moon and Babak Damavandi and Vivian Lee and Andrew Westbury and Salman Khan and Philipp Kr\"{a}henb\"{u}hl and Piotr Doll{\'a}r and Lorenzo Torresani and Kristen Grauman and Christoph Feichtenhofer},
+  journal={arXiv:2504.13180},
+  year={2025}
+}
+```

perception_models/__pycache__/legrad_pe_audio.cpython-310.pyc ADDED Viewed

Binary file (6.49 kB). View file

perception_models/__pycache__/legrad_pe_audio.cpython-313.pyc ADDED Viewed

Binary file (10.2 kB). View file

perception_models/__pycache__/legrad_pe_image.cpython-312.pyc ADDED Viewed

Binary file (12.8 kB). View file

perception_models/__pycache__/legrad_pe_image.cpython-313.pyc ADDED Viewed

Binary file (11.5 kB). View file

perception_models/apps/detection/DETA_pe/README.md ADDED Viewed

	@@ -0,0 +1,53 @@

+# SOTA COCO Object Detection with PE
+## Getting started
+Please refer to [INSTALL.md](../INSTALL.md) for installation and dataset preparation instructions.
+Also install [Deformable Attention](models/ops/make.sh) ops.
+## Results and Fine-tuned Models
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">detector</th>
+<th valign="bottom">vision encoder</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">box(TTA)<br/>AP</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: DETA -->
+ <tr><td align="left">DETA</td>
+<td align="center">PE spatial G</td>
+<td align="center"> 65.2 </td>
+<td align="center"> 66.0 </td>
+<td align="center"><a href="https://huggingface.co/facebook/PE-Detection/resolve/main/deta_coco_1824pix.pth">model</a></td>
+</tr>
+</tbody></table>
+## Training
+We train in four stages: Objects365 (12 epochs, 1024 px), Objects365 (6 epochs, 1536 px), COCO (12 epochs, 1728 px), and COCO (3 epochs, 1824 px):
+```
+sbatch scripts/pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh
+sbatch scripts/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh
+sbatch scripts/finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh
+sbatch scripts/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh
+```
+## Evaluation
+```
+bash scripts/eval_1824pix.sh --resume deta_coco_1824pix.pth
+```
+## Evaluation with TTA (Test-Time Augmentation)
+```
+sbatch scripts/eval_tta_slurm_1824pix.sh --resume deta_coco_1824pix.pth
+```
+Note: If you get 65.9 AP, the difference is probably caused by different package versions; trying different hyperparameters, such as `--quad_scale 0.4`, will give 66.0 AP.

perception_models/apps/detection/DETA_pe/datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+import torch.utils.data
+from .coco import build as build_coco
+from .objects365 import build as build_objects365
+from .torchvision_datasets import CocoDetection
+def get_coco_api_from_dataset(dataset):
+    for _ in range(10):
+        # if isinstance(dataset, torchvision.datasets.CocoDetection):
+        #     break
+        if isinstance(dataset, torch.utils.data.Subset):
+            dataset = dataset.dataset
+    if isinstance(dataset, CocoDetection):
+        return dataset.coco
+def build_dataset(image_set, args):
+    if args.dataset_file == "objects365":
+        return build_objects365(image_set, args)
+    if args.dataset_file == "coco":
+        return build_coco(image_set, args)
+    if args.dataset_file == "coco_panoptic":
+        # to avoid making panopticapi required for coco
+        from .coco_panoptic import build as build_coco_panoptic
+        return build_coco_panoptic(image_set, args)
+    raise ValueError(f"dataset {args.dataset_file} not supported")

perception_models/apps/detection/DETA_pe/datasets/coco.py ADDED Viewed

	@@ -0,0 +1,345 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+COCO dataset which returns image_id for evaluation.
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+import random
+from pathlib import Path
+import datasets.transforms as T
+import torch
+import torch.utils.data
+import torchvision.transforms.functional as F
+from pycocotools import mask as coco_mask
+from util.misc import get_local_rank, get_local_size
+from .torchvision_datasets import CocoDetection as TvCocoDetection
+class CocoDetection(TvCocoDetection):
+    def __init__(
+        self,
+        img_folder,
+        ann_file,
+        transforms,
+        return_masks,
+        cache_mode=False,
+        local_rank=0,
+        local_size=1,
+        test_hflip_aug=False,
+        tta=False,
+        is_train=False,
+        lsj_img_size=1824,
+    ):
+        super(CocoDetection, self).__init__(
+            img_folder,
+            ann_file,
+            cache_mode=cache_mode,
+            local_rank=local_rank,
+            local_size=local_size,
+        )
+        self._transforms = transforms
+        self.prepare = ConvertCocoPolysToMask(return_masks)
+        self.test_hflip_aug = test_hflip_aug
+        self.tta = tta
+        if lsj_img_size == 1728: # for back-compatibility
+            self.tta_image_size = [1536, 1152,]
+        else:
+            self.tta_image_size = [1728, 1536, 1344,]
+        self.is_train = is_train
+    def __getitem__(self, idx):
+        img, target = super(CocoDetection, self).__getitem__(idx)
+        image_id = self.ids[idx]
+        target = {"image_id": image_id, "annotations": target}
+        img, target = self.prepare(img, target)
+        if self._transforms is not None:
+            img, target = self._transforms(img, target)
+        if self.test_hflip_aug:
+            flipped_img = torch.flip(img, dims=[-1])
+            new_img = torch.cat([img, flipped_img], dim=0)
+            return new_img, target
+        elif self.tta:
+            tta_images = [img]
+            flipped_img = torch.flip(img, dims=[-1])
+            tta_images.append(flipped_img)
+            _, height, width = img.shape
+            max_size_len = height if height >= width else width
+            for new_max_size in self.tta_image_size:
+                scale = new_max_size / max_size_len
+                new_height, new_width = int(scale * height), int(scale * width)
+                new_img = F.resize(img, size=(new_height, new_width))
+                tta_images.append(new_img)
+                flipped_img = torch.flip(new_img, dims=[-1])
+                tta_images.append(flipped_img)
+            return tta_images, target
+        else:
+            return img, target
+def convert_coco_poly_to_mask(segmentations, height, width):
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = torch.as_tensor(mask, dtype=torch.uint8)
+        mask = mask.any(dim=2)
+        masks.append(mask)
+    if masks:
+        masks = torch.stack(masks, dim=0)
+    else:
+        masks = torch.zeros((0, height, width), dtype=torch.uint8)
+    return masks
+class ConvertCocoPolysToMask(object):
+    """Turn raw COCO-style annotations into a dict of tensors.
+
+    Called as ``converter(image, target)`` where ``target`` holds ``"image_id"``
+    and ``"annotations"``. Returns ``(image, new_target)``; the image itself is
+    passed through unchanged.
+    """
+    def __init__(self, return_masks=False):
+        # when True, polygon segmentations are rasterized into target["masks"]
+        self.return_masks = return_masks
+    def __call__(self, image, target):
+        # presumably a PIL image, whose .size is (width, height) -- confirm against caller
+        w, h = image.size
+        image_id = target["image_id"]
+        image_id = torch.tensor([image_id])
+        anno = target["annotations"]
+        # drop crowd annotations; objects without an "iscrowd" key are kept
+        anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0]
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        # turn the last two columns from sizes into far-corner coordinates
+        boxes[:, 2:] += boxes[:, :2]
+        # clamp x coordinates to [0, w] and y coordinates to [0, h], in place
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+        classes = [obj["category_id"] for obj in anno]
+        classes = torch.tensor(classes, dtype=torch.int64)
+        if self.return_masks:
+            segmentations = [obj["segmentation"] for obj in anno]
+            masks = convert_coco_poly_to_mask(segmentations, h, w)
+        keypoints = None
+        # keypoints are only read when the first annotation carries them
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                # regroup the flat list into triplets per keypoint
+                keypoints = keypoints.view(num_keypoints, -1, 3)
+        # keep only boxes with strictly positive width and height after clamping
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+        if self.return_masks:
+            masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+        target = {}
+        target["boxes"] = boxes
+        target["labels"] = classes
+        if self.return_masks:
+            target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor(
+            [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]
+        )
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+        # both entries start out as the input image's (h, w)
+        target["orig_size"] = torch.as_tensor([int(h), int(w)])
+        target["size"] = torch.as_tensor([int(h), int(w)])
+        return image, target
+def make_coco_transforms(image_set, bigger):
+    normalize = T.Compose(
+        [T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
+    )
+    if "train" in image_set:
+        scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+    if "val" in image_set or "test" in image_set:
+        scales = [800]
+    max_size = 1333
+    if bigger:
+        scales = [int(1.5 * s) for s in scales]
+        max_size = 2000
+    if image_set == "train":
+        augmentation_list = [
+            T.RandomHorizontalFlip(),
+            T.RandomSelect(
+                T.RandomResize(scales, max_size=max_size),
+                T.Compose(
+                    [
+                        T.RandomResize([400, 500, 600]),
+                        T.RandomSizeCrop(384, 600),
+                        T.RandomResize(scales, max_size=max_size),
+                    ]
+                ),
+            ),
+            normalize,
+        ]
+        return T.Compose(augmentation_list)
+    if image_set == "val":
+        return T.Compose(
+            [
+                T.RandomResize(scales, max_size=max_size),
+                normalize,
+            ]
+        )
+    raise ValueError(f"unknown {image_set}")
+def make_coco_transforms_lsj(
+    image_set, image_size, lsj_img_train_min=480, lsj_strong_aug=False
+):
+    """
+    Build the transform pipeline used to approximate large-scale-jitter (LSJ) augmentation.
+    Args:
+        image_set: split name; a pipeline is only returned for names containing "train"
+            and for exactly "val", anything else raises ValueError.
+        image_size: LSJ canvas size; scales and max_size are derived from it.
+        lsj_img_train_min: smallest training scale (training scales step by 32 up to image_size).
+        lsj_strong_aug: when true, color jitter and random grayscale are prepended for training.
+    Reference: https://github.com/facebookresearch/detectron2/blob/main/projects/ViTDet/configs/common/coco_loader_lsj.py
+    import detectron2.data.transforms as T
+    from detectron2 import model_zoo
+    from detectron2.config import LazyCall as L
+    # Data using LSJ
+    image_size = 1024
+    dataloader = model_zoo.get_config("common/data/coco.py").dataloader
+    dataloader.train.mapper.augmentations = [
+        L(T.RandomFlip)(horizontal=True),  # flip first
+        L(T.ResizeScale)(
+            min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size
+        ),
+        L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False),
+    ]
+    dataloader.train.mapper.image_format = "RGB"
+    dataloader.train.total_batch_size = 64
+    # recompute boxes due to cropping
+    dataloader.train.mapper.recompute_boxes = True
+    dataloader.test.mapper.augmentations = [
+        L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size),
+    ]
+    """
+    """
+    In our implementation, we simulate lsj data augmentation by:
+    (1) first the following augmentations
+    (2) then padding to (image_size, image_size) in collator, see util/misc/collate_fn_lsj.py
+    """
+    normalize = T.Compose(
+        [T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
+    )
+    # NOTE(review): ``scales`` stays unassigned when image_set matches none of the
+    # substrings below; such inputs still end in the ValueError at the bottom
+    # because ``scales`` is only read inside the train/val branches.
+    if "train" in image_set:
+        scales = [scale for scale in range(lsj_img_train_min, image_size, 32)]
+    if "val" in image_set or "test" in image_set or "unlabel" in image_set:
+        scales = [image_size - 32]
+    # max_size = 1333
+    # if bigger:
+    #     scales = [int(1.5 * s) for s in scales]
+    #     max_size = 2000
+    max_size = image_size - 32  # for some weird bugs (reason not recorded here)
+    augmentation_list = []
+    if "train" in image_set:
+        if lsj_strong_aug:
+            # photometric augmentations go first, before any geometric change
+            augmentation_list.extend(
+                [
+                    T.ColorJitter((0.4, 0.4, 0.4, 0.1), p=0.5),
+                    T.RandomGrayscale(p=0.2),
+                    # T.RandomErasingP05(),
+                ]
+            )
+        augmentation_list.extend(
+            [
+                T.RandomHorizontalFlip(),
+                T.RandomSelect(
+                    # similar to (T.ResizeScale)(min_scale=0.1, max_scale=1.0, target_height=image_size, target_width=image_size) and pad
+                    T.RandomResize(scales, max_size=max_size),
+                    # similar to (T.ResizeScale)(min_scale=1.0, max_scale=2.0, target_height=image_size, target_width=image_size) and crop
+                    T.Compose(
+                        [
+                            T.RandomResize([400, 500, 600]),
+                            T.RandomSizeCrop(384, 600),
+                            T.RandomResize([max_size], max_size=max_size),
+                        ]
+                    ),
+                ),
+                normalize,
+            ]
+        )
+        return T.Compose(augmentation_list)
+    # evaluation: a single fixed-scale resize followed by normalization
+    if image_set == "val":
+        return T.Compose(
+            [
+                T.RandomResize(scales, max_size=max_size),
+                normalize,
+            ]
+        )
+    raise ValueError(f"unknown {image_set}")
+def build(image_set, args):
+    root = Path(args.coco_path)
+    assert root.exists(), f"provided COCO path {root} does not exist"
+    mode = "instances"
+    PATHS = {
+        "train": (root / "train2017", root / "annotations" / f"{mode}_train2017.json"),
+        "val": (root / "val2017", root / "annotations" / f"{mode}_val2017.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    if args.lsj:
+        coco_transform = make_coco_transforms_lsj(
+            image_set,
+            args.lsj_img_size,
+            args.lsj_img_train_min,
+            args.lsj_strong_aug,
+        )
+    else:
+        coco_transform = make_coco_transforms(image_set, args.bigger)
+    dataset = CocoDetection(
+        img_folder,
+        ann_file,
+        transforms=coco_transform,
+        return_masks=args.masks,
+        cache_mode=args.cache_mode,
+        local_rank=get_local_rank(),
+        local_size=get_local_size(),
+        test_hflip_aug=args.test_hflip_aug,
+        tta=args.tta,
+        is_train=("train" in image_set),
+        lsj_img_size=args.lsj_img_size,
+    )
+    return dataset

perception_models/apps/detection/DETA_pe/datasets/coco_eval.py ADDED Viewed

	@@ -0,0 +1,265 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+COCO evaluator that works in distributed mode.
+Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
+The difference is that there is less copy-pasting from pycocotools
+in the end of the file, as python3 can suppress prints with contextlib
+"""
+import os
+import contextlib
+import copy
+import numpy as np
+import torch
+from pycocotools.cocoeval import COCOeval
+from pycocotools.coco import COCO
+import pycocotools.mask as mask_util
+from util.misc import all_gather
+class CocoEvaluator(object):
+    def __init__(self, coco_gt, iou_types):
+        assert isinstance(iou_types, (list, tuple))
+        coco_gt = copy.deepcopy(coco_gt)
+        self.coco_gt = coco_gt
+        self.iou_types = iou_types
+        self.coco_eval = {}
+        for iou_type in iou_types:
+            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
+        self.img_ids = []
+        self.eval_imgs = {k: [] for k in iou_types}
+    def update(self, predictions):
+        img_ids = list(np.unique(list(predictions.keys())))
+        self.img_ids.extend(img_ids)
+        for iou_type in self.iou_types:
+            results = self.prepare(predictions, iou_type)
+            # suppress pycocotools prints
+            with open(os.devnull, 'w') as devnull:
+                with contextlib.redirect_stdout(devnull):
+                    coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
+            coco_eval = self.coco_eval[iou_type]
+            coco_eval.cocoDt = coco_dt
+            coco_eval.params.imgIds = list(img_ids)
+            img_ids, eval_imgs = evaluate(coco_eval)
+            self.eval_imgs[iou_type].append(eval_imgs)
+    def synchronize_between_processes(self):
+        for iou_type in self.iou_types:
+            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
+            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
+    def accumulate(self):
+        for coco_eval in self.coco_eval.values():
+            coco_eval.accumulate()
+    def summarize(self):
+        for iou_type, coco_eval in self.coco_eval.items():
+            print("IoU metric: {}".format(iou_type))
+            coco_eval.summarize()
+    def prepare(self, predictions, iou_type):
+        if iou_type == "bbox":
+            return self.prepare_for_coco_detection(predictions)
+        elif iou_type == "segm":
+            return self.prepare_for_coco_segmentation(predictions)
+        elif iou_type == "keypoints":
+            return self.prepare_for_coco_keypoint(predictions)
+        else:
+            raise ValueError("Unknown iou type {}".format(iou_type))
+    def prepare_for_coco_detection(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+            boxes = prediction["boxes"]
+            boxes = convert_to_xywh(boxes).tolist()
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        "bbox": box,
+                        "score": scores[k],
+                    }
+                    for k, box in enumerate(boxes)
+                ]
+            )
+        return coco_results
+    def prepare_for_coco_segmentation(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+            scores = prediction["scores"]
+            labels = prediction["labels"]
+            masks = prediction["masks"]
+            masks = masks > 0.5
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            rles = [
+                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
+                for mask in masks
+            ]
+            for rle in rles:
+                rle["counts"] = rle["counts"].decode("utf-8")
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        "segmentation": rle,
+                        "score": scores[k],
+                    }
+                    for k, rle in enumerate(rles)
+                ]
+            )
+        return coco_results
+    def prepare_for_coco_keypoint(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+            boxes = prediction["boxes"]
+            boxes = convert_to_xywh(boxes).tolist()
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            keypoints = prediction["keypoints"]
+            keypoints = keypoints.flatten(start_dim=1).tolist()
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        'keypoints': keypoint,
+                        "score": scores[k],
+                    }
+                    for k, keypoint in enumerate(keypoints)
+                ]
+            )
+        return coco_results
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(1)
+    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
+def merge(img_ids, eval_imgs):
+    all_img_ids = all_gather(img_ids)
+    all_eval_imgs = all_gather(eval_imgs)
+    merged_img_ids = []
+    for p in all_img_ids:
+        merged_img_ids.extend(p)
+    merged_eval_imgs = []
+    for p in all_eval_imgs:
+        merged_eval_imgs.append(p)
+    merged_img_ids = np.array(merged_img_ids)
+    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
+    # keep only unique (and in sorted order) images
+    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+    merged_eval_imgs = merged_eval_imgs[..., idx]
+    return merged_img_ids, merged_eval_imgs
+def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
+    img_ids, eval_imgs = merge(img_ids, eval_imgs)
+    img_ids = list(img_ids)
+    eval_imgs = list(eval_imgs.flatten())
+    coco_eval.evalImgs = eval_imgs
+    coco_eval.params.imgIds = img_ids
+    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+#################################################################
+# From pycocotools, just removed the prints and fixed
+# a Python3 bug about unicode not defined
+#################################################################
+def evaluate(self):
+    '''
+    Run per image evaluation on the images listed in self.params.imgIds.
+    Unlike the pycocotools method this was copied from, the results are returned
+    rather than stored in self.evalImgs; ``self`` is the COCOeval-like object
+    passed in by the caller.
+    :return: tuple (imgIds, evalImgs) where evalImgs is an array reshaped to
+             (len(catIds), len(areaRng), len(imgIds))
+    '''
+    # tic = time.time()
+    # print('Running per image evaluation...')
+    p = self.params
+    # add backward compatibility if useSegm is specified in params
+    if p.useSegm is not None:
+        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
+    # print('Evaluate annotation type *{}*'.format(p.iouType))
+    # normalize the parameters: unique ids and ascending maxDets
+    p.imgIds = list(np.unique(p.imgIds))
+    if p.useCats:
+        p.catIds = list(np.unique(p.catIds))
+    p.maxDets = sorted(p.maxDets)
+    self.params = p
+    self._prepare()
+    # loop through images, area range, max detection number
+    catIds = p.catIds if p.useCats else [-1]
+    # NOTE(review): computeIoU stays unassigned for any other iouType
+    if p.iouType == 'segm' or p.iouType == 'bbox':
+        computeIoU = self.computeIoU
+    elif p.iouType == 'keypoints':
+        computeIoU = self.computeOks
+    self.ious = {
+        (imgId, catId): computeIoU(imgId, catId)
+        for imgId in p.imgIds
+        for catId in catIds}
+    evaluateImg = self.evaluateImg
+    # only the largest detection budget is evaluated here
+    maxDet = p.maxDets[-1]
+    # iteration order (category, area range, image) must match the reshape below
+    evalImgs = [
+        evaluateImg(imgId, catId, areaRng, maxDet)
+        for catId in catIds
+        for areaRng in p.areaRng
+        for imgId in p.imgIds
+    ]
+    # this is NOT in the pycocotools code, but could be done outside
+    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
+    self._paramsEval = copy.deepcopy(self.params)
+    # toc = time.time()
+    # print('DONE (t={:0.2f}s).'.format(toc-tic))
+    return p.imgIds, evalImgs
+#################################################################
+# end of straight copy from pycocotools, just removing the prints
+#################################################################

perception_models/apps/detection/DETA_pe/datasets/coco_panoptic.py ADDED Viewed

	@@ -0,0 +1,107 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+import json
+from pathlib import Path
+import numpy as np
+import torch
+from PIL import Image
+from panopticapi.utils import rgb2id
+from util.box_ops import masks_to_boxes
+from .coco import make_coco_transforms
+class CocoPanoptic:
+    """Panoptic dataset backed by a COCO-style JSON file and a folder of PNG label maps.
+
+    Each item is ``(img, target)`` where ``target`` holds ``image_id``, ``labels``,
+    ``boxes``, ``size``, ``orig_size`` and, depending on the data and flags,
+    ``masks``, ``iscrowd`` and ``area``.
+    """
+    def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
+        with open(ann_file, 'r') as f:
+            self.coco = json.load(f)
+        # sort 'images' field so that they are aligned with 'annotations'
+        # i.e., in alphabetical order
+        self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
+        # sanity check
+        if "annotations" in self.coco:
+            for img, ann in zip(self.coco['images'], self.coco['annotations']):
+                # compare file names with their last four characters dropped
+                assert img['file_name'][:-4] == ann['file_name'][:-4]
+        self.img_folder = img_folder
+        self.ann_folder = ann_folder
+        self.ann_file = ann_file
+        self.transforms = transforms
+        self.return_masks = return_masks
+    def __getitem__(self, idx):
+        # fall back to the image record when the file carries no annotations
+        ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
+        img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
+        ann_path = Path(self.ann_folder) / ann_info['file_name']
+        img = Image.open(img_path).convert('RGB')
+        w, h = img.size
+        if "segments_info" in ann_info:
+            masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
+            masks = rgb2id(masks)
+            ids = np.array([ann['id'] for ann in ann_info['segments_info']])
+            # one boolean mask per segment id via broadcasting
+            masks = masks == ids[:, None, None]
+            masks = torch.as_tensor(masks, dtype=torch.uint8)
+            labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
+        target = {}
+        target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
+        # NOTE(review): ``masks`` and ``labels`` are only assigned when
+        # "segments_info" is present; without it the lines below raise
+        # NameError -- confirm whether unannotated items are ever indexed.
+        if self.return_masks:
+            target['masks'] = masks
+        target['labels'] = labels
+        target["boxes"] = masks_to_boxes(masks)
+        target['size'] = torch.as_tensor([int(h), int(w)])
+        target['orig_size'] = torch.as_tensor([int(h), int(w)])
+        if "segments_info" in ann_info:
+            for name in ['iscrowd', 'area']:
+                target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
+        if self.transforms is not None:
+            img, target = self.transforms(img, target)
+        return img, target
+    def __len__(self):
+        # length follows the image list, not the annotation list
+        return len(self.coco['images'])
+    def get_height_and_width(self, idx):
+        """Return ``(height, width)`` as recorded in the JSON image entry."""
+        img_info = self.coco['images'][idx]
+        height = img_info['height']
+        width = img_info['width']
+        return height, width
+def build(image_set, args):
+    img_folder_root = Path(args.coco_path)
+    ann_folder_root = Path(args.coco_panoptic_path)
+    assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
+    assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
+    mode = 'panoptic'
+    PATHS = {
+        "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
+        "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    img_folder_path = img_folder_root / img_folder
+    ann_folder = ann_folder_root / f'{mode}_{img_folder}'
+    ann_file = ann_folder_root / ann_file
+    dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
+                           transforms=make_coco_transforms(image_set), return_masks=args.masks)
+    return dataset

perception_models/apps/detection/DETA_pe/datasets/data_prefetcher.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+import torch
+def to_cuda(samples, targets, device):
+    samples = samples.to(device, non_blocking=True)
+    targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets]
+    return samples, targets
+class data_prefetcher():
+    """Iterate over a loader, optionally copying the next batch to the device ahead of time.
+
+    With ``prefetch=True`` the next batch is moved inside a separate CUDA stream
+    while the current one is in use. With ``prefetch=False`` each call to
+    ``next`` fetches and moves a batch on the spot. In both modes ``next``
+    returns ``(None, None)`` once the loader is exhausted.
+    """
+    def __init__(self, loader, device, prefetch=True):
+        self.loader = iter(loader)
+        self.prefetch = prefetch
+        self.device = device
+        if prefetch:
+            # the side stream only exists in prefetch mode
+            self.stream = torch.cuda.Stream()
+            self.preload()
+    def preload(self):
+        """Fetch the next batch and start moving it to the device on the side stream."""
+        try:
+            self.next_samples, self.next_targets = next(self.loader)
+        except StopIteration:
+            # signal exhaustion to next()
+            self.next_samples = None
+            self.next_targets = None
+            return
+        # if record_stream() doesn't work, another option is to make sure device inputs are created
+        # on the main stream.
+        # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda')
+        # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda')
+        # Need to make sure the memory allocated for next_* is not still in use by the main stream
+        # at the time we start copying to next_*:
+        # self.stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self.stream):
+            self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device)
+            # more code for the alternative if record_stream() doesn't work:
+            # copy_ will record the use of the pinned source tensor in this side stream.
+            # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
+            # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
+            # self.next_input = self.next_input_gpu
+            # self.next_target = self.next_target_gpu
+            # With Amp, it isn't necessary to manually convert data to half.
+            # if args.fp16:
+            #     self.next_input = self.next_input.half()
+            # else:
+    def next(self):
+        """Return the next ``(samples, targets)`` pair, or ``(None, None)`` at the end."""
+        if self.prefetch:
+            # make the current stream wait until the side-stream copy has finished
+            torch.cuda.current_stream().wait_stream(self.stream)
+            samples = self.next_samples
+            targets = self.next_targets
+            # mark the tensors as used by the current stream
+            if samples is not None:
+                samples.record_stream(torch.cuda.current_stream())
+            if targets is not None:
+                for t in targets:
+                    for k, v in t.items():
+                        v.record_stream(torch.cuda.current_stream())
+            # immediately queue the following batch
+            self.preload()
+        else:
+            try:
+                samples, targets = next(self.loader)
+                samples, targets = to_cuda(samples, targets, self.device)
+            except StopIteration:
+                samples = None
+                targets = None
+        return samples, targets

perception_models/apps/detection/DETA_pe/datasets/objects365.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Objects365 dataset (COCO-format annotations) which returns image_id for evaluation.
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+from pathlib import Path
+import datasets.transforms as T
+import torch
+import torch.utils.data
+from pycocotools import mask as coco_mask
+from util.misc import get_local_rank, get_local_size
+from .coco import CocoDetection, make_coco_transforms, make_coco_transforms_lsj
+from .torchvision_datasets import CocoDetection as TvCocoDetection
+def build(image_set, args):
+    root = Path(args.coco_path)
+    assert root.exists(), f"provided Objects365 path {root} does not exist"
+    mode = "instances"
+    PATHS = {
+        "train": (
+            root / "train",
+            root / "annotations" / "zhiyuan_objv2_train_fixmiss.json",
+        ),
+        "val": (root / "val", root / "annotations" / "zhiyuan_objv2_val.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    if args.lsj:
+        coco_transform = make_coco_transforms_lsj(image_set, args.lsj_img_size)
+    else:
+        coco_transform = make_coco_transforms(image_set, args.bigger)
+    dataset = CocoDetection(
+        img_folder,
+        ann_file,
+        transforms=coco_transform,
+        return_masks=args.masks,
+        cache_mode=args.cache_mode,
+        local_rank=get_local_rank(),
+        local_size=get_local_size(),
+    )
+    return dataset

perception_models/apps/detection/DETA_pe/datasets/panoptic_eval.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+import json
+import os
+import util.misc as utils
+try:
+    from panopticapi.evaluation import pq_compute
+except ImportError:
+    pass
+class PanopticEvaluator(object):
+    def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
+        self.gt_json = ann_file
+        self.gt_folder = ann_folder
+        if utils.is_main_process():
+            if not os.path.exists(output_dir):
+                os.mkdir(output_dir)
+        self.output_dir = output_dir
+        self.predictions = []
+    def update(self, predictions):
+        for p in predictions:
+            with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
+                f.write(p.pop("png_string"))
+        self.predictions += predictions
+    def synchronize_between_processes(self):
+        all_predictions = utils.all_gather(self.predictions)
+        merged_predictions = []
+        for p in all_predictions:
+            merged_predictions += p
+        self.predictions = merged_predictions
+    def summarize(self):
+        if utils.is_main_process():
+            json_data = {"annotations": self.predictions}
+            predictions_json = os.path.join(self.output_dir, "predictions.json")
+            with open(predictions_json, "w") as f:
+                f.write(json.dumps(json_data))
+            return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
+        return None

perception_models/apps/detection/DETA_pe/datasets/samplers.py ADDED Viewed

	@@ -0,0 +1,348 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from codes in torch.utils.data.distributed
+# ------------------------------------------------------------------------
+import json
+import math
+import os
+from collections import defaultdict
+import torch
+import torch.distributed as dist
+from fvcore.common.timer import Timer
+from lvis import LVIS
+from torch.utils.data.sampler import Sampler
+def load_dataset_dicts(json_file):
+    timer = Timer()
+    lvis_api = LVIS(json_file)
+    if timer.seconds() > 1:
+        print("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+    img_ids = sorted(lvis_api.imgs.keys())
+    imgs = lvis_api.load_imgs(img_ids)
+    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+    imgs_anns = list(zip(imgs, anns))
+    print(
+        "Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file)
+    )
+    dataset_dicts = []
+    for img_dict, anno_dict_list in imgs_anns:
+        record = {}
+        image_id = record["image_id"] = img_dict["id"]
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+            assert anno["image_id"] == image_id
+            obj = {}
+            # Convert 1-indexed to 0-indexed
+            obj["category_id"] = anno["category_id"] - 1
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+    return dataset_dicts
+def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh, sqrt=True):
+    # 1. For each category c, compute the fraction of images that contain it: f(c)
+    category_freq = defaultdict(int)
+    for dataset_dict in dataset_dicts:  # For each image (without repeats)
+        cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+        for cat_id in cat_ids:
+            category_freq[cat_id] += 1
+    num_images = len(dataset_dicts)
+    for k, v in category_freq.items():
+        category_freq[k] = v / num_images
+    # 2. For each category c, compute the category-level repeat factor:
+    #    r(c) = max(1, sqrt(t / f(c)))
+    category_rep = {
+        cat_id: max(
+            1.0,
+            (
+                math.sqrt(repeat_thresh / cat_freq)
+                if sqrt
+                else (repeat_thresh / cat_freq)
+            ),
+        )
+        for cat_id, cat_freq in category_freq.items()
+    }
+    for cat_id in sorted(category_rep.keys()):
+        print(
+            f"Cat ID {cat_id}: freq={category_freq[cat_id]:.2f}, rep={category_rep[cat_id]:.2f}"
+        )
+    # 3. For each image I, compute the image-level repeat factor:
+    #    r(I) = max_{c in I} r(c)
+    rep_factors = []
+    for dataset_dict in dataset_dicts:
+        cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+        rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
+        rep_factors.append(rep_factor)
+    return torch.tensor(rep_factors, dtype=torch.float32)
+class RepeatFactorTrainingSampler(Sampler):
+    def __init__(
+        self,
+        dataset,
+        num_replicas=None,
+        rank=None,
+        local_rank=None,
+        local_size=None,
+        shuffle=True,
+    ):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+        self.total_size = self.num_samples * self.num_replicas
+        self.shuffle = shuffle
+        json_file = (
+            "/checkpoint/onevision/peizesun/public_data/d2_data/lvis/lvis_v1_train.json"
+        )
+        dataset_dicts = load_dataset_dicts(json_file)
+        repeat_factors = repeat_factors_from_category_frequency(
+            dataset_dicts, repeat_thresh=0.001
+        )
+        # Split into whole number (_int_part) and fractional (_frac_part) parts.
+        self._int_part = torch.trunc(repeat_factors)
+        self._frac_part = repeat_factors - self._int_part
+    def _get_epoch_indices(self, generator):
+        """
+        Create a list of dataset indices (with repeats) to use for one epoch.
+        Args:
+            generator (torch.Generator): pseudo random number generator used for
+                stochastic rounding.
+        Returns:
+            torch.Tensor: list of dataset indices to use in one epoch. Each index
+                is repeated based on its calculated repeat factor.
+        """
+        # Since repeat factors are fractional, we use stochastic rounding so
+        # that the target repeat factor is achieved in expectation over the
+        # course of training
+        rands = torch.rand(len(self._frac_part), generator=generator)
+        rep_factors = self._int_part + (rands < self._frac_part).float()
+        # Construct a list of indices in which we repeat images as specified
+        indices = []
+        for dataset_index, rep_factor in enumerate(rep_factors):
+            indices.extend([dataset_index] * int(rep_factor.item()))
+        return torch.tensor(indices, dtype=torch.int64)
+    def __iter__(self):
+        if self.shuffle:
+            g = torch.Generator()
+            g.manual_seed(self.epoch)
+            # Sample indices with repeats determined by stochastic rounding; each
+            # "epoch" may have a slightly different size due to the rounding.
+            rfs_indices = self._get_epoch_indices(g)
+            # deterministically shuffle based on epoch
+            randperm = torch.randperm(len(rfs_indices), generator=g)
+            indices = rfs_indices[randperm].tolist()
+        else:
+            g = torch.Generator()
+            g.manual_seed(0)
+            # Sample indices with repeats determined by stochastic rounding; each
+            # "epoch" may have a slightly different size due to the rounding.
+            rfs_indices = self._get_epoch_indices(g)
+            indices = rfs_indices.tolist()
+        # add extra samples to make it evenly divisible
+        if self.total_size > len(indices):
+            indices += indices[: (self.total_size - len(indices))]
+            assert len(indices) == self.total_size
+            # subsample
+            offset = self.num_samples * self.rank
+            indices = indices[offset : offset + self.num_samples]
+            assert len(indices) == self.num_samples
+            return iter(indices)
+        else:
+            self.num_samples = int(math.ceil(len(indices) * 1.0 / self.num_replicas))
+            self.total_size = self.num_samples * self.num_replicas
+            indices += indices[: (self.total_size - len(indices))]
+            assert len(indices) == self.total_size
+            # subsample
+            offset = self.num_samples * self.rank
+            indices = indices[offset : offset + self.num_samples]
+            assert len(indices) == self.num_samples
+            return iter(indices)
+    def __len__(self):
+        return self.num_samples
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+class DistributedSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Arguments:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+    """
+    def __init__(
+        self,
+        dataset,
+        num_replicas=None,
+        rank=None,
+        local_rank=None,
+        local_size=None,
+        shuffle=True,
+    ):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+        self.total_size = self.num_samples * self.num_replicas
+        self.shuffle = shuffle
+    def __iter__(self):
+        if self.shuffle:
+            # deterministically shuffle based on epoch
+            g = torch.Generator()
+            g.manual_seed(self.epoch)
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()
+        else:
+            indices = torch.arange(len(self.dataset)).tolist()
+        # add extra samples to make it evenly divisible
+        indices += indices[: (self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset : offset + self.num_samples]
+        assert len(indices) == self.num_samples
+        return iter(indices)
+    def __len__(self):
+        return self.num_samples
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+class NodeDistributedSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Arguments:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+    """
+    def __init__(
+        self,
+        dataset,
+        num_replicas=None,
+        rank=None,
+        local_rank=None,
+        local_size=None,
+        shuffle=True,
+    ):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        if local_rank is None:
+            local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        if local_size is None:
+            local_size = int(os.environ.get("LOCAL_SIZE", 1))
+        self.dataset = dataset
+        self.shuffle = shuffle
+        self.num_replicas = num_replicas
+        self.num_parts = local_size
+        self.rank = rank
+        self.local_rank = local_rank
+        self.epoch = 0
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+        self.total_size = self.num_samples * self.num_replicas
+        self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts
+    def __iter__(self):
+        if self.shuffle:
+            # deterministically shuffle based on epoch
+            g = torch.Generator()
+            g.manual_seed(self.epoch)
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()
+        else:
+            indices = torch.arange(len(self.dataset)).tolist()
+        indices = [i for i in indices if i % self.num_parts == self.local_rank]
+        # add extra samples to make it evenly divisible
+        indices += indices[: (self.total_size_parts - len(indices))]
+        assert len(indices) == self.total_size_parts
+        # subsample
+        indices = indices[
+            self.rank
+            // self.num_parts : self.total_size_parts : self.num_replicas
+            // self.num_parts
+        ]
+        assert len(indices) == self.num_samples
+        return iter(indices)
+    def __len__(self):
+        return self.num_samples
+    def set_epoch(self, epoch):
+        self.epoch = epoch

perception_models/apps/detection/DETA_pe/datasets/torchvision_datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+from .coco import CocoDetection

perception_models/apps/detection/DETA_pe/datasets/torchvision_datasets/coco.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from torchvision
+# ------------------------------------------------------------------------
+"""
+Copy-paste from torchvision, with added support for caching images in memory
+"""
+from torchvision.datasets.vision import VisionDataset
+from PIL import Image
+import os
+import os.path
+import tqdm
+from io import BytesIO
+class CocoDetection(VisionDataset):
+    """`MS Coco Detection <http://mscoco.org/dataset/#detections-challenge2016>`_ Dataset.
+    Args:
+        root (string): Root directory where images are downloaded to.
+        annFile (string): Path to json annotation file.
+        transform (callable, optional): A function/transform that  takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.ToTensor``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        transforms (callable, optional): A function/transform that takes input sample and its target as entry
+            and returns a transformed version.
+    """
+    def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None,
+                 cache_mode=False, local_rank=0, local_size=1):
+        super(CocoDetection, self).__init__(root, transforms, transform, target_transform)
+        from pycocotools.coco import COCO
+        self.coco = COCO(annFile)
+        self.ids = list(sorted(self.coco.imgs.keys()))
+        self.cache_mode = cache_mode
+        self.local_rank = local_rank
+        self.local_size = local_size
+        if cache_mode:
+            self.cache = {}
+            self.cache_images()
+    def cache_images(self):
+        self.cache = {}
+        for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids):
+            if index % self.local_size != self.local_rank:
+                continue
+            path = self.coco.loadImgs(img_id)[0]['file_name']
+            with open(os.path.join(self.root, path), 'rb') as f:
+                self.cache[path] = f.read()
+    def get_image(self, path):
+        if self.cache_mode:
+            if path not in self.cache.keys():
+                with open(os.path.join(self.root, path), 'rb') as f:
+                    self.cache[path] = f.read()
+            return Image.open(BytesIO(self.cache[path])).convert('RGB')
+        return Image.open(os.path.join(self.root, path)).convert('RGB')
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``.
+        """
+        coco = self.coco
+        img_id = self.ids[index]
+        ann_ids = coco.getAnnIds(imgIds=img_id)
+        target = coco.loadAnns(ann_ids)
+        path = coco.loadImgs(img_id)[0]['file_name']
+        img = self.get_image(path)
+        if self.transforms is not None:
+            img, target = self.transforms(img, target)
+        return img, target
+    def __len__(self):
+        return len(self.ids)

perception_models/apps/detection/DETA_pe/datasets/transforms.py ADDED Viewed

	@@ -0,0 +1,327 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+import random
+import PIL
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+from util.box_ops import box_xyxy_to_cxcywh
+from util.misc import interpolate
+def crop(image, target, region):
+    cropped_image = F.crop(image, *region)
+    target = target.copy()
+    i, j, h, w = region
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor([h, w])
+    fields = ["labels", "area", "iscrowd"]
+    if "boxes" in target:
+        boxes = target["boxes"]
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
+        target["boxes"] = cropped_boxes.reshape(-1, 4)
+        target["area"] = area
+        fields.append("boxes")
+    if "masks" in target:
+        # FIXME should we update the area here if there are no boxes?
+        target["masks"] = target["masks"][:, i : i + h, j : j + w]
+        fields.append("masks")
+    # remove elements for which the boxes or masks that have zero area
+    if "boxes" in target or "masks" in target:
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        if "boxes" in target:
+            cropped_boxes = target["boxes"].reshape(-1, 2, 2)
+            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+        else:
+            keep = target["masks"].flatten(1).any(1)
+        for field in fields:
+            target[field] = target[field][keep]
+    return cropped_image, target
+def hflip(image, target):
+    flipped_image = F.hflip(image)
+    w, h = image.size
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor(
+            [-1, 1, -1, 1]
+        ) + torch.as_tensor([w, 0, w, 0])
+        target["boxes"] = boxes
+    if "masks" in target:
+        target["masks"] = target["masks"].flip(-1)
+    return flipped_image, target
+def resize(image, target, size, max_size=None):
+    # size can be min_size (scalar) or (w, h) tuple
+    def get_size_with_aspect_ratio(image_size, size, max_size=None):
+        w, h = image_size
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(round(max_size * min_original_size / max_original_size))
+        if (w <= h and w == size) or (h <= w and h == size):
+            return (h, w)
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+        return (oh, ow)
+    def get_size(image_size, size, max_size=None):
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        else:
+            return get_size_with_aspect_ratio(image_size, size, max_size)
+    size = get_size(image.size, size, max_size)
+    rescaled_image = F.resize(image, size)
+    if target is None:
+        return rescaled_image, None
+    ratios = tuple(
+        float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)
+    )
+    ratio_width, ratio_height = ratios
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        scaled_boxes = boxes * torch.as_tensor(
+            [ratio_width, ratio_height, ratio_width, ratio_height]
+        )
+        target["boxes"] = scaled_boxes
+    if "area" in target:
+        area = target["area"]
+        scaled_area = area * (ratio_width * ratio_height)
+        target["area"] = scaled_area
+    h, w = size
+    target["size"] = torch.tensor([h, w])
+    if "masks" in target:
+        target["masks"] = (
+            interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0]
+            > 0.5
+        )
+    return rescaled_image, target
+def pad(image, target, padding):
+    # assumes that we only pad on the bottom right corners
+    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
+    if target is None:
+        return padded_image, None
+    target = target.copy()
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor(padded_image[::-1])
+    if "masks" in target:
+        target["masks"] = torch.nn.functional.pad(
+            target["masks"], (0, padding[0], 0, padding[1])
+        )
+    return padded_image, target
+class RandomCrop(object):
+    def __init__(self, size):
+        self.size = size
+    def __call__(self, img, target):
+        region = T.RandomCrop.get_params(img, self.size)
+        return crop(img, target, region)
+class RandomSizeCrop(object):
+    def __init__(self, min_size: int, max_size: int):
+        self.min_size = min_size
+        self.max_size = max_size
+    def __call__(self, img: PIL.Image.Image, target: dict):
+        w = random.randint(self.min_size, min(img.width, self.max_size))
+        h = random.randint(self.min_size, min(img.height, self.max_size))
+        region = T.RandomCrop.get_params(img, [h, w])
+        return crop(img, target, region)
+class CenterCrop(object):
+    def __init__(self, size):
+        self.size = size
+    def __call__(self, img, target):
+        image_width, image_height = img.size
+        crop_height, crop_width = self.size
+        crop_top = int(round((image_height - crop_height) / 2.0))
+        crop_left = int(round((image_width - crop_width) / 2.0))
+        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
+class RandomHorizontalFlip(object):
+    def __init__(self, p=0.5):
+        self.p = p
+    def __call__(self, img, target):
+        if random.random() < self.p:
+            return hflip(img, target)
+        return img, target
+class RandomResize(object):
+    def __init__(self, sizes, max_size=None):
+        assert isinstance(sizes, (list, tuple))
+        self.sizes = sizes
+        self.max_size = max_size
+    def __call__(self, img, target=None):
+        size = random.choice(self.sizes)
+        return resize(img, target, size, self.max_size)
+class RandomPad(object):
+    def __init__(self, max_pad):
+        self.max_pad = max_pad
+    def __call__(self, img, target):
+        pad_x = random.randint(0, self.max_pad)
+        pad_y = random.randint(0, self.max_pad)
+        return pad(img, target, (pad_x, pad_y))
+class RandomSelect(object):
+    """
+    Randomly selects between transforms1 and transforms2,
+    with probability p for transforms1 and (1 - p) for transforms2
+    """
+    def __init__(self, transforms1, transforms2, p=0.5):
+        self.transforms1 = transforms1
+        self.transforms2 = transforms2
+        self.p = p
+    def __call__(self, img, target):
+        if random.random() < self.p:
+            return self.transforms1(img, target)
+        return self.transforms2(img, target)
+class ToTensor(object):
+    def __call__(self, img, target):
+        return F.to_tensor(img), target
+class RandomErasingP05(object):
+    def __init__(self):
+        self.eraser = T.Compose(
+            [
+                T.ToTensor(),
+                T.RandomErasing(
+                    p=0.5, scale=(0.02, 0.2), ratio=(0.1, 6), value="random"
+                ),
+                T.ToPILImage(),
+            ]
+        )
+    def __call__(self, img, target):
+        return self.eraser(img), target
+class RandomErasing(object):
+    def __init__(self, *args, **kwargs):
+        self.eraser = T.RandomErasing(*args, **kwargs)
+    def __call__(self, img, target):
+        return self.eraser(img), target
+class ColorJitter(object):
+    def __init__(self, jitter=(0.2, 0.2, 0.2, 0.1), p=0.5):
+        self.color_jitter = T.ColorJitter(*jitter)
+        self.p = p
+    def __call__(self, img, target):
+        if random.random() < self.p:
+            return self.color_jitter(img), target
+        return img, target
+class RandomGrayscale(object):
+    def __init__(self, p=0.5):
+        self.random_gray = T.RandomGrayscale(p=p)
+    def __call__(self, img, target):
+        return self.random_gray(img), target
+class Normalize(object):
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+    def __call__(self, image, target=None):
+        image = F.normalize(image, mean=self.mean, std=self.std)
+        if target is None:
+            return image, None
+        target = target.copy()
+        h, w = image.shape[-2:]
+        if "boxes" in target:
+            boxes = target["boxes"]
+            boxes = box_xyxy_to_cxcywh(boxes)
+            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
+            target["boxes"] = boxes
+        return image, target
+class Compose(object):
+    def __init__(self, transforms):
+        self.transforms = transforms
+    def __call__(self, image, target):
+        for t in self.transforms:
+            image, target = t(image, target)
+        return image, target
+    def __repr__(self):
+        format_string = self.__class__.__name__ + "("
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += "    {0}".format(t)
+        format_string += "\n)"
+        return format_string

perception_models/apps/detection/DETA_pe/engine.py ADDED Viewed

	@@ -0,0 +1,303 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Train and eval functions used in main.py
+"""
+import math
+import os
+import sys
+from typing import Iterable
+import torch
+import util.misc as utils
+from datasets.coco_eval import CocoEvaluator, convert_to_xywh
+from datasets.data_prefetcher import data_prefetcher
+from datasets.panoptic_eval import PanopticEvaluator
+from util.ema import requires_grad, update_ema
+from util.misc import NestedTensor
+def train_one_epoch(
+    model: torch.nn.Module,
+    criterion: torch.nn.Module,
+    data_loader: Iterable,
+    optimizer: torch.optim.Optimizer,
+    device: torch.device,
+    epoch: int,
+    max_norm: float = 0,
+    ema: torch.nn.Module = None,
+    ema_decay: float = 0.999,
+):
+    model.train()
+    criterion.train()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
+    metric_logger.add_meter(
+        "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}")
+    )
+    metric_logger.add_meter(
+        "grad_norm", utils.SmoothedValue(window_size=1, fmt="{value:.2f}")
+    )
+    header = "Epoch: [{}]".format(epoch)
+    print_freq = 10
+    prefetcher = data_prefetcher(data_loader, device, prefetch=True)
+    samples, targets = prefetcher.next()
+    # for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
+    for _ in metric_logger.log_every(range(len(data_loader)), print_freq, header):
+        outputs = model(samples)
+        loss_dict = criterion(outputs, targets)
+        weight_dict = criterion.weight_dict
+        losses = sum(
+            loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict
+        )
+        # reduce losses over all GPUs for logging purposes
+        loss_dict_reduced = utils.reduce_dict(loss_dict)
+        loss_dict_reduced_unscaled = {
+            f"{k}_unscaled": v for k, v in loss_dict_reduced.items()
+        }
+        loss_dict_reduced_scaled = {
+            k: v * weight_dict[k]
+            for k, v in loss_dict_reduced.items()
+            if k in weight_dict
+        }
+        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
+        loss_value = losses_reduced_scaled.item()
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            print(loss_dict_reduced)
+            sys.exit(1)
+        optimizer.zero_grad()
+        losses.backward()
+        if max_norm > 0:
+            grad_total_norm = torch.nn.utils.clip_grad_norm_(
+                model.parameters(), max_norm
+            )
+        else:
+            grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm)
+        optimizer.step()
+        if ema is not None:
+            update_ema(ema, model.module, ema_decay)
+        # torch.cuda.empty_cache()
+        metric_logger.update(
+            loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled
+        )
+        metric_logger.update(class_error=loss_dict_reduced["class_error"])
+        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+        metric_logger.update(grad_norm=grad_total_norm)
+        samples, targets = prefetcher.next()
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+@torch.no_grad()
+def evaluate(
+    model_no_ema,
+    criterion,
+    postprocessors,
+    data_loader,
+    base_ds,
+    device,
+    output_dir,
+    test_hflip_aug,
+    tta,
+    soft_nms,
+    ema=None,
+    save_result=False,
+    save_result_dir="",
+    soft_nms_method="quad",
+    nms_thresh=0.7,
+    quad_scale=0.5,
+    lsj_img_size=1824,
+):
+    model = model_no_ema if ema is None else ema
+    model.eval()
+    criterion.eval()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter(
+        "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}")
+    )
+    header = "Test:"
+    iou_types = tuple(k for k in ("segm", "bbox") if k in postprocessors.keys())
+    coco_evaluator = CocoEvaluator(base_ds, iou_types)
+    # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75]
+    panoptic_evaluator = None
+    if "panoptic" in postprocessors.keys():
+        panoptic_evaluator = PanopticEvaluator(
+            data_loader.dataset.ann_file,
+            data_loader.dataset.ann_folder,
+            output_dir=os.path.join(output_dir, "panoptic_eval"),
+        )
+    prediction_list = []
+    for samples, targets in metric_logger.log_every(data_loader, 10, header):
+        samples = samples.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+        if test_hflip_aug:
+            assert (
+                samples.tensors.shape[0] == 1
+            ), "test_hflip_aug only supports batch size 1"
+            assert (
+                samples.tensors.shape[1] == 6
+            ), "test_hflip_aug requires two images in a batch"
+            first_samples = NestedTensor(samples.tensors[:, :3], samples.mask)
+            outputs = model(first_samples)
+            flipped_samples = NestedTensor(samples.tensors[:, 3:], samples.mask)
+            flipped_outputs = model(flipped_samples)
+        else:
+            outputs = model(samples)
+        loss_dict = criterion(outputs, targets)
+        weight_dict = criterion.weight_dict
+        # reduce losses over all GPUs for logging purposes
+        loss_dict_reduced = utils.reduce_dict(loss_dict)
+        loss_dict_reduced_scaled = {
+            k: v * weight_dict[k]
+            for k, v in loss_dict_reduced.items()
+            if k in weight_dict
+        }
+        loss_dict_reduced_unscaled = {
+            f"{k}_unscaled": v for k, v in loss_dict_reduced.items()
+        }
+        metric_logger.update(
+            loss=sum(loss_dict_reduced_scaled.values()),
+            **loss_dict_reduced_scaled,
+            **loss_dict_reduced_unscaled,
+        )
+        metric_logger.update(class_error=loss_dict_reduced["class_error"])
+        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
+        if test_hflip_aug:
+            new_outputs = {}
+            pred_logits = outputs["pred_logits"]
+            pred_boxes = outputs["pred_boxes"]
+            flipped_pred_logits = flipped_outputs["pred_logits"]
+            flipped_pred_boxes = flipped_outputs["pred_boxes"]
+            reflipped_pred_boxes = flipped_pred_boxes[
+                :, :, [0, 1, 2, 3]
+            ] * torch.as_tensor([-1, 1, 1, 1]).to(
+                flipped_pred_boxes.device
+            ) + torch.as_tensor(
+                [1, 0, 0, 0]
+            ).to(
+                flipped_pred_boxes.device
+            )
+            new_pred_logits = torch.cat([pred_logits, flipped_pred_logits], dim=1)
+            new_pred_boxes = torch.cat([pred_boxes, reflipped_pred_boxes], dim=1)
+            new_outputs["pred_logits"] = new_pred_logits
+            new_outputs["pred_boxes"] = new_pred_boxes
+            results = postprocessors["bbox"](
+                new_outputs,
+                orig_target_sizes,
+                soft_nms=soft_nms,
+                method=soft_nms_method,
+                nms_thresh=nms_thresh,
+                quad_scale=quad_scale,
+            )
+        else:
+            results = postprocessors["bbox"](
+                outputs,
+                orig_target_sizes,
+                soft_nms=soft_nms,
+                method=soft_nms_method,
+                nms_thresh=nms_thresh,
+                quad_scale=quad_scale,
+            )
+        if "segm" in postprocessors.keys():
+            target_sizes = torch.stack([t["size"] for t in targets], dim=0)
+            results = postprocessors["segm"](
+                results, outputs, orig_target_sizes, target_sizes
+            )
+        res = {
+            target["image_id"].item(): output
+            for target, output in zip(targets, results)
+        }
+        if coco_evaluator is not None:
+            coco_evaluator.update(res)
+        if panoptic_evaluator is not None:
+            res_pano = postprocessors["panoptic"](
+                outputs, target_sizes, orig_target_sizes
+            )
+            for i, target in enumerate(targets):
+                image_id = target["image_id"].item()
+                file_name = f"{image_id:012d}.png"
+                res_pano[i]["image_id"] = image_id
+                res_pano[i]["file_name"] = file_name
+            panoptic_evaluator.update(res_pano)
+        for target, output in zip(targets, results):
+            res_cpu = {
+                target["image_id"].item(): {
+                    "boxes": output["boxes"].cpu(),
+                    "labels": output["labels"].cpu(),
+                    "scores": output["scores"].cpu(),
+                }
+            }
+            prediction_list.append(res_cpu)
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    if save_result:
+        from torch import distributed as dist
+        os.makedirs(save_result_dir, exist_ok=True)
+        rank = dist.get_rank()
+        torch.save(
+            prediction_list,
+            os.path.join(save_result_dir, f"val2017_prediction_{rank}.pth"),
+        )
+    if coco_evaluator is not None:
+        coco_evaluator.synchronize_between_processes()
+    if panoptic_evaluator is not None:
+        panoptic_evaluator.synchronize_between_processes()
+    # accumulate predictions from all images
+    if coco_evaluator is not None:
+        coco_evaluator.accumulate()
+        coco_evaluator.summarize()
+    panoptic_res = None
+    if panoptic_evaluator is not None:
+        panoptic_res = panoptic_evaluator.summarize()
+    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+    if coco_evaluator is not None:
+        if "bbox" in postprocessors.keys():
+            stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist()
+        if "segm" in postprocessors.keys():
+            stats["coco_eval_masks"] = coco_evaluator.coco_eval["segm"].stats.tolist()
+    if panoptic_res is not None:
+        stats["PQ_all"] = panoptic_res["All"]
+        stats["PQ_th"] = panoptic_res["Things"]
+        stats["PQ_st"] = panoptic_res["Stuff"]
+    return stats, coco_evaluator

perception_models/apps/detection/DETA_pe/engine_tta.py ADDED Viewed

	@@ -0,0 +1,239 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Train and eval functions used in main.py
+"""
+import math
+import os
+import sys
+from typing import Iterable
+import torch
+import util.misc as utils
+from datasets.coco_eval import CocoEvaluator, convert_to_xywh
+from datasets.data_prefetcher import data_prefetcher
+from datasets.panoptic_eval import PanopticEvaluator
+from models.utils_softnms import batched_soft_nms
+from util.misc import NestedTensor
+# Make sure this is consistent with datasets/coco.py
+# TODO: make it configurable
+SCALE_RANGES_DICT = {
+    1728: [[0, 10000], [32, 10000], [32, 10000],],
+    1824: [[0, 10000], [0, 10000], [64, 10000], [64, 10000],],
+}
+def filter_boxes(boxes, min_scale, max_scale):
+    """
+    boxes: (N, 4) shape
+    """
+    w = boxes[:, 2] - boxes[:, 0]
+    h = boxes[:, 3] - boxes[:, 1]
+    keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
+    return keep
+@torch.no_grad()
+def evaluate_tta(
+    model_no_ema,
+    criterion,
+    postprocessors,
+    data_loader,
+    base_ds,
+    device,
+    output_dir,
+    test_hflip_aug,
+    tta,
+    soft_nms,
+    ema=None,
+    save_result=False,
+    save_result_dir="",
+    soft_nms_method="quad",
+    nms_thresh=0.7,
+    quad_scale=0.5,
+    lsj_img_size=1824,
+):
+    model = model_no_ema if ema is None else ema
+    model.eval()
+    criterion.eval()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter(
+        "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}")
+    )
+    header = "Test:"
+    iou_types = tuple(k for k in ("segm", "bbox") if k in postprocessors.keys())
+    coco_evaluator = CocoEvaluator(base_ds, iou_types)
+    # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75]
+    SCALE_RANGES = SCALE_RANGES_DICT[lsj_img_size]
+    IMAGE_SIZE = [lsj_img_size for _ in range(len(SCALE_RANGES))]
+    prediction_list = []
+    for samples, targets in metric_logger.log_every(data_loader, 10, header):
+        samples = samples.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
+        metric_logger.update(loss=0, class_error=0, loss_bbox=0, loss_ce=0)
+        ########################### Begin of inference_one_image ###########################
+        if tta:
+            assert samples.tensors.shape[0] == 1, "tta only supports batch size 1"
+            assert (
+                samples.tensors.shape[1] % 3 == 0
+            ), "tta requires dimensions of samples.tensors to be divisible by 3"
+            all_boxes = []
+            all_scores = []
+            all_classes = []
+            num_scales = samples.tensors.shape[1] // 3
+            for scale_ind in range(num_scales):
+                first_samples = NestedTensor(
+                    samples.tensors[
+                        :,
+                        scale_ind * 3 : (scale_ind + 1) * 3,
+                        : IMAGE_SIZE[scale_ind // 2],
+                        : IMAGE_SIZE[scale_ind // 2],
+                    ],
+                    samples.mask[
+                        :,
+                        scale_ind,
+                        : IMAGE_SIZE[scale_ind // 2],
+                        : IMAGE_SIZE[scale_ind // 2],
+                    ],
+                )
+                if scale_ind % 2 == 0:
+                    ######## no flip #######
+                    outputs = model(first_samples)
+                    noaug_results = postprocessors["bbox"](
+                        outputs,
+                        orig_target_sizes,
+                        soft_nms=soft_nms,
+                        method=soft_nms_method,
+                        nms_thresh=nms_thresh,
+                        quad_scale=quad_scale,
+                    )
+                    keep = filter_boxes(
+                        noaug_results[0]["boxes"], *SCALE_RANGES[scale_ind // 2]
+                    )
+                    all_boxes.append(noaug_results[0]["boxes"][keep])
+                    all_scores.append(noaug_results[0]["scores"][keep])
+                    all_classes.append(noaug_results[0]["labels"][keep])
+                else:
+                    ######## flipped #######
+                    flipped_outputs = model(first_samples)
+                    flipped_pred_logits = flipped_outputs["pred_logits"]
+                    flipped_pred_boxes = flipped_outputs["pred_boxes"]
+                    reflipped_pred_boxes = flipped_pred_boxes[
+                        :, :, [0, 1, 2, 3]
+                    ] * torch.as_tensor([-1, 1, 1, 1]).to(
+                        flipped_pred_boxes.device
+                    ) + torch.as_tensor(
+                        [1, 0, 0, 0]
+                    ).to(
+                        flipped_pred_boxes.device
+                    )
+                    new_outputs = {}
+                    new_outputs["pred_logits"] = flipped_pred_logits
+                    new_outputs["pred_boxes"] = reflipped_pred_boxes
+                    new_results = postprocessors["bbox"](
+                        new_outputs,
+                        orig_target_sizes,
+                        soft_nms=soft_nms,
+                        method=soft_nms_method,
+                        nms_thresh=nms_thresh,
+                        quad_scale=quad_scale,
+                    )
+                    keep = filter_boxes(
+                        new_results[0]["boxes"], *SCALE_RANGES[scale_ind // 2]
+                    )
+                    all_boxes.append(new_results[0]["boxes"][keep])
+                    all_scores.append(new_results[0]["scores"][keep])
+                    all_classes.append(new_results[0]["labels"][keep])
+            ######## merge #######
+            all_boxes = torch.cat(all_boxes, dim=0)
+            all_scores = torch.cat(all_scores, dim=0)
+            all_classes = torch.cat(all_classes, dim=0)
+            keep_inds, updated_scores = batched_soft_nms(
+                all_boxes,
+                all_scores,
+                all_classes,
+                method=soft_nms_method,
+                threshold=nms_thresh,
+                quad_scale=quad_scale,
+            )
+            merged_scores = updated_scores
+            merged_classes = all_classes[keep_inds]
+            merged_boxes = all_boxes[keep_inds]
+            results = [
+                {
+                    "boxes": merged_boxes,
+                    "scores": merged_scores,
+                    "labels": merged_classes,
+                }
+            ]
+        else:
+            outputs = model(samples)
+            results = postprocessors["bbox"](outputs, orig_target_sizes)
+        ########################### End of inference_one_image ###########################
+        res = {
+            target["image_id"].item(): output
+            for target, output in zip(targets, results)
+        }
+        if coco_evaluator is not None:
+            coco_evaluator.update(res)
+        for target, output in zip(targets, results):
+            res_cpu = {
+                target["image_id"].item(): {
+                    "boxes": output["boxes"].cpu(),
+                    "labels": output["labels"].cpu(),
+                    "scores": output["scores"].cpu(),
+                }
+            }
+            prediction_list.append(res_cpu)
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    if save_result:
+        from torch import distributed as dist
+        os.makedirs(save_result_dir, exist_ok=True)
+        rank = dist.get_rank()
+        torch.save(
+            prediction_list,
+            os.path.join(save_result_dir, f"val2017_prediction_{rank}.pth"),
+        )
+    if coco_evaluator is not None:
+        coco_evaluator.synchronize_between_processes()
+    # accumulate predictions from all images
+    if coco_evaluator is not None:
+        coco_evaluator.accumulate()
+        coco_evaluator.summarize()
+    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+    if coco_evaluator is not None:
+        if "bbox" in postprocessors.keys():
+            stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist()
+    return stats, coco_evaluator

perception_models/apps/detection/DETA_pe/main.py ADDED Viewed

	@@ -0,0 +1,754 @@

+# Modified from
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+import argparse
+import datetime
+import json
+import os
+import random
+import time
+from copy import deepcopy
+from pathlib import Path
+import datasets
+import datasets.samplers as samplers
+import numpy as np
+import torch
+import util.misc as utils
+from datasets import build_dataset, get_coco_api_from_dataset
+from engine import evaluate, train_one_epoch
+from engine_tta import evaluate_tta
+from models import build_model
+from torch.utils.data import DataLoader
+from util.ema import requires_grad, update_ema
+def get_args_parser():
+    parser = argparse.ArgumentParser("Deformable DETR Detector", add_help=False)
+    parser.add_argument("--lr", default=2e-4, type=float)
+    parser.add_argument(
+        "--lr_backbone_names", default=["backbone.0"], type=str, nargs="+"
+    )
+    parser.add_argument("--lr_backbone", default=2e-5, type=float)
+    parser.add_argument(
+        "--lr_linear_proj_names",
+        default=["reference_points", "sampling_offsets"],
+        type=str,
+        nargs="+",
+    )
+    parser.add_argument("--lr_linear_proj_mult", default=0.1, type=float)
+    parser.add_argument("--batch_size", default=2, type=int)
+    parser.add_argument("--weight_decay", default=1e-4, type=float)
+    parser.add_argument("--epochs", default=50, type=int)
+    parser.add_argument("--eval_per_epochs", default=1, type=int)
+    parser.add_argument("--save_per_epochs", default=1, type=int)
+    parser.add_argument("--lr_drop", default=40, type=int)
+    parser.add_argument("--lr_drop_epochs", default=None, type=int, nargs="+")
+    parser.add_argument(
+        "--clip_max_norm", default=0.1, type=float, help="gradient clipping max norm"
+    )
+    parser.add_argument("--sgd", action="store_true")
+    parser.add_argument("--ema", action="store_true")
+    parser.add_argument("--ema_decay", default=0.999, type=float)
+    # Variants of Deformable DETR
+    parser.add_argument("--with_box_refine", default=False, action="store_true")
+    parser.add_argument("--two_stage", default=False, action="store_true")
+    # Model parameters
+    parser.add_argument(
+        "--frozen_weights",
+        type=str,
+        default=None,
+        help="Path to the pretrained model. If set, only the mask head will be trained",
+    )
+    # * Backbone
+    parser.add_argument(
+        "--backbone",
+        default="resnet50",
+        type=str,
+        help="Name of the convolutional backbone to use",
+    )
+    parser.add_argument(
+        "--backbone_size",
+        default="Gwin384",
+        type=str,
+        help="backbone size",
+    )
+    parser.add_argument(
+        "--backbone_path",
+        default="",
+        type=str,
+    )
+    parser.add_argument(
+        "--backbone_lrd",
+        default=1.0,
+        type=float,
+    )
+    parser.add_argument(
+        "--backbone_layers",
+        default=12,
+        type=int,
+    )
+    parser.add_argument(
+        "--backbone_init_values",
+        default=0.0,
+        type=float,
+    )
+    parser.add_argument(
+        "--backbone_tile_posemb",
+        default=False,
+        type=bool,
+    )
+    parser.add_argument(
+        "--backbone_use_act_checkpoint",
+        action="store_true",
+        help="If true, we use act_checkpoint in backbone",
+    )
+    parser.add_argument(
+        "--backbone_act_checkpoint_ratio",
+        default=1.0,
+        type=float,
+    )
+    parser.add_argument(
+        "--backbone_tta_rope",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--backbone_multi_layer",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--backbone_win_aug",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--backbone_dp",
+        default=-1.0,
+        type=float,
+    )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--dilation",
+        action="store_true",
+        help="If true, we replace stride with dilation in the last convolutional block (DC5)",
+    )
+    parser.add_argument(
+        "--position_embedding",
+        default="sine",
+        type=str,
+        choices=("sine", "learned"),
+        help="Type of positional embedding to use on top of the image features",
+    )
+    parser.add_argument(
+        "--position_embedding_scale",
+        default=2 * np.pi,
+        type=float,
+        help="position / size * scale",
+    )
+    parser.add_argument(
+        "--num_feature_levels", default=4, type=int, help="number of feature levels"
+    )
+    # * Transformer
+    parser.add_argument(
+        "--enc_layers",
+        default=6,
+        type=int,
+        help="Number of encoding layers in the transformer",
+    )
+    parser.add_argument(
+        "--dec_layers",
+        default=6,
+        type=int,
+        help="Number of decoding layers in the transformer",
+    )
+    parser.add_argument(
+        "--dim_feedforward",
+        default=1024,
+        type=int,
+        help="Intermediate size of the feedforward layers in the transformer blocks",
+    )
+    parser.add_argument(
+        "--hidden_dim",
+        default=256,
+        type=int,
+        help="Size of the embeddings (dimension of the transformer)",
+    )
+    parser.add_argument(
+        "--dropout", default=0.1, type=float, help="Dropout applied in the transformer"
+    )
+    parser.add_argument(
+        "--nheads",
+        default=8,
+        type=int,
+        help="Number of attention heads inside the transformer's attentions",
+    )
+    parser.add_argument(
+        "--num_queries", default=300, type=int, help="Number of query slots"
+    )
+    parser.add_argument("--dec_n_points", default=4, type=int)
+    parser.add_argument("--enc_n_points", default=4, type=int)
+    # * Segmentation
+    parser.add_argument(
+        "--masks",
+        action="store_true",
+        help="Train segmentation head if the flag is provided",
+    )
+    # Loss
+    parser.add_argument(
+        "--no_aux_loss",
+        dest="aux_loss",
+        action="store_false",
+        help="Disables auxiliary decoding losses (loss at each layer)",
+    )
+    parser.add_argument("--use_fed_loss", action="store_true")
+    # * Matcher
+    parser.add_argument("--assign_first_stage", action="store_true")
+    parser.add_argument("--assign_second_stage", action="store_true")
+    parser.add_argument(
+        "--set_cost_class",
+        default=2,
+        type=float,
+        help="Class coefficient in the matching cost",
+    )
+    parser.add_argument(
+        "--set_cost_bbox",
+        default=5,
+        type=float,
+        help="L1 box coefficient in the matching cost",
+    )
+    parser.add_argument(
+        "--set_cost_giou",
+        default=2,
+        type=float,
+        help="giou box coefficient in the matching cost",
+    )
+    # * Loss coefficients
+    parser.add_argument("--mask_loss_coef", default=1, type=float)
+    parser.add_argument("--dice_loss_coef", default=1, type=float)
+    parser.add_argument("--cls_loss_coef", default=2, type=float)
+    parser.add_argument("--bbox_loss_coef", default=5, type=float)
+    parser.add_argument("--giou_loss_coef", default=2, type=float)
+    parser.add_argument("--focal_alpha", default=0.25, type=float)
+    # dataset parameters
+    parser.add_argument("--new_mean_std", action="store_true")
+    parser.add_argument("--dataset_file", default="coco")
+    parser.add_argument("--coco_path", default="./data/coco", type=str)
+    parser.add_argument("--coco_panoptic_path", type=str)
+    parser.add_argument("--remove_difficult", action="store_true")
+    parser.add_argument("--bigger", action="store_true")
+    parser.add_argument("--lsj", action="store_true")
+    parser.add_argument("--lsj_ms", action="store_true")
+    parser.add_argument("--lsj_img_size", default=1024, type=int)
+    parser.add_argument("--lsj_img_train_min", default=480, type=int)
+    parser.add_argument("--lsj_img_size_max", default=-1, type=int)
+    parser.add_argument("--lsj_strong_aug", action="store_true")
+    parser.add_argument("--save_result", action="store_true")
+    parser.add_argument("--save_result_dir", default="", type=str)
+    parser.add_argument("--test_hflip_aug", action="store_true")
+    parser.add_argument("--tta", action="store_true")
+    parser.add_argument("--soft_nms", action="store_true")
+    parser.add_argument("--soft_nms_method", default="quad", type=str)
+    parser.add_argument("--nms_thresh", default=0.7, type=float)
+    parser.add_argument("--quad_scale", default=0.5, type=float)
+    parser.add_argument(
+        "--output_dir", default="", help="path where to save, empty for no saving"
+    )
+    parser.add_argument(
+        "--device", default="cuda", help="device to use for training / testing"
+    )
+    parser.add_argument("--seed", default=42, type=int)
+    parser.add_argument("--resume", default="", help="resume from checkpoint")
+    parser.add_argument("--auto_resume", action="store_true")
+    parser.add_argument(
+        "--resume_norope",
+        action="store_true",
+        help="resume from checkpoint without rope params",
+    )
+    parser.add_argument("--finetune", default="", help="finetune from checkpoint")
+    parser.add_argument("--keep_class_embed", action="store_true")
+    parser.add_argument(
+        "--start_epoch", default=0, type=int, metavar="N", help="start epoch"
+    )
+    parser.add_argument("--eval", action="store_true")
+    parser.add_argument("--num_workers", default=8, type=int)
+    parser.add_argument(
+        "--cache_mode",
+        default=False,
+        action="store_true",
+        help="whether to cache images on memory",
+    )
+    return parser
+# lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
+def match_name_keywords(n, name_keywords):
+    out = False
+    for b in name_keywords:
+        if b in n:
+            out = True
+            break
+    return out
+def get_vit_lr_decay_rate_vev01(name, lr_decay_rate=1.0, num_layers=12):
+    layer_id = num_layers + 1
+    if ".positional_embedding" in name or ".conv1" in name or ".ln_pre" in name:
+        layer_id = 0
+    elif ".resblocks." in name:
+        layer_id = int(name[name.find(".resblocks.") :].split(".")[2]) + 1
+    return lr_decay_rate ** (num_layers + 1 - layer_id)
+def custom_lr(model_without_ddp, args):
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model_without_ddp.named_parameters()
+                if not match_name_keywords(n, args.lr_backbone_names)
+                and not match_name_keywords(n, args.lr_linear_proj_names)
+                and p.requires_grad
+            ],
+            "lr": args.lr,
+        },
+        {
+            "params": [
+                p
+                for n, p in model_without_ddp.named_parameters()
+                if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad
+            ],
+            "lr": args.lr * args.lr_linear_proj_mult,
+        },
+    ]
+    if "vev01" in args.backbone:
+        for p_key, p_value in model_without_ddp.named_parameters():
+            if (
+                match_name_keywords(p_key, args.lr_backbone_names)
+                and p_value.requires_grad
+            ):
+                p_lr = args.lr_backbone * get_vit_lr_decay_rate_vev01(
+                    p_key, args.backbone_lrd, args.backbone_layers
+                )
+                param_dicts.append(
+                    {
+                        "params": [p_value],
+                        "lr": p_lr,
+                    }
+                )
+                print(f"param_name: {p_key}, lr: {p_lr}")
+    else:
+        param_groups_backbone = {
+            "params": [
+                p
+                for n, p in model_without_ddp.named_parameters()
+                if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        }
+        param_dicts.append(param_groups_backbone)
+    return param_dicts
+def main(args):
+    utils.init_distributed_mode(args)
+    print("git:\n  {}\n".format(utils.get_sha()))
+    if args.frozen_weights is not None:
+        assert args.masks, "Frozen training is meant for segmentation only"
+    print(args)
+    device = torch.device(args.device)
+    # fix the seed for reproducibility
+    seed = args.seed + utils.get_rank()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    model, criterion, postprocessors = build_model(args)
+    model.to(device)
+    model_without_ddp = model
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print("model:", model_without_ddp)
+    for n, p in model_without_ddp.named_parameters():
+        print(n)
+    print("number of params:", n_parameters)
+    if args.ema:
+        ema = deepcopy(model).to(device)
+        requires_grad(ema, False)
+        print(f"EMA Parameters: {sum(p.numel() for p in ema.parameters()):,}")
+    dataset_train = build_dataset(image_set="train", args=args)
+    dataset_val = build_dataset(image_set="val", args=args)
+    if args.distributed:
+        if args.cache_mode:
+            sampler_train = samplers.NodeDistributedSampler(dataset_train)
+            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
+        else:
+            if args.dataset_file == "lvis":
+                sampler_train = samplers.RepeatFactorTrainingSampler(dataset_train)
+            else:
+                sampler_train = samplers.DistributedSampler(dataset_train)
+            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
+    else:
+        sampler_train = torch.utils.data.RandomSampler(dataset_train)
+        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
+    batch_sampler_train = torch.utils.data.BatchSampler(
+        sampler_train, args.batch_size, drop_last=True
+    )
+    if args.lsj_ms:
+        collator = utils.CollatorLSJMultiscale(args.lsj_img_size, args.tta)
+    elif args.lsj:
+        lsj_img_size_colla = (
+            args.lsj_img_size_max if args.lsj_img_size_max > 0 else args.lsj_img_size
+        )
+        collator = utils.CollatorLSJ(lsj_img_size_colla, args.tta)
+    else:
+        collator = utils.collate_fn
+    data_loader_train = DataLoader(
+        dataset_train,
+        batch_sampler=batch_sampler_train,
+        collate_fn=collator,
+        num_workers=args.num_workers,
+        pin_memory=True,
+    )
+    data_loader_val = DataLoader(
+        dataset_val,
+        args.batch_size,
+        sampler=sampler_val,
+        drop_last=False,
+        collate_fn=collator,
+        num_workers=args.num_workers,
+        pin_memory=True,
+    )
+    param_dicts = custom_lr(model_without_ddp, args)
+    if args.sgd:
+        optimizer = torch.optim.SGD(
+            param_dicts, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay
+        )
+    else:
+        optimizer = torch.optim.AdamW(
+            param_dicts, lr=args.lr, weight_decay=args.weight_decay
+        )
+    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
+    if args.distributed:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+        model_without_ddp = model.module
+    if args.dataset_file == "coco_panoptic":
+        # We also evaluate AP during panoptic training, on original coco DS
+        coco_val = datasets.coco.build("val", args)
+        base_ds = get_coco_api_from_dataset(coco_val)
+    else:
+        base_ds = get_coco_api_from_dataset(dataset_val)
+    if args.frozen_weights is not None:
+        checkpoint = torch.load(args.frozen_weights, map_location="cpu")
+        model_without_ddp.detr.load_state_dict(checkpoint["model"])
+    if args.tta:
+        evaluate_fn = evaluate_tta
+    else:
+        evaluate_fn = evaluate
+    output_dir = Path(args.output_dir)
+    if args.auto_resume:
+        resumed_ckpt = os.path.join(args.output_dir, "checkpoint.pth")
+        if os.path.exists(resumed_ckpt):
+            args.resume = resumed_ckpt
+            args.finetune = None
+    if args.finetune:
+        checkpoint = torch.load(args.finetune, map_location="cpu")
+        state_dict = checkpoint["model"]
+        for k in list(state_dict.keys()):
+            if "class_embed" in k and not args.keep_class_embed:
+                print("removing", k)
+                del state_dict[k]
+            if "freqs" in k:
+                print("removing", k)
+                del state_dict[k]
+        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
+            state_dict, strict=False
+        )
+        unexpected_keys = [
+            k
+            for k in unexpected_keys
+            if not (k.endswith("total_params") or k.endswith("total_ops"))
+        ]
+        if len(missing_keys) > 0:
+            print("Missing Keys: {}".format(missing_keys))
+        if len(unexpected_keys) > 0:
+            print("Unexpected Keys: {}".format(unexpected_keys))
+        if "epoch" in checkpoint:
+            print("finetuning from epoch", checkpoint["epoch"])
+        if args.ema:
+            ema.load_state_dict(
+                checkpoint["ema"] if "ema" in checkpoint else state_dict, strict=False
+            )
+    if args.resume:
+        print("Resuming training from {}".format(args.resume))
+        if args.resume.startswith("https"):
+            checkpoint = torch.hub.load_state_dict_from_url(
+                args.resume, map_location="cpu", check_hash=True
+            )
+        else:
+            checkpoint = torch.load(args.resume, map_location="cpu")
+        if args.resume_norope:
+            state_dict = checkpoint["model"]
+            for k in list(state_dict.keys()):
+                if "freqs" in k:
+                    print("removing", k)
+                    del state_dict[k]
+            missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
+                state_dict, strict=False
+            )
+            if args.ema:
+                ema.load_state_dict(
+                    checkpoint["ema"] if "ema" in checkpoint else state_dict,
+                    strict=False,
+                )
+        else:
+            missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
+                checkpoint["model"], strict=False
+            )
+            if args.ema:
+                ema.load_state_dict(
+                    checkpoint["ema"] if "ema" in checkpoint else state_dict,
+                    strict=False,
+                )
+        unexpected_keys = [
+            k
+            for k in unexpected_keys
+            if not (k.endswith("total_params") or k.endswith("total_ops"))
+        ]
+        if len(missing_keys) > 0:
+            print("Missing Keys: {}".format(missing_keys))
+        if len(unexpected_keys) > 0:
+            print("Unexpected Keys: {}".format(unexpected_keys))
+        if (
+            not args.eval
+            and "optimizer" in checkpoint
+            and "lr_scheduler" in checkpoint
+            and "epoch" in checkpoint
+        ):
+            import copy
+            p_groups = copy.deepcopy(optimizer.param_groups)
+            optimizer.load_state_dict(checkpoint["optimizer"])
+            for pg, pg_old in zip(optimizer.param_groups, p_groups):
+                pg["lr"] = pg_old["lr"]
+                pg["initial_lr"] = pg_old["initial_lr"]
+            print(optimizer.param_groups)
+            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
+            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
+            args.override_resumed_lr_drop = True
+            if args.override_resumed_lr_drop:
+                print(
+                    "Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler."
+                )
+                lr_scheduler.step_size = args.lr_drop
+                lr_scheduler.base_lrs = list(
+                    map(lambda group: group["initial_lr"], optimizer.param_groups)
+                )
+            lr_scheduler.step(lr_scheduler.last_epoch)
+            args.start_epoch = checkpoint["epoch"] + 1
+        # check the resumed model
+        if not args.eval:
+            test_stats, coco_evaluator = evaluate_fn(
+                model,
+                criterion,
+                postprocessors,
+                data_loader_val,
+                base_ds,
+                device,
+                args.output_dir,
+                args.test_hflip_aug,
+                args.tta,
+                args.soft_nms,
+                ema if args.ema else None,
+                args.save_result,
+                args.save_result_dir,
+                soft_nms_method=args.soft_nms_method,
+                nms_thresh=args.nms_thresh,
+                quad_scale=args.quad_scale,
+                lsj_img_size=args.lsj_img_size,
+            )
+        torch.cuda.empty_cache()
+    if args.eval:
+        test_stats, coco_evaluator = evaluate_fn(
+            model,
+            criterion,
+            postprocessors,
+            data_loader_val,
+            base_ds,
+            device,
+            args.output_dir,
+            args.test_hflip_aug,
+            args.tta,
+            args.soft_nms,
+            ema if args.ema else None,
+            args.save_result,
+            args.save_result_dir,
+            soft_nms_method=args.soft_nms_method,
+            nms_thresh=args.nms_thresh,
+            quad_scale=args.quad_scale,
+            lsj_img_size=args.lsj_img_size,
+        )
+        if args.output_dir:
+            utils.save_on_master(
+                coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth"
+            )
+        return
+    print("Start training")
+    start_time = time.time()
+    if args.ema:
+        ema.eval()  # EMA model should always be in eval mode
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            sampler_train.set_epoch(epoch)
+        train_stats = train_one_epoch(
+            model,
+            criterion,
+            data_loader_train,
+            optimizer,
+            device,
+            epoch,
+            args.clip_max_norm,
+            ema if args.ema else None,
+            ema_decay=args.ema_decay,
+        )
+        lr_scheduler.step()
+        if args.output_dir:
+            checkpoint_paths = [output_dir / "checkpoint.pth"]
+            # extra checkpoint before LR drop and every 5 epochs
+            if (
+                (epoch + 1) % args.lr_drop == 0
+                or (epoch + 1) % args.save_per_epochs == 0
+                or epoch + 1 == args.epochs
+            ):
+                checkpoint_paths.append(output_dir / f"checkpoint{epoch:04}.pth")
+            for checkpoint_path in checkpoint_paths:
+                ckpt_dict = {
+                    "model": model_without_ddp.state_dict(),
+                    "optimizer": optimizer.state_dict(),
+                    "lr_scheduler": lr_scheduler.state_dict(),
+                    "epoch": epoch,
+                    "args": args,
+                }
+                if args.ema:
+                    ckpt_dict["ema"] = ema.state_dict()
+                utils.save_on_master(
+                    ckpt_dict,
+                    checkpoint_path,
+                )
+        torch.cuda.empty_cache()
+        if epoch % args.eval_per_epochs == 0 or epoch + 1 == args.epochs:
+            test_stats, coco_evaluator = evaluate_fn(
+                model,
+                criterion,
+                postprocessors,
+                data_loader_val,
+                base_ds,
+                device,
+                args.output_dir,
+                args.test_hflip_aug,
+                args.tta,
+                args.soft_nms,
+                ema if args.ema else None,
+                args.save_result,
+                args.save_result_dir,
+                soft_nms_method=args.soft_nms_method,
+                nms_thresh=args.nms_thresh,
+                quad_scale=args.quad_scale,
+                lsj_img_size=args.lsj_img_size,
+            )
+            log_stats = {
+                **{f"train_{k}": v for k, v in train_stats.items()},
+                **{f"test_{k}": v for k, v in test_stats.items()},
+                "epoch": epoch,
+                "n_parameters": n_parameters,
+            }
+            if args.output_dir and utils.is_main_process():
+                with (output_dir / "log.txt").open("a") as f:
+                    f.write(json.dumps(log_stats) + "\n")
+                # for evaluation logs
+                if coco_evaluator is not None:
+                    (output_dir / "eval").mkdir(exist_ok=True)
+                    if "bbox" in coco_evaluator.coco_eval:
+                        filenames = ["latest.pth"]
+                        if epoch % 50 == 0:
+                            filenames.append(f"{epoch:03}.pth")
+                        for name in filenames:
+                            torch.save(
+                                coco_evaluator.coco_eval["bbox"].eval,
+                                output_dir / "eval" / name,
+                            )
+        torch.cuda.empty_cache()
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print("Training time {}".format(total_time_str))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        "Deformable DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    if args.output_dir:
+        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+    main(args)

perception_models/apps/detection/DETA_pe/models/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+from .deformable_detr import build
+def build_model(args):
+    return build(args)

perception_models/apps/detection/DETA_pe/models/assigner.py ADDED Viewed

	@@ -0,0 +1,378 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Jeffrey Ouyang-Zhang
+from typing import List
+import torch
+import torch.nn as nn
+from util.box_ops import (
+    box_cxcywh_to_xyxy,
+    box_iou,
+    box_xyxy_to_cxcywh,
+    generalized_box_iou,
+)
+# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/layers/wrappers.py#L100
+def nonzero_tuple(x):
+    """
+    A 'as_tuple=True' version of torch.nonzero to support torchscript.
+    because of https://github.com/pytorch/pytorch/issues/38718
+    """
+    if torch.jit.is_scripting():
+        if x.dim() == 0:
+            return x.unsqueeze(0).nonzero().unbind(1)
+        return x.nonzero().unbind(1)
+    else:
+        return x.nonzero(as_tuple=True)
+# from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9
+class Matcher(object):
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth
+    element. Each predicted element will have exactly zero or one matches; each
+    ground-truth element may be matched to zero or more predicted elements.
+    The matching is determined by the MxN match_quality_matrix, that characterizes
+    how well each (ground-truth, prediction)-pair match each other. For example,
+    if the elements are boxes, this matrix may contain box intersection-over-union
+    overlap values.
+    The matcher returns (a) a vector of length N containing the index of the
+    ground-truth element m in [0, M) that matches to prediction n in [0, N).
+    (b) a vector of length N containing the labels for each prediction.
+    """
+    def __init__(
+        self,
+        thresholds: List[float],
+        labels: List[int],
+        allow_low_quality_matches: bool = False,
+    ):
+        """
+        Args:
+            thresholds (list): a list of thresholds used to stratify predictions
+                into levels.
+            labels (list): a list of values to label predictions belonging at
+                each level. A label can be one of {-1, 0, 1} signifying
+                {ignore, negative class, positive class}, respectively.
+            allow_low_quality_matches (bool): if True, produce additional matches
+                for predictions with maximum match quality lower than high_threshold.
+                See set_low_quality_matches_ for more details.
+            For example,
+                thresholds = [0.3, 0.5]
+                labels = [0, -1, 1]
+                All predictions with iou < 0.3 will be marked with 0 and
+                thus will be considered as false positives while training.
+                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
+                thus will be ignored.
+                All predictions with 0.5 <= iou will be marked with 1 and
+                thus will be considered as true positives.
+        """
+        # Add -inf and +inf to first and last position in thresholds
+        thresholds = thresholds[:]
+        assert thresholds[0] > 0
+        thresholds.insert(0, -float("inf"))
+        thresholds.append(float("inf"))
+        # Currently torchscript does not support all + generator
+        assert all(
+            [low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])]
+        ), thresholds
+        assert all([l in [-1, 0, 1] for l in labels])
+        assert len(labels) == len(thresholds) - 1
+        self.thresholds = thresholds
+        self.labels = labels
+        self.allow_low_quality_matches = allow_low_quality_matches
+    def __call__(self, match_quality_matrix):
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+                pairwise quality between M ground-truth elements and N predicted
+                elements. All elements must be >= 0 (due to the us of `torch.nonzero`
+                for selecting indices in :meth:`set_low_quality_matches_`).
+        Returns:
+            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
+                ground-truth index in [0, M)
+            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
+                whether a prediction is a true or false positive or ignored
+        """
+        assert match_quality_matrix.dim() == 2
+        if match_quality_matrix.numel() == 0:
+            default_matches = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), 0, dtype=torch.int64
+            )
+            # When no gt boxes exist, we define IOU = 0 and therefore set labels
+            # to `self.labels[0]`, which usually defaults to background class 0
+            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
+            default_match_labels = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
+            )
+            return default_matches, default_match_labels
+        assert torch.all(match_quality_matrix >= 0)
+        # match_quality_matrix is M (gt) x N (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+        for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
+            low_high = (matched_vals >= low) & (matched_vals < high)
+            match_labels[low_high] = l
+        if self.allow_low_quality_matches:
+            self.set_low_quality_matches_(match_labels, match_quality_matrix)
+        return matches, match_labels
+    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+        """
+        Produce additional matches for predictions that have only low-quality matches.
+        Specifically, for each ground-truth G find the set of predictions that have
+        maximum overlap with it (including ties); for each prediction in that set, if
+        it is unmatched, then match it to the ground-truth G.
+        This function implements the RPN assignment case (i) in Sec. 3.1.2 of
+        :paper:`Faster R-CNN`.
+        """
+        # For each gt, find the prediction with which it has highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties.
+        # Note that the matches qualities must be positive due to the use of
+        # `torch.nonzero`.
+        _, pred_inds_with_highest_quality = nonzero_tuple(
+            match_quality_matrix == highest_quality_foreach_gt[:, None]
+        )
+        # If an anchor was labeled positive only due to a low-quality match
+        # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
+        # This follows the implementation in Detectron, and is found to have no significant impact.
+        match_labels[pred_inds_with_highest_quality] = 1
+# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/sampling.py#L9
+def subsample_labels(
+    labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int
+):
+    """
+    Return `num_samples` (or fewer, if not enough found)
+    random samples from `labels` which is a mixture of positives & negatives.
+    It will try to return as many positives as possible without
+    exceeding `positive_fraction * num_samples`, and then try to
+    fill the remaining slots with negatives.
+    Args:
+        labels (Tensor): (N, ) label vector with values:
+            * -1: ignore
+            * bg_label: background ("negative") class
+            * otherwise: one or more foreground ("positive") classes
+        num_samples (int): The total number of labels with value >= 0 to return.
+            Values that are not sampled will be filled with -1 (ignore).
+        positive_fraction (float): The number of subsampled labels with values > 0
+            is `min(num_positives, int(positive_fraction * num_samples))`. The number
+            of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`.
+            In order words, if there are not enough positives, the sample is filled with
+            negatives. If there are also not enough negatives, then as many elements are
+            sampled as is possible.
+        bg_label (int): label index of background ("negative") class.
+    Returns:
+        pos_idx, neg_idx (Tensor):
+            1D vector of indices. The total length of both is `num_samples` or fewer.
+    """
+    positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
+    negative = nonzero_tuple(labels == bg_label)[0]
+    num_pos = int(num_samples * positive_fraction)
+    # protect against not enough positive examples
+    num_pos = min(positive.numel(), num_pos)
+    num_neg = num_samples - num_pos
+    # protect against not enough negative examples
+    num_neg = min(negative.numel(), num_neg)
+    # randomly select positive and negative examples
+    perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+    perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+    pos_idx = positive[perm1]
+    neg_idx = negative[perm2]
+    return pos_idx, neg_idx
+def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
+    if len(gt_inds) == 0:
+        return pr_inds, gt_inds
+    # find topk matches for each gt
+    gt_inds2, counts = gt_inds.unique(return_counts=True)
+    scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
+    gt_inds2 = gt_inds2[:, None].repeat(1, k)
+    # filter to as many matches that gt has
+    pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
+    gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])
+    return pr_inds3, gt_inds3
+# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/roi_heads/roi_heads.py#L123
+class Stage2Assigner(nn.Module):
+    def __init__(self, num_queries, max_k=4):
+        super().__init__()
+        self.positive_fraction = 0.25
+        self.bg_label = 400  # number > 91 to filter out later
+        self.batch_size_per_image = num_queries
+        self.proposal_matcher = Matcher(
+            thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True
+        )
+        self.k = max_k
+    def _sample_proposals(
+        self,
+        matched_idxs: torch.Tensor,
+        matched_labels: torch.Tensor,
+        gt_classes: torch.Tensor,
+    ):
+        """
+        Based on the matching between N proposals and M groundtruth,
+        sample the proposals and set their classification labels.
+        Args:
+            matched_idxs (Tensor): a vector of length N, each is the best-matched
+                gt index in [0, M) for each proposal.
+            matched_labels (Tensor): a vector of length N, the matcher's label
+                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
+            gt_classes (Tensor): a vector of length M.
+        Returns:
+            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
+            Tensor: a vector of the same length, the classification label for
+                each sampled proposal. Each sample is labeled as either a category in
+                [0, num_classes) or the background (num_classes).
+        """
+        has_gt = gt_classes.numel() > 0
+        # Get the corresponding GT for each proposal
+        if has_gt:
+            gt_classes = gt_classes[matched_idxs]
+            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+            gt_classes[matched_labels == 0] = self.bg_label
+            # Label ignore proposals (-1 label)
+            gt_classes[matched_labels == -1] = -1
+        else:
+            gt_classes = torch.zeros_like(matched_idxs) + self.bg_label
+        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
+            gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
+        )
+        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
+        return sampled_idxs, gt_classes[sampled_idxs]
+    def forward(self, outputs, targets, return_cost_matrix=False):
+        # COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid.
+        bs = len(targets)
+        indices = []
+        ious = []
+        for b in range(bs):
+            iou, _ = box_iou(
+                box_cxcywh_to_xyxy(targets[b]["boxes"]),
+                box_cxcywh_to_xyxy(outputs["init_reference"][b].detach()),
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(
+                iou
+            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 ow]
+            sampled_idxs, sampled_gt_classes = (
+                self._sample_proposals(  # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label]
+                    matched_idxs, matched_labels, targets[b]["labels"]
+                )
+            )
+            pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
+            pos_gt_inds = matched_idxs[pos_pr_inds]
+            pos_pr_inds, pos_gt_inds = self.postprocess_indices(
+                pos_pr_inds, pos_gt_inds, iou
+            )
+            indices.append((pos_pr_inds, pos_gt_inds))
+            ious.append(iou)
+        if return_cost_matrix:
+            return indices, ious
+        return indices
+    def postprocess_indices(self, pr_inds, gt_inds, iou):
+        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
+# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/proposal_generator/rpn.py#L181
+class Stage1Assigner(nn.Module):
+    def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
+        super().__init__()
+        self.positive_fraction = 0.5
+        self.batch_size_per_image = 256
+        self.k = max_k
+        self.t_low = t_low
+        self.t_high = t_high
+        self.anchor_matcher = Matcher(
+            thresholds=[t_low, t_high],
+            labels=[0, -1, 1],
+            allow_low_quality_matches=True,
+        )
+    def _subsample_labels(self, label):
+        """
+        Randomly sample a subset of positive and negative examples, and overwrite
+        the label vector to the ignore value (-1) for all elements that are not
+        included in the sample.
+        Args:
+            labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
+        """
+        pos_idx, neg_idx = subsample_labels(
+            label, self.batch_size_per_image, self.positive_fraction, 0
+        )
+        # Fill with the ignore label (-1), then set positive and negative labels
+        label.fill_(-1)
+        label.scatter_(0, pos_idx, 1)
+        label.scatter_(0, neg_idx, 0)
+        return label
+    def forward(self, outputs, targets):
+        bs = len(targets)
+        indices = []
+        for b in range(bs):
+            anchors = outputs["anchors"][b]
+            if len(targets[b]["boxes"]) == 0:
+                indices.append(
+                    (
+                        torch.tensor([], dtype=torch.long, device=anchors.device),
+                        torch.tensor([], dtype=torch.long, device=anchors.device),
+                    )
+                )
+                continue
+            iou, _ = box_iou(
+                box_cxcywh_to_xyxy(targets[b]["boxes"]),
+                box_cxcywh_to_xyxy(anchors),
+            )
+            matched_idxs, matched_labels = self.anchor_matcher(
+                iou
+            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 ow]
+            matched_labels = self._subsample_labels(matched_labels)
+            all_pr_inds = torch.arange(len(anchors)).to(anchors.device)
+            pos_pr_inds = all_pr_inds[matched_labels == 1]
+            pos_gt_inds = matched_idxs[pos_pr_inds]
+            pos_ious = iou[pos_gt_inds, pos_pr_inds]
+            pos_pr_inds, pos_gt_inds = self.postprocess_indices(
+                pos_pr_inds, pos_gt_inds, iou
+            )
+            pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(
+                anchors.device
+            )
+            indices.append((pos_pr_inds, pos_gt_inds))
+        return indices
+    def postprocess_indices(self, pr_inds, gt_inds, iou):
+        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)

perception_models/apps/detection/DETA_pe/models/backbone.py ADDED Viewed

	@@ -0,0 +1,235 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Backbone modules.
+"""
+from collections import OrderedDict
+from functools import partial
+from typing import Dict, List
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torch.cuda.amp import autocast
+from torchvision.models._utils import IntermediateLayerGetter
+from util.misc import is_main_process, NestedTensor
+from .position_encoding import build_position_encoding
+from .swin import get_swinl
+from .pev1 import get_pev1_and_fpn_backbone
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d whose batch statistics and affine parameters stay fixed.
+    Adapted from torchvision.misc.ops, with eps added before the rsqrt;
+    without it, models other than torchvision.models.resnet[18,34,50,101]
+    produce NaNs.
+    """
+    def __init__(self, n, eps=1e-5):
+        super().__init__()
+        # Everything is stored as a buffer, so none of it is a learnable parameter.
+        buffer_inits = (
+            ("weight", torch.ones),
+            ("bias", torch.zeros),
+            ("running_mean", torch.zeros),
+            ("running_var", torch.ones),
+        )
+        for buffer_name, make in buffer_inits:
+            self.register_buffer(buffer_name, make(n))
+        self.eps = eps
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        # This module registers no "num_batches_tracked" buffer, so discard that
+        # entry (if present) before delegating to the default loader.
+        state_dict.pop(prefix + "num_batches_tracked", None)
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+    def forward(self, x):
+        # Reshape the per-channel vectors up front to keep the arithmetic fuser-friendly.
+        def per_channel(vec):
+            return vec.reshape(1, -1, 1, 1)
+        scale = per_channel(self.weight) * (
+            per_channel(self.running_var) + self.eps
+        ).rsqrt()
+        shift = per_channel(self.bias) - per_channel(self.running_mean) * scale
+        return x * scale + shift
+class BackboneBase(nn.Module):
+    """Exposes selected `layerN` stages of a backbone as NestedTensor feature maps."""
+    def __init__(
+        self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool
+    ):
+        super().__init__()
+        trainable_stages = ("layer2", "layer3", "layer4")
+        for param_name, param in backbone.named_parameters():
+            # A parameter stays trainable only if the backbone is trained at all
+            # and the parameter belongs to layer2, layer3 or layer4.
+            keep_trainable = train_backbone and any(
+                stage in param_name for stage in trainable_stages
+            )
+            if not keep_trainable:
+                param.requires_grad_(False)
+        if return_interm_layers:
+            stage_to_key = {"layer2": "0", "layer3": "1", "layer4": "2"}
+            strides, channels = [8, 16, 32], [512, 1024, 2048]
+        else:
+            stage_to_key = {"layer4": "0"}
+            strides, channels = [32], [2048]
+        self.strides = strides
+        self.num_channels = channels
+        self.body = IntermediateLayerGetter(backbone, return_layers=stage_to_key)
+    def forward(self, tensor_list: NestedTensor):
+        """Run the backbone and pair every returned feature map with a resized mask."""
+        features = self.body(tensor_list.tensors)
+        result: Dict[str, NestedTensor] = {}
+        for key, feature in features.items():
+            padding_mask = tensor_list.mask
+            assert padding_mask is not None
+            # Resize the mask to this feature map's spatial size, then cast back to bool.
+            resized = F.interpolate(
+                padding_mask[None].float(), size=feature.shape[-2:]
+            )
+            result[key] = NestedTensor(feature, resized.to(torch.bool)[0])
+        return result
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm."""
+    def __init__(
+        self,
+        name: str,
+        train_backbone: bool,
+        return_interm_layers: bool,
+        dilation: bool,
+    ):
+        # Resolve the torchvision constructor from its name.
+        model_factory = getattr(torchvision.models, name)
+        backbone = model_factory(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=is_main_process(),
+            norm_layer=FrozenBatchNorm2d,
+        )
+        # The channel counts set by the parent class do not fit the small ResNets.
+        assert name not in ("resnet18", "resnet34"), "number of channels are hard coded"
+        super().__init__(backbone, train_backbone, return_interm_layers)
+        if dilation:
+            # Dilating the last stage halves its stride.
+            self.strides[-1] //= 2
+class SwinBackbone(nn.Module):
+    """Backbone built by get_swinl() that returns the res3/res4/res5 feature maps."""
+    def __init__(self):
+        # we skip R50 FrozenBatchNorm2d, dilation, train l{2,3,4} only
+        super().__init__()
+        self.body = get_swinl()
+        self.features = ["res3", "res4", "res5"]
+        self.strides = [8, 16, 32]
+        self.num_channels = [384, 768, 1536]
+    def forward(self, tensor_list: NestedTensor):
+        """Return a dict mapping each name in self.features to a NestedTensor.
+        Parameters:
+            tensor_list: NestedTensor providing `tensors` (fed to the body) and `mask`
+        Raises:
+            AssertionError: if tensor_list.mask is None
+        """
+        mask = tensor_list.mask
+        # Validate before indexing: `None[None]` would raise TypeError and the
+        # assertion could never fire.
+        assert mask is not None
+        xs = self.body(tensor_list.tensors)
+        # The float mask with a leading dim is the same for every level; build it once.
+        m = mask[None].float()
+        out: Dict[str, NestedTensor] = {}
+        for name in self.features:
+            feature = xs[name]
+            # Resize the mask to the feature map's spatial size, then cast back to bool.
+            level_mask = F.interpolate(m, size=feature.shape[-2:]).to(torch.bool)[0]
+            out[name] = NestedTensor(feature, level_mask)
+        return out
+class PEv1Backbone(nn.Module):
+    """Backbone built by get_pev1_and_fpn_backbone, optionally run under bf16/fp16 autocast."""
+    def __init__(self, args):
+        """Build the body from `args` and read its output features, strides and channels.
+        Parameters:
+            args: configuration object; `args.bf16` and `args.fp16` are read here and the
+                  whole object is forwarded to get_pev1_and_fpn_backbone
+        """
+        super().__init__()
+        self.body = get_pev1_and_fpn_backbone(args)
+        self.features = self.body._out_features
+        self.bf16 = args.bf16
+        self.fp16 = args.fp16
+        _out_feature_strides = self.body._out_feature_strides
+        _out_feature_channels = self.body._out_feature_channels
+        self.strides = [_out_feature_strides[f] for f in _out_feature_strides.keys()]
+        self.num_channels = [
+            _out_feature_channels[f] for f in _out_feature_channels.keys()
+        ]
+    def forward(self, tensor_list: NestedTensor):
+        """Return a dict mapping each name in self.features to a NestedTensor.
+        Parameters:
+            tensor_list: NestedTensor providing `tensors` (fed to the body) and `mask`
+        Raises:
+            AssertionError: if tensor_list.mask is None
+        """
+        mask = tensor_list.mask
+        # Validate before indexing: `None[None]` would raise TypeError and the
+        # assertion could never fire.
+        assert mask is not None
+        # bf16 takes precedence over fp16 when both flags are set.
+        if self.bf16:
+            amp_dtype = torch.bfloat16
+        elif self.fp16:
+            amp_dtype = torch.float16
+        else:
+            amp_dtype = None
+        if amp_dtype is None:
+            xs = self.body(tensor_list.tensors)
+        else:
+            with autocast(dtype=amp_dtype):
+                xs = self.body(tensor_list.tensors.to(amp_dtype))
+            # Convert the reduced-precision features back to float32.
+            xs = {k: v.float() for k, v in xs.items()}
+        # The float mask with a leading dim is the same for every level; build it once.
+        m = mask[None].float()
+        out: Dict[str, NestedTensor] = {}
+        for name in self.features:
+            feature = xs[name]
+            level_mask = F.interpolate(m, size=feature.shape[-2:]).to(torch.bool)[0]
+            out[name] = NestedTensor(feature, level_mask)
+        return out
+class Joiner(nn.Sequential):
+    """Chains a backbone (index 0) with a position-embedding module (index 1)."""
+    def __init__(self, backbone, position_embedding):
+        super().__init__(backbone, position_embedding)
+        # Re-export the backbone metadata so callers can read it from the joined module.
+        self.strides = backbone.strides
+        self.num_channels = backbone.num_channels
+    def forward(self, tensor_list: NestedTensor):
+        """Return (features, position encodings), both ordered by sorted feature name."""
+        named_features = self[0](tensor_list)
+        features: List[NestedTensor] = [
+            feature for _, feature in sorted(named_features.items())
+        ]
+        # position encoding, cast to the dtype of the matching feature map
+        encodings = [
+            self[1](feature).to(feature.tensors.dtype) for feature in features
+        ]
+        return features, encodings
+def build_backbone(args):
+    """Build the backbone selected by args.backbone and join it with its position encoding."""
+    position_embedding = build_position_encoding(args)
+    train_backbone = args.lr_backbone > 0
+    return_interm_layers = args.masks or (args.num_feature_levels > 1)
+    backbone_name = args.backbone
+    if "swin" in backbone_name:
+        backbone = SwinBackbone()
+    elif "pev1" in backbone_name:
+        backbone = PEv1Backbone(args)
+    else:
+        # Any other name is handed to the torchvision-based Backbone.
+        backbone = Backbone(
+            backbone_name, train_backbone, return_interm_layers, args.dilation
+        )
+    return Joiner(backbone, position_embedding)

perception_models/apps/detection/DETA_pe/models/deformable_detr.py ADDED Viewed

	@@ -0,0 +1,776 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Deformable DETR model and criterion classes.
+"""
+import copy
+import math
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.ops.boxes import batched_nms
+from util import box_ops
+from util.misc import (
+    accuracy,
+    get_world_size,
+    interpolate,
+    inverse_sigmoid,
+    is_dist_avail_and_initialized,
+    nested_tensor_from_tensor_list,
+    NestedTensor,
+)
+from .assigner import Stage1Assigner, Stage2Assigner
+from .backbone import build_backbone
+from .deformable_transformer import build_deforamble_transformer
+from .matcher import build_matcher
+from .segmentation import (
+    DETRsegm,
+    dice_loss,
+    PostProcessPanoptic,
+    PostProcessSegm,
+    sigmoid_focal_loss,
+)
+from .utils_fed_loss import get_fed_loss_inds, load_class_freq
+from .utils_softnms import batched_soft_nms
+def _get_clones(module, N):
+    """Return an nn.ModuleList holding N independent deep copies of `module`."""
+    clones = [copy.deepcopy(module) for _ in range(N)]
+    return nn.ModuleList(clones)
+class DeformableDETR(nn.Module):
+    """This is the Deformable DETR module that performs object detection"""
+    def __init__(
+        self,
+        backbone,
+        transformer,
+        num_classes,
+        num_queries,
+        num_feature_levels,
+        aux_loss=True,
+        with_box_refine=False,
+        two_stage=False,
+    ):
+        """Initializes the model.
+        Parameters:
+            backbone: torch module of the backbone to be used. See backbone.py
+            transformer: torch module of the transformer architecture. See deformable_transformer.py
+            num_classes: number of object classes (output width of the classification head)
+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
+                         DETR can detect in a single image. For COCO, we recommend 100 queries.
+            num_feature_levels: number of feature levels given to the transformer. Levels beyond the
+                         backbone outputs are created with extra stride-2 3x3 convolutions.
+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+            with_box_refine: iterative bounding box refinement
+            two_stage: two-stage Deformable DETR
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.transformer = transformer
+        hidden_dim = transformer.d_model
+        self.class_embed = nn.Linear(hidden_dim, num_classes)
+        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
+        self.num_feature_levels = num_feature_levels
+        # Learned query embeddings are only created for the single-stage variant.
+        if not two_stage:
+            self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
+        if num_feature_levels > 1:
+            num_backbone_outs = len(backbone.strides)
+            input_proj_list = []
+            # One 1x1 projection per backbone output level.
+            for _ in range(num_backbone_outs):
+                in_channels = backbone.num_channels[_]
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
+                        nn.GroupNorm(32, hidden_dim),
+                    )
+                )
+            # Extra levels: the first one consumes the last backbone channel count
+            # (in_channels left over from the loop above), later ones consume hidden_dim.
+            for _ in range(num_feature_levels - num_backbone_outs):
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(
+                            in_channels, hidden_dim, kernel_size=3, stride=2, padding=1
+                        ),
+                        nn.GroupNorm(32, hidden_dim),
+                    )
+                )
+                in_channels = hidden_dim
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            self.input_proj = nn.ModuleList(
+                [
+                    nn.Sequential(
+                        nn.Conv2d(backbone.num_channels[0], hidden_dim, kernel_size=1),
+                        nn.GroupNorm(32, hidden_dim),
+                    )
+                ]
+            )
+        self.backbone = backbone
+        self.aux_loss = aux_loss
+        self.with_box_refine = with_box_refine
+        self.two_stage = two_stage
+        # Classification bias chosen so that sigmoid(bias) == prior_prob at initialisation.
+        prior_prob = 0.01
+        bias_value = -math.log((1 - prior_prob) / prior_prob)
+        self.class_embed.bias.data = torch.ones(num_classes) * bias_value
+        # The last box-regression layer starts at zero.
+        nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
+        for proj in self.input_proj:
+            nn.init.xavier_uniform_(proj[0].weight, gain=1)
+            nn.init.constant_(proj[0].bias, 0)
+        # if two-stage, the last class_embed and bbox_embed is for region proposal generation
+        num_pred = (
+            (transformer.decoder.num_layers + 1)
+            if two_stage
+            else transformer.decoder.num_layers
+        )
+        if with_box_refine:
+            # Independent deep copies: every prediction layer gets its own heads.
+            self.class_embed = _get_clones(self.class_embed, num_pred)
+            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
+            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
+            # hack implementation for iterative bounding box refinement
+            self.transformer.decoder.bbox_embed = self.bbox_embed
+        else:
+            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
+            # The same module object is repeated, so all prediction layers share weights.
+            self.class_embed = nn.ModuleList(
+                [self.class_embed for _ in range(num_pred)]
+            )
+            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
+            self.transformer.decoder.bbox_embed = None
+        if two_stage:
+            # hack implementation for two-stage
+            self.transformer.decoder.class_embed = self.class_embed
+            for box_embed in self.bbox_embed:
+                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
+    def forward(self, samples: NestedTensor):
+        """The forward expects a NestedTensor, which consists of:
+           - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
+           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
+        Any other input is first converted with nested_tensor_from_tensor_list.
+        It returns a dict with the following elements:
+           - "pred_logits": the classification logits for all queries, taken from the last decoder layer.
+                            Shape= [batch_size x num_queries x num_classes]
+           - "pred_boxes": The normalized boxes coordinates for all queries, represented as
+                           (center_x, center_y, width, height). These values are normalized in [0, 1],
+                           relative to the size of each individual image (disregarding possible padding).
+                           See PostProcess for information on how to retrieve the unnormalized bounding box.
+           - "init_reference": the initial reference returned by the transformer.
+           - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
+                            dictionaries containing "pred_logits" and "pred_boxes" for each earlier decoder layer.
+           - "enc_outputs": Optional, only returned in two-stage mode. A dict with the encoder
+                            "pred_logits", "pred_boxes" (after sigmoid) and "anchors".
+        """
+        if not isinstance(samples, NestedTensor):
+            samples = nested_tensor_from_tensor_list(samples)
+        features, pos = self.backbone(samples)
+        srcs = []
+        masks = []
+        # Project every backbone level to the transformer width.
+        for l, feat in enumerate(features):
+            src, mask = feat.decompose()
+            srcs.append(self.input_proj[l](src))
+            masks.append(mask)
+            assert mask is not None
+        if self.num_feature_levels > len(srcs):
+            _len_srcs = len(srcs)
+            # Build the extra levels: the first from the last raw backbone feature,
+            # each following one from the previously created level.
+            for l in range(_len_srcs, self.num_feature_levels):
+                if l == _len_srcs:
+                    src = self.input_proj[l](features[-1].tensors)
+                else:
+                    src = self.input_proj[l](srcs[-1])
+                m = samples.mask
+                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(
+                    torch.bool
+                )[0]
+                # self.backbone[1] is the position-encoding module of the Joiner.
+                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
+                srcs.append(src)
+                masks.append(mask)
+                pos.append(pos_l)
+        query_embeds = None
+        if not self.two_stage:
+            query_embeds = self.query_embed.weight
+        (
+            hs,
+            init_reference,
+            inter_references,
+            enc_outputs_class,
+            enc_outputs_coord_unact,
+            anchors,
+        ) = self.transformer(srcs, masks, pos, query_embeds)
+        outputs_classes = []
+        outputs_coords = []
+        # One prediction per entry along the first dimension of hs.
+        for lvl in range(hs.shape[0]):
+            if lvl == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[lvl - 1]
+            reference = inverse_sigmoid(reference)
+            outputs_class = self.class_embed[lvl](hs[lvl])
+            tmp = self.bbox_embed[lvl](hs[lvl])
+            # Box head outputs are offsets added to the reference in logit space;
+            # a 2-d reference only shifts the first two coordinates.
+            if reference.shape[-1] == 4:
+                tmp += reference
+            else:
+                assert reference.shape[-1] == 2
+                tmp[..., :2] += reference
+            outputs_coord = tmp.sigmoid()
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+        outputs_class = torch.stack(outputs_classes)
+        outputs_coord = torch.stack(outputs_coords)
+        out = {
+            "pred_logits": outputs_class[-1],
+            "pred_boxes": outputs_coord[-1],
+            "init_reference": init_reference,
+        }
+        if self.aux_loss:
+            out["aux_outputs"] = self._set_aux_loss(outputs_class, outputs_coord)
+        if self.two_stage:
+            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
+            out["enc_outputs"] = {
+                "pred_logits": enc_outputs_class,
+                "pred_boxes": enc_outputs_coord,
+                "anchors": anchors,
+            }
+        return out
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        """Pair the class and box predictions of every layer except the last one."""
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [
+            {"pred_logits": a, "pred_boxes": b}
+            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
+        ]
+class SetCriterion(nn.Module):
+    """This class computes the loss for DETR.
+    The process happens in two steps:
+        1) we compute an assignment between ground truth boxes and the outputs of the model
+           (with the matcher, or with the stage-1 / stage-2 assigners when they are enabled)
+        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
+    """
+    def __init__(
+        self,
+        num_classes,
+        matcher,
+        weight_dict,
+        losses,
+        focal_alpha=0.25,
+        num_queries=300,
+        assign_first_stage=False,
+        assign_second_stage=False,
+        use_fed_loss=False,
+    ):
+        """Create the criterion.
+        Parameters:
+            num_classes: number of object categories, omitting the special no-object category
+            matcher: module able to compute a matching between targets and proposals
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+            focal_alpha: alpha in Focal Loss
+            num_queries: forwarded to Stage2Assigner when assign_second_stage is set
+            assign_first_stage: use Stage1Assigner instead of the matcher for the encoder outputs
+            assign_second_stage: use Stage2Assigner instead of the matcher for the decoder outputs
+            use_fed_loss: restrict the classification loss to the classes chosen by get_fed_loss_inds
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.weight_dict = weight_dict
+        self.losses = losses
+        self.focal_alpha = focal_alpha
+        self.assign_first_stage = assign_first_stage
+        self.assign_second_stage = assign_second_stage
+        if self.assign_first_stage:
+            self.stg1_assigner = Stage1Assigner()
+        if self.assign_second_stage:
+            self.stg2_assigner = Stage2Assigner(num_queries)
+        self.use_fed_loss = use_fed_loss
+        if self.use_fed_loss:
+            # Announce once; the message used to be printed three times in a row.
+            print("Using federated loss")
+            self.register_buffer("fed_loss_weight", load_class_freq(freq_weight=0.5))
+    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
+        """Classification loss (sigmoid focal loss)
+        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
+        Returns a dict with "loss_ce" and, when log is True, "class_error".
+        """
+        assert "pred_logits" in outputs
+        src_logits = outputs["pred_logits"]
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat(
+            [t["labels"][J] for t, (_, J) in zip(targets, indices)]
+        )
+        # Unmatched predictions get the extra index num_classes.
+        target_classes = torch.full(
+            src_logits.shape[:2],
+            self.num_classes,
+            dtype=torch.int64,
+            device=src_logits.device,
+        )
+        target_classes[idx] = target_classes_o
+        # One-hot with one extra trailing column, which is dropped right after the scatter.
+        target_classes_onehot = torch.zeros(
+            [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
+            dtype=src_logits.dtype,
+            layout=src_logits.layout,
+            device=src_logits.device,
+        )
+        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
+        target_classes_onehot = target_classes_onehot[:, :, :-1]
+        if self.use_fed_loss:
+            inds = (
+                get_fed_loss_inds(
+                    gt_classes=target_classes_o - 1,
+                    num_sample_cats=50,
+                    weight=self.fed_loss_weight,
+                    C=target_classes_onehot.shape[2] - 1,
+                )
+                + 1
+            )  # pay attention to the -1 and +1
+            loss_ce = (
+                sigmoid_focal_loss(
+                    src_logits[:, :, inds],
+                    target_classes_onehot[:, :, inds],
+                    num_boxes,
+                    alpha=self.focal_alpha,
+                    gamma=2,
+                )
+                * src_logits.shape[1]
+            )
+        else:
+            loss_ce = (
+                sigmoid_focal_loss(
+                    src_logits,
+                    target_classes_onehot,
+                    num_boxes,
+                    alpha=self.focal_alpha,
+                    gamma=2,
+                )
+                * src_logits.shape[1]
+            )
+        losses = {"loss_ce": loss_ce}
+        if log:
+            # TODO this should probably be a separate loss, not hacked in this one here
+            losses["class_error"] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
+        return losses
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
+        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
+        """
+        pred_logits = outputs["pred_logits"]
+        device = pred_logits.device
+        tgt_lengths = torch.as_tensor(
+            [len(v["labels"]) for v in targets], device=device
+        )
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        # NOTE(review): loss_labels drops the extra no-object column from its targets, so the
+        # last logit may be a regular class here - confirm this metric is still meaningful.
+        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
+        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
+        losses = {"cardinality_error": card_err}
+        return losses
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
+        targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
+        The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        assert "pred_boxes" in outputs
+        idx = self._get_src_permutation_idx(indices)
+        src_boxes = outputs["pred_boxes"][idx]
+        target_boxes = torch.cat(
+            [t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0
+        )
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
+        losses = {}
+        losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+        # Only the diagonal is needed: prediction i is paired with target i.
+        loss_giou = 1 - torch.diag(
+            box_ops.generalized_box_iou(
+                box_ops.box_cxcywh_to_xyxy(src_boxes),
+                box_ops.box_cxcywh_to_xyxy(target_boxes),
+            )
+        )
+        losses["loss_giou"] = loss_giou.sum() / num_boxes
+        return losses
+    def loss_masks(self, outputs, targets, indices, num_boxes):
+        """Compute the losses related to the masks: the focal loss and the dice loss.
+        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
+        """
+        assert "pred_masks" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        src_masks = outputs["pred_masks"]
+        # TODO use valid to mask invalid areas due to padding in loss
+        target_masks, valid = nested_tensor_from_tensor_list(
+            [t["masks"] for t in targets]
+        ).decompose()
+        target_masks = target_masks.to(src_masks)
+        src_masks = src_masks[src_idx]
+        # upsample predictions to the target size
+        src_masks = interpolate(
+            src_masks[:, None],
+            size=target_masks.shape[-2:],
+            mode="bilinear",
+            align_corners=False,
+        )
+        src_masks = src_masks[:, 0].flatten(1)
+        target_masks = target_masks[tgt_idx].flatten(1)
+        losses = {
+            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
+            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
+        }
+        return losses
+    def _get_src_permutation_idx(self, indices):
+        """Return (batch_idx, src_idx) index tensors selecting the matched predictions."""
+        # permute predictions following indices
+        batch_idx = torch.cat(
+            [torch.full_like(src, i) for i, (src, _) in enumerate(indices)]
+        )
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+    def _get_tgt_permutation_idx(self, indices):
+        """Return (batch_idx, tgt_idx) index tensors selecting the matched targets."""
+        # permute targets following indices
+        batch_idx = torch.cat(
+            [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]
+        )
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        """Dispatch to the loss named `loss`; one of "labels", "cardinality", "boxes", "masks"."""
+        loss_map = {
+            "labels": self.loss_labels,
+            "cardinality": self.loss_cardinality,
+            "boxes": self.loss_boxes,
+            "masks": self.loss_masks,
+        }
+        assert loss in loss_map, f"do you really want to compute {loss} loss?"
+        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
+    def forward(self, outputs, targets):
+        """This performs the loss computation.
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+        Returns:
+             dict of losses; auxiliary-layer entries carry a "_<layer index>" suffix and
+             encoder entries an "_enc" suffix
+        """
+        outputs_without_aux = {
+            k: v
+            for k, v in outputs.items()
+            if k != "aux_outputs" and k != "enc_outputs"
+        }
+        # Retrieve the matching between the outputs of the last layer and the targets
+        if self.assign_second_stage:
+            indices = self.stg2_assigner(outputs_without_aux, targets)
+        else:
+            indices = self.matcher(outputs_without_aux, targets)
+        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        num_boxes = sum(len(t["labels"]) for t in targets)
+        num_boxes = torch.as_tensor(
+            [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
+        )
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_boxes)
+        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "aux_outputs" in outputs:
+            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
+                # With the stage-2 assigner the last-layer indices are reused for every layer.
+                if not self.assign_second_stage:
+                    indices = self.matcher(aux_outputs, targets)
+                for loss in self.losses:
+                    if loss == "masks":
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    kwargs = {}
+                    if loss == "labels":
+                        # Logging is enabled only for the last layer
+                        kwargs["log"] = False
+                    l_dict = self.get_loss(
+                        loss, aux_outputs, targets, indices, num_boxes, **kwargs
+                    )
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+        if "enc_outputs" in outputs:
+            enc_outputs = outputs["enc_outputs"]
+            # The encoder is supervised class-agnostically: every label becomes 0 on a copy.
+            bin_targets = copy.deepcopy(targets)
+            for bt in bin_targets:
+                bt["labels"] = torch.zeros_like(bt["labels"])
+            if self.assign_first_stage:
+                indices = self.stg1_assigner(enc_outputs, bin_targets)
+            else:
+                indices = self.matcher(enc_outputs, bin_targets)
+            for loss in self.losses:
+                if loss == "masks":
+                    # Intermediate masks losses are too costly to compute, we ignore them.
+                    continue
+                kwargs = {}
+                if loss == "labels":
+                    # Logging is enabled only for the last layer
+                    kwargs["log"] = False
+                l_dict = self.get_loss(
+                    loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs
+                )
+                l_dict = {k + "_enc": v for k, v in l_dict.items()}
+                losses.update(l_dict)
+        return losses
+class PostProcess(nn.Module):
+    """This module converts the model's output into the format expected by the coco api"""
+    @torch.no_grad()
+    def forward(self, outputs, target_sizes, num_topk=100):
+        """Keep the num_topk best (query, class) pairs per image and rescale their boxes.
+        Parameters:
+            outputs: raw outputs of the model
+            target_sizes: tensor of dimension [batch_size x 2] containing the (height, width) of each image
+                          For evaluation, this must be the original image size (before any data augmentation)
+                          For visualization, this should be the image size after data augment, but before padding
+            num_topk: number of detections kept per image
+        Returns:
+            one dict per image with "scores", "labels" and "boxes" (absolute xyxy coordinates)
+        """
+        logits = outputs["pred_logits"]
+        pred_boxes = outputs["pred_boxes"]
+        assert len(logits) == len(target_sizes)
+        assert target_sizes.shape[1] == 2
+        num_classes = logits.shape[2]
+        # Flatten queries and classes so that top-k ranks every (query, class) pair jointly.
+        flat_prob = logits.sigmoid().view(logits.shape[0], -1)
+        scores, flat_idx = torch.topk(flat_prob, num_topk, dim=1)
+        # Recover which query and which class each flat index refers to.
+        query_idx = flat_idx // num_classes
+        labels = flat_idx % num_classes
+        xyxy = box_ops.box_cxcywh_to_xyxy(pred_boxes)
+        gather_idx = query_idx.unsqueeze(-1).repeat(1, 1, 4)
+        xyxy = torch.gather(xyxy, 1, gather_idx)
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        xyxy = xyxy * scale[:, None, :]
+        return [
+            {"scores": score, "labels": label, "boxes": box}
+            for score, label, box in zip(scores, labels, xyxy)
+        ]
+class NMSPostProcess(nn.Module):
+    """This module converts the model's output into the format expected by the coco api"""
+    @torch.no_grad()
+    def forward(
+        self,
+        outputs,
+        target_sizes,
+        num_topk=100,
+        soft_nms=False,
+        nms_thresh=0.7,
+        method="quad",
+        quad_scale=1.0,
+    ):
+        """Perform the computation
+        Parameters:
+            outputs: raw outputs of the model
+            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
+                          For evaluation, this must be the original image size (before any data augmentation)
+                          For visualization, this should be the image size after data augment, but before padding
+        """
+        out_logits, out_bbox = outputs["pred_logits"], outputs["pred_boxes"]
+        bs, n_queries, n_cls = out_logits.shape
+        assert len(out_logits) == len(target_sizes)
+        assert target_sizes.shape[1] == 2
+        prob = out_logits.sigmoid()
+        all_scores = prob.view(bs, n_queries * n_cls).to(out_logits.device)
+        all_indexes = (
+            torch.arange(n_queries * n_cls)[None].repeat(bs, 1).to(out_logits.device)
+        )
+        all_boxes = all_indexes // out_logits.shape[2]
+        all_labels = all_indexes % out_logits.shape[2]
+        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
+        boxes = torch.gather(boxes, 1, all_boxes.unsqueeze(-1).repeat(1, 1, 4))
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        boxes = boxes * scale_fct[:, None, :]
+        results = []
+        for b in range(bs):
+            box = boxes[b]
+            score = all_scores[b]
+            lbls = all_labels[b]
+            if soft_nms:
+                if n_queries * n_cls > 2000:
+                    pre_topk = score.topk(2000).indices
+                    box = box[pre_topk]
+                    score = score[pre_topk]
+                    lbls = lbls[pre_topk]
+                # Apply soft-NMS to get indices and updated scores
+                keep_inds, updated_scores = batched_soft_nms(
+                    box,
+                    score,
+                    lbls,
+                    nms_thresh,
+                    method=method,
+                    quad_scale=quad_scale,
+                )[:num_topk]
+                results.append(
+                    {
+                        "scores": updated_scores,
+                        "labels": lbls[keep_inds],
+                        "boxes": box[keep_inds],
+                    }
+                )
+            else:
+                if n_queries * n_cls > 10000:
+                    pre_topk = score.topk(10000).indices
+                    box = box[pre_topk]
+                    score = score[pre_topk]
+                    lbls = lbls[pre_topk]
+                keep_inds = batched_nms(box, score, lbls, nms_thresh)[:num_topk]
+                results.append(
+                    {
+                        "scores": score[keep_inds],
+                        "labels": lbls[keep_inds],
+                        "boxes": box[keep_inds],
+                    }
+                )
+        return results
+class MLP(nn.Module):
+    """Very simple multi-layer perceptron (also called FFN)"""
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(
+            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
+        )
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+def build(args):
+    # num_classes = 20 if args.dataset_file != 'coco' else 91
+    if args.dataset_file == "coco_panoptic":
+        num_classes = 250
+    elif args.dataset_file == "voc":
+        num_classes = 20
+    elif args.dataset_file == "objects365":
+        num_classes = 366
+    elif args.dataset_file == "lvis":
+        num_classes = 1204
+    else:  # coco
+        num_classes = 91
+    device = torch.device(args.device)
+    backbone = build_backbone(args)
+    transformer = build_deforamble_transformer(args)
+    model = DeformableDETR(
+        backbone,
+        transformer,
+        num_classes=num_classes,
+        num_queries=args.num_queries,
+        num_feature_levels=args.num_feature_levels,
+        aux_loss=args.aux_loss,
+        with_box_refine=args.with_box_refine,
+        two_stage=args.two_stage,
+    )
+    if args.masks:
+        model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None))
+    matcher = build_matcher(args)
+    weight_dict = {"loss_ce": args.cls_loss_coef, "loss_bbox": args.bbox_loss_coef}
+    weight_dict["loss_giou"] = args.giou_loss_coef
+    if args.masks:
+        weight_dict["loss_mask"] = args.mask_loss_coef
+        weight_dict["loss_dice"] = args.dice_loss_coef
+    # TODO this is a hack
+    if args.aux_loss:
+        aux_weight_dict = {}
+        for i in range(args.dec_layers - 1):
+            aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+        aux_weight_dict.update({k + f"_enc": v for k, v in weight_dict.items()})
+        weight_dict.update(aux_weight_dict)
+    losses = ["labels", "boxes", "cardinality"]
+    if args.masks:
+        losses += ["masks"]
+    # num_classes, matcher, weight_dict, losses, focal_alpha=0.25
+    criterion = SetCriterion(
+        num_classes,
+        matcher,
+        weight_dict,
+        losses,
+        focal_alpha=args.focal_alpha,
+        num_queries=args.num_queries,
+        assign_first_stage=args.assign_first_stage,
+        assign_second_stage=args.assign_second_stage,
+        use_fed_loss=args.use_fed_loss,
+    )
+    criterion.to(device)
+    if args.assign_second_stage:
+        postprocessors = {"bbox": NMSPostProcess()}
+    else:
+        postprocessors = {"bbox": PostProcess()}
+    if args.masks:
+        postprocessors["segm"] = PostProcessSegm()
+        if args.dataset_file == "coco_panoptic":
+            is_thing_map = {i: i <= 90 for i in range(201)}
+            postprocessors["panoptic"] = PostProcessPanoptic(
+                is_thing_map, threshold=0.85
+            )
+    return model, criterion, postprocessors

perception_models/apps/detection/DETA_pe/models/deformable_transformer.py ADDED Viewed

	@@ -0,0 +1,451 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+import copy
+from typing import Optional, List
+import math
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
+from util.misc import inverse_sigmoid
+from models.ops.modules import MSDeformAttn
+from torchvision.ops.boxes import batched_nms
+from util.box_ops import box_cxcywh_to_xyxy
+class DeformableTransformer(nn.Module):
+    """Encoder-decoder transformer built from multi-scale deformable attention layers.
+    With ``two_stage`` the decoder queries are derived from encoder-side box proposals instead of
+    a learned ``query_embed``; ``assign_first_stage`` additionally selects those proposals with a
+    per-level top-k followed by NMS rather than a plain top-k.
+    """
+    def __init__(self, d_model=256, nhead=8,
+                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
+                 activation="relu", return_intermediate_dec=False,
+                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4,
+                 two_stage=False, two_stage_num_proposals=300,
+                 assign_first_stage=False):
+        super().__init__()
+        self.d_model = d_model
+        self.nhead = nhead
+        self.two_stage = two_stage
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.assign_first_stage = assign_first_stage
+        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, enc_n_points)
+        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)
+        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, dec_n_points)
+        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec)
+        # One learned embedding per feature level, added to the positional embedding in forward().
+        # Allocated uninitialised here; filled by normal_ in _reset_parameters().
+        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+        if two_stage:
+            # Projections used to turn encoder memory / proposals into decoder queries.
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
+            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
+            self.pix_trans = nn.Linear(d_model, d_model)
+            self.pix_trans_norm = nn.LayerNorm(d_model)
+        else:
+            # Single-stage: 2-d reference points are predicted from the query embedding.
+            self.reference_points = nn.Linear(d_model, 2)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """Initialise weights: Xavier for every parameter with more than one dimension, then the
+        MSDeformAttn modules' own reset, the reference-point head and the level embedding."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        # Runs after the generic Xavier pass so the attention modules' own init takes precedence.
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        if not self.two_stage:
+            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
+            constant_(self.reference_points.bias.data, 0.)
+        normal_(self.level_embed)
+    def get_proposal_pos_embed(self, proposals):
+        """Sine/cosine positional embedding of unactivated (pre-sigmoid) proposal boxes.
+        Each coordinate is squashed with a sigmoid, scaled to [0, 2*pi] and expanded to
+        ``num_pos_feats`` (128) features; the per-coordinate features are flattened together
+        in the last dimension.
+        """
+        num_pos_feats = 128
+        temperature = 10000
+        scale = 2 * math.pi
+        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
+        # Consecutive (sin, cos) feature pairs share one frequency.
+        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
+        # N, L, 4
+        proposals = proposals.sigmoid() * scale
+        # N, L, 4, 128
+        pos = proposals[:, :, :, None] / dim_t
+        # N, L, 4, 64, 2
+        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+        return pos
+    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
+        """Create one anchor-like box proposal per encoder token.
+        Parameters:
+            memory: encoder output, unpacked as (N, S, C)
+            memory_padding_mask: boolean mask over the S tokens, True where padded
+            spatial_shapes: iterable of (H, W) per feature level
+        Returns:
+            output_memory: memory zeroed at padded/invalid positions, then passed through
+                           enc_output and enc_output_norm
+            output_proposals: proposals in inverse-sigmoid (logit) space, inf where padded/invalid
+            level_ids: long tensor giving the feature level index of every token
+        """
+        N_, S_, C_ = memory.shape
+        # NOTE: base_scale is assigned but not referenced again in this method.
+        base_scale = 4.0
+        proposals = []
+        _cur = 0
+        level_ids = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+            # Count unpadded rows / columns along the first column / first row.
+            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+            # Cell centres, normalised by the valid (unpadded) extent of each image.
+            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+            # Proposal width/height doubles with each level.
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
+            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+            proposals.append(proposal)
+            _cur += (H_ * W_)
+            level_ids.append(grid.new_ones(H_ * W_, dtype=torch.long) * lvl)
+        output_proposals = torch.cat(proposals, 1)
+        # A proposal is valid only if all four values lie strictly inside (0.01, 0.99).
+        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        # Inverse sigmoid, so predicted offsets can be added in logit space.
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))
+        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+        output_memory = memory
+        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+        output_memory = self.enc_output_norm(self.enc_output(output_memory))
+        level_ids = torch.cat(level_ids)
+        return output_memory, output_proposals, level_ids
+    def get_valid_ratio(self, mask):
+        """Return, per image, the unpadded fraction of the feature map as (ratio_w, ratio_h).
+        The valid extent is measured along the first column and first row of ``mask``
+        (True marks padding)."""
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+    def forward(self, srcs, masks, pos_embeds, query_embed=None):
+        """Run the encoder, build decoder queries and run the decoder.
+        Parameters:
+            srcs: per-level feature maps, each unpacked as (bs, c, h, w)
+            masks: per-level padding masks (True where padded)
+            pos_embeds: per-level positional embeddings, flattened like ``srcs``
+            query_embed: learned query embedding; required unless ``two_stage`` is set
+        Returns:
+            (hs, init_reference_out, inter_references_out, enc_outputs_class,
+             enc_outputs_coord_unact, anchors) where the last three are None when not two-stage
+             and ``anchors`` is ``output_proposals.sigmoid()`` otherwise
+        """
+        assert self.two_stage or query_embed is not None
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+            # (bs, c, h, w) -> (bs, h*w, c)
+            src = src.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        # Concatenate all levels along the token dimension.
+        src_flatten = torch.cat(src_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        # Offset of each level's first token inside the flattened sequence.
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)
+        # prepare input for decoder
+        bs, _, c = memory.shape
+        if self.two_stage:
+            output_memory, output_proposals, level_ids = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)
+            # hack implementation for two-stage Deformable DETR
+            # The heads at index num_layers (one past the per-decoder-layer heads) score the encoder
+            # proposals; decoder.class_embed / bbox_embed are None by default and must have been
+            # assigned before this path runs.
+            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
+            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals
+            topk = self.two_stage_num_proposals
+            # Class channel 0 is used as the proposal score.
+            proposal_logit = enc_outputs_class[..., 0]
+            if self.assign_first_stage:
+                proposal_boxes = box_cxcywh_to_xyxy(enc_outputs_coord_unact.sigmoid().float()).clamp(0, 1)
+                topk_proposals = []
+                for b in range(bs):
+                    prop_boxes_b = proposal_boxes[b]
+                    prop_logits_b = proposal_logit[b]
+                    # pre-nms per-level topk
+                    pre_nms_topk = 1000
+                    pre_nms_inds = []
+                    for lvl in range(len(spatial_shapes)):
+                        lvl_mask = level_ids == lvl
+                        # Scores of tokens from other levels are zeroed by the mask.
+                        # NOTE(review): a level with fewer than pre_nms_topk tokens will also pull in
+                        # zero-score indices from other levels here - confirm this is acceptable.
+                        pre_nms_inds.append(torch.topk(prop_logits_b.sigmoid() * lvl_mask, pre_nms_topk)[1])
+                    pre_nms_inds = torch.cat(pre_nms_inds)
+                    # nms on topk indices
+                    post_nms_inds = batched_nms(prop_boxes_b[pre_nms_inds], prop_logits_b[pre_nms_inds], level_ids[pre_nms_inds], 0.9)
+                    keep_inds = pre_nms_inds[post_nms_inds]
+                    if len(keep_inds) < self.two_stage_num_proposals:
+                        print(f'[WARNING] nms proposals ({len(keep_inds)}) < {self.two_stage_num_proposals}, running naive topk')
+                        keep_inds = torch.topk(proposal_logit[b], topk)[1]
+                    # keep top Q/L indices for L levels
+                    q_per_l = topk // len(spatial_shapes)
+                    is_level_ordered = level_ids[keep_inds][None] == torch.arange(len(spatial_shapes), device=level_ids.device)[:,None]  # LS
+                    # Running count per level: keep only the first q_per_l survivors of each level.
+                    keep_inds_mask = is_level_ordered & (is_level_ordered.cumsum(1) <= q_per_l)  # LS
+                    keep_inds_mask = keep_inds_mask.any(0)  # S
+                    # pad to Q indices (might let ones filtered from pre-nms sneak by... unlikely because we pick high conf anyways)
+                    if keep_inds_mask.sum() < topk:
+                        num_to_add = topk - keep_inds_mask.sum()
+                        pad_inds = (~keep_inds_mask).nonzero()[:num_to_add]
+                        keep_inds_mask[pad_inds] = True
+                    # index
+                    keep_inds_topk = keep_inds[keep_inds_mask]
+                    topk_proposals.append(keep_inds_topk)
+                topk_proposals = torch.stack(topk_proposals)
+            else:
+                topk_proposals = torch.topk(proposal_logit, topk, dim=1)[1]
+            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+            # Selected proposals act as fixed reference boxes: no gradient flows back through them.
+            topk_coords_unact = topk_coords_unact.detach()
+            reference_points = topk_coords_unact.sigmoid()
+            init_reference_out = reference_points
+            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
+            # Split the 2*c projection into positional query and content query halves.
+            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
+            topk_feats = torch.stack([output_memory[b][topk_proposals[b]] for b in range(bs)]).detach()
+            tgt = tgt + self.pix_trans_norm(self.pix_trans(topk_feats))
+        else:
+            # Single-stage: the learned embedding holds positional and content halves side by side.
+            query_embed, tgt = torch.split(query_embed, c, dim=1)
+            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+            reference_points = self.reference_points(query_embed).sigmoid()
+            init_reference_out = reference_points
+        # decoder
+        hs, inter_references = self.decoder(tgt, reference_points, memory,
+                                            spatial_shapes, level_start_index, valid_ratios, query_embed, mask_flatten)
+        inter_references_out = inter_references
+        if self.two_stage:
+            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact, output_proposals.sigmoid()
+        return hs, init_reference_out, inter_references_out, None, None, None
+class DeformableTransformerEncoderLayer(nn.Module):
+    """One encoder layer: multi-scale deformable self-attention followed by a feed-forward
+    network, each wrapped in dropout + residual connection + LayerNorm (post-norm)."""
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4):
+        super().__init__()
+        # self attention
+        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        """Add the positional embedding to ``tensor`` unless ``pos`` is None."""
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, src):
+        """Feed-forward sub-block with residual connection and LayerNorm."""
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)
+        src = self.norm2(src)
+        return src
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
+        """Apply self-attention then the FFN to ``src`` and return the updated tokens."""
+        # self attention
+        # The query carries the positional embedding; the attended values come from the raw src.
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        # ffn
+        src = self.forward_ffn(src)
+        return src
+class DeformableTransformerEncoder(nn.Module):
+    def __init__(self, encoder_layer, num_layers):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        reference_points_list = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
+        output = src
+        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        for _, layer in enumerate(self.layers):
+            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
+        return output
+class DeformableTransformerDecoderLayer(nn.Module):
+    """One decoder layer: query self-attention, multi-scale deformable cross-attention into the
+    encoder memory, then a feed-forward network; each is followed by dropout + residual + LayerNorm."""
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4):
+        super().__init__()
+        # cross attention
+        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        """Add the positional embedding to ``tensor`` unless ``pos`` is None."""
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, tgt):
+        """Feed-forward sub-block with residual connection and LayerNorm."""
+        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+    def forward(self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None):
+        """Update the queries ``tgt`` with self-attention, cross-attention into ``src`` and the FFN."""
+        # self attention
+        q = k = self.with_pos_embed(tgt, query_pos)
+        # self_attn is built without batch_first, so the first two dims are swapped going in and
+        # swapped back coming out; the value is tgt without the positional embedding.
+        tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+        # cross attention
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos),
+                               reference_points,
+                               src, src_spatial_shapes, level_start_index, src_padding_mask)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+        # ffn
+        tgt = self.forward_ffn(tgt)
+        return tgt
+class DeformableTransformerDecoder(nn.Module):
+    """Stack of cloned decoder layers with optional iterative refinement of the reference points."""
+    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.return_intermediate = return_intermediate
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        # Both stay None unless assigned from outside this class (presumably by the detection model).
+        self.bbox_embed = None
+        self.class_embed = None
+    def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios,
+                query_pos=None, src_padding_mask=None):
+        """Run every decoder layer over the queries.
+        Returns:
+            when ``return_intermediate`` is set, the stacked per-layer outputs and the stacked
+            per-layer reference points; otherwise the final output and final reference points
+        """
+        output = tgt
+        intermediate = []
+        intermediate_reference_points = []
+        for lid, layer in enumerate(self.layers):
+            # Express the reference points per feature level, scaled by each level's valid ratio.
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] \
+                                         * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None]
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
+            output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                tmp = self.bbox_embed[lid](output)
+                if reference_points.shape[-1] == 4:
+                    # The predicted delta is added in inverse-sigmoid (logit) space.
+                    new_reference_points = tmp + inverse_sigmoid(reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                else:
+                    assert reference_points.shape[-1] == 2
+                    # new_reference_points aliases tmp, so the next line writes into tmp in place.
+                    new_reference_points = tmp
+                    new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                # Detach so the next layer's reference points carry no gradient.
+                reference_points = new_reference_points.detach()
+            if self.return_intermediate:
+                intermediate.append(output)
+                intermediate_reference_points.append(reference_points)
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(intermediate_reference_points)
+        return output, reference_points
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+def _get_activation_fn(activation):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
+def build_deforamble_transformer(args):
+    return DeformableTransformer(
+        d_model=args.hidden_dim,
+        nhead=args.nheads,
+        num_encoder_layers=args.enc_layers,
+        num_decoder_layers=args.dec_layers,
+        dim_feedforward=args.dim_feedforward,
+        dropout=args.dropout,
+        activation="relu",
+        return_intermediate_dec=True,
+        num_feature_levels=args.num_feature_levels,
+        dec_n_points=args.dec_n_points,
+        enc_n_points=args.enc_n_points,
+        two_stage=args.two_stage,
+        two_stage_num_proposals=args.num_queries,
+        assign_first_stage=args.assign_first_stage,
+        )

perception_models/apps/detection/DETA_pe/models/matcher.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Modules to compute the matching cost and solve the corresponding LSAP.
+"""
+import torch
+from scipy.optimize import linear_sum_assignment
+from torch import nn
+from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
+class HungarianMatcher(nn.Module):
+    """This class computes an assignment between the targets and the predictions of the network
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
+    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
+    while the others are un-matched (and thus treated as non-objects).
+    """
+    def __init__(self,
+                 cost_class: float = 1,
+                 cost_bbox: float = 1,
+                 cost_giou: float = 1):
+        """Creates the matcher
+        Params:
+            cost_class: This is the relative weight of the classification error in the matching cost
+            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
+            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
+        """
+        super().__init__()
+        self.cost_class = cost_class
+        self.cost_bbox = cost_bbox
+        self.cost_giou = cost_giou
+        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
+    def forward(self, outputs, targets):
+        """ Performs the matching
+        The whole batch is flattened into one cost matrix that combines a
+        focal-loss-style classification cost, an L1 box cost and a negated
+        generalized-IoU cost, weighted by self.cost_class, self.cost_bbox and
+        self.cost_giou. One linear sum assignment is then solved per batch element.
+        All computation happens under torch.no_grad().
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            Both index tensors are int64.
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        with torch.no_grad():
+            bs, num_queries = outputs["pred_logits"].shape[:2]
+            # We flatten to compute the cost matrices in a batch
+            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
+            out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+            # Also concat the target labels and boxes
+            tgt_ids = torch.cat([v["labels"] for v in targets])
+            tgt_bbox = torch.cat([v["boxes"] for v in targets])
+            # Compute the classification cost.
+            # alpha / gamma are the fixed weighting and focusing constants of the
+            # focal-loss-style terms below; 1e-8 keeps the logarithms finite.
+            alpha = 0.25
+            gamma = 2.0
+            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
+            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+            # Index the class axis with the target labels: one column per target,
+            # holding the positive term minus the negative term for that class.
+            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
+            # Compute the L1 cost between boxes
+            cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
+            # Compute the giou cost between boxes (negated: a higher GIoU means a lower cost)
+            cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
+                                             box_cxcywh_to_xyxy(tgt_bbox))
+            # Final cost matrix
+            C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
+            # Restore the batch axis and move to the CPU before solving the assignments.
+            C = C.view(bs, num_queries, -1).cpu()
+            sizes = [len(v["boxes"]) for v in targets]
+            # Split the target axis per image; c[i] keeps only image i's predictions
+            # against image i's own targets.
+            indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
+            return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+def build_matcher(args):
+    """Create a HungarianMatcher from parsed options.
+
+    Reads `set_cost_class`, `set_cost_bbox` and `set_cost_giou` from `args` and
+    passes them as the matcher's three cost weights.
+    """
+    return HungarianMatcher(cost_class=args.set_cost_class,
+                            cost_bbox=args.set_cost_bbox,
+                            cost_giou=args.set_cost_giou)

perception_models/apps/detection/DETA_pe/models/ops/functions/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from .ms_deform_attn_func import ms_deform_attn_core_pytorch, MSDeformAttnFunction

perception_models/apps/detection/DETA_pe/models/ops/functions/ms_deform_attn_func.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import, division, print_function
+import MultiScaleDeformableAttention as MSDA
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+class MSDeformAttnFunction(Function):
+    """Autograd wrapper around the compiled MultiScaleDeformableAttention (MSDA) kernels.
+
+    forward delegates to MSDA.ms_deform_attn_forward and backward to
+    MSDA.ms_deform_attn_backward. Gradients are produced only for value,
+    sampling_locations and attention_weights.
+    """
+    @staticmethod
+    def forward(
+        ctx,
+        value,
+        value_spatial_shapes,
+        value_level_start_index,
+        sampling_locations,
+        attention_weights,
+        im2col_step,
+    ):
+        """Run the compiled forward kernel and stash the inputs needed by backward."""
+        # im2col_step is not a tensor, so it is kept as a plain attribute on ctx.
+        ctx.im2col_step = im2col_step
+        output = MSDA.ms_deform_attn_forward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            ctx.im2col_step,
+        )
+        ctx.save_for_backward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+        )
+        return output
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        """Run the compiled backward kernel on the tensors saved by forward."""
+        (
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+        ) = ctx.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            grad_output,
+            ctx.im2col_step,
+        )
+        # One entry per forward input (after ctx): the spatial shapes, the level
+        # start indices and im2col_step receive no gradient.
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+def ms_deform_attn_core_pytorch(
+    value, value_spatial_shapes, sampling_locations, attention_weights
+):
+    """Pure-PyTorch multi-scale deformable attention built on F.grid_sample.
+
+    Shapes, as unpacked below:
+        value:              (N_, S_, M_, D_), with S_ the sum of H_ * W_ over all levels
+        sampling_locations: (N_, Lq_, M_, L_, P_, 2)
+        attention_weights:  reshaped to (N_ * M_, 1, Lq_, L_ * P_) after swapping
+                            its axes 1 and 2
+    value_spatial_shapes is iterated as (H_, W_) pairs, one per level.
+
+    Returns a contiguous tensor of shape (N_, Lq_, M_ * D_).
+    """
+    # for debug and test only,
+    # need to use cuda version instead
+    N_, S_, M_, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    # Split the flattened spatial axis back into one chunk per feature level.
+    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    # Affine map x -> 2x - 1 (sends [0, 1] to [-1, 1]) before handing the
+    # coordinates to F.grid_sample.
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = (
+            value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_)
+        )
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(
+            value_l_,
+            sampling_grid_l_,
+            mode="bilinear",
+            padding_mode="zeros",
+            align_corners=False,
+        )
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose(1, 2).reshape(
+        N_ * M_, 1, Lq_, L_ * P_
+    )
+    # Stack the per-level samples to (N_*M_, D_, Lq_, L_, P_), merge levels and
+    # points, weight them, and sum over all L_*P_ samples.
+    output = (
+        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+        .sum(-1)
+        .view(N_, M_ * D_, Lq_)
+    )
+    return output.transpose(1, 2).contiguous()

perception_models/apps/detection/DETA_pe/models/ops/make.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/usr/bin/env bash
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+python setup.py build install

perception_models/apps/detection/DETA_pe/models/ops/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from .ms_deform_attn import MSDeformAttn

perception_models/apps/detection/DETA_pe/models/ops/modules/ms_deform_attn.py ADDED Viewed

	@@ -0,0 +1,161 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import, division, print_function
+import math
+import warnings
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn.init import constant_, xavier_uniform_
+from ..functions import ms_deform_attn_core_pytorch, MSDeformAttnFunction
+def _is_power_of_2(n):
+    """Return True if the non-negative int `n` is a power of two (0 is not).
+
+    Raises:
+        ValueError: if `n` is not an int or is negative.
+    """
+    if (not isinstance(n, int)) or (n < 0):
+        raise ValueError(
+            "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))
+        )
+    # n & (n - 1) clears the lowest set bit, so it is 0 only when at most one bit is set.
+    return (n & (n - 1) == 0) and n != 0
+class MSDeformAttn(nn.Module):
+    """Multi-scale deformable attention layer.
+
+    Each query predicts, per head and per feature level, `n_points` sampling
+    offsets and their attention weights; the projected values are sampled and
+    combined by MSDeformAttnFunction, then passed through an output projection.
+    """
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        :raises ValueError  if d_model is not divisible by n_heads
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError(
+                "d_model must be divisible by n_heads, but got {} and {}".format(
+                    d_model, n_heads
+                )
+            )
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn(
+                "You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                "which is more efficient in our CUDA implementation."
+            )
+        # Passed unchanged to MSDeformAttnFunction.apply in forward.
+        self.im2col_step = 64
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+        # One 2-D offset and one scalar weight per (head, level, point).
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """Initialise all projections.
+
+        The sampling-offset weights are zeroed, so the initial offsets come from
+        the bias alone: head k points along angle 2*pi*k/n_heads and the i-th
+        point sits (i + 1) steps along that direction. Attention-weight
+        parameters are zeroed; value/output projections use Xavier-uniform
+        weights and zero biases.
+        """
+        constant_(self.sampling_offsets.weight.data, 0.0)
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (
+            2.0 * math.pi / self.n_heads
+        )
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        # Normalise each direction by its largest absolute component, then
+        # replicate it for every level and point.
+        grid_init = (
+            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+            .view(self.n_heads, 1, 1, 2)
+            .repeat(1, self.n_levels, self.n_points, 1)
+        )
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.0)
+        constant_(self.attention_weights.bias.data, 0.0)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.0)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.0)
+    def forward(
+        self,
+        query,
+        reference_points,
+        input_flatten,
+        input_spatial_shapes,
+        input_level_start_index,
+        input_padding_mask=None,
+    ):
+        r"""
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+        :return output                     (N, Length_{query}, C)
+        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        # The flattened input must contain exactly the pixels of all levels.
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            # Zero the projected values at padded positions.
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(
+            N, Len_q, self.n_heads, self.n_levels, self.n_points, 2
+        )
+        # Softmax is taken jointly over all levels and points of a head, then the
+        # level/point axes are separated again.
+        attention_weights = self.attention_weights(query).view(
+            N, Len_q, self.n_heads, self.n_levels * self.n_points
+        )
+        attention_weights = F.softmax(attention_weights, -1).view(
+            N, Len_q, self.n_heads, self.n_levels, self.n_points
+        )
+        # N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            # Point references: offsets are divided by each level's (W_l, H_l).
+            offset_normalizer = torch.stack(
+                [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1
+            )
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif reference_points.shape[-1] == 4:
+            # Box references: offsets are divided by n_points and scaled by half of
+            # the last two reference components (the box size).
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets
+                / self.n_points
+                * reference_points[:, :, None, :, None, 2:]
+                * 0.5
+            )
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
+                    reference_points.shape[-1]
+                )
+            )
+        # Always goes through the compiled extension; the imported
+        # ms_deform_attn_core_pytorch fallback is not used by this class.
+        output = MSDeformAttnFunction.apply(
+            value,
+            input_spatial_shapes,
+            input_level_start_index,
+            sampling_locations,
+            attention_weights,
+            self.im2col_step,
+        )
+        output = self.output_proj(output)
+        return output

perception_models/apps/detection/DETA_pe/models/ops/setup.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+import os
+import glob
+import torch
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+from setuptools import find_packages
+from setuptools import setup
+requirements = ["torch", "torchvision"]
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
+    sources = main_file + source_cpu
+    extension = CppExtension
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+    if torch.cuda.is_available() and CUDA_HOME is not None:
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+    else:
+        raise NotImplementedError('Cuda is not availabel')
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+    include_dirs = [extensions_dir]
+    ext_modules = [
+        extension(
+            "MultiScaleDeformableAttention",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+    return ext_modules
+# Package definition: get_extensions() runs at import time (and raises when CUDA
+# is unavailable); the extension is compiled through torch's BuildExtension.
+setup(
+    name="MultiScaleDeformableAttention",
+    version="1.0",
+    author="Weijie Su",
+    url="https://github.com/fundamentalvision/Deformable-DETR",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)

perception_models/apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.cpp ADDED Viewed

	@@ -0,0 +1,41 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+// CPU forward entry point. There is no CPU implementation: every call fails
+// through AT_ERROR, whatever the arguments.
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+// CPU backward entry point. There is no CPU implementation: every call fails
+// through AT_ERROR, whatever the arguments.
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}

perception_models/apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.h ADDED Viewed

	@@ -0,0 +1,33 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+// Declaration of the CPU forward entry point; returns a single tensor.
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+// Declaration of the CPU backward entry point; takes the forward inputs plus
+// grad_output and returns a vector of tensors.
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);

perception_models/apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_attn_cuda.cu ADDED Viewed

	@@ -0,0 +1,153 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+/*
+ * CUDA forward pass of multi-scale deformable attention.
+ *
+ * All five tensors must be contiguous CUDA tensors. Dimensions are read as:
+ *   value:        (batch, spatial_size, num_heads, channels)
+ *   sampling_loc: dim 1 = num_query, dim 4 = num_point
+ *   spatial_shapes / level_start_index: int64, one row/entry per level
+ * The batch is processed in chunks of min(batch, im2col_step) samples, and
+ * batch must be divisible by that chunk size.
+ *
+ * Returns a tensor of shape (batch, num_query, num_heads * channels).
+ */
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda().
+    AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    const int im2col_step_ = std::min(batch, im2col_step);
+    // The message arguments are concatenated, not printf-formatted, so the
+    // values are passed as separate pieces.
+    AT_ASSERTM(batch % im2col_step_ == 0,
+               "batch (", batch, ") must be divisible by im2col_step (", im2col_step_, ")");
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+    const int batch_n = im2col_step_;
+    // View the output as (num_chunks, chunk, ...) so each chunk can be selected below.
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    // Element counts of one sample, used to offset the raw pointers per chunk.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        // scalar_type() / data_ptr<T>() replace the deprecated type() / data<T>().
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data_ptr<int64_t>(),
+                level_start_index.data_ptr<int64_t>(),
+                sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data_ptr<scalar_t>());
+        }));
+    }
+    // Merge the head and channel axes of the result.
+    output = output.view({batch, num_query, num_heads*channels});
+    return output;
+}
+/*
+ * CUDA backward pass of multi-scale deformable attention.
+ *
+ * Takes the forward inputs plus grad_output; all six tensors must be
+ * contiguous CUDA tensors. grad_output is viewed as
+ * (batch / chunk, chunk, num_query, num_heads, channels), where
+ * chunk = min(batch, im2col_step) and batch must be divisible by chunk.
+ *
+ * Returns {grad_value, grad_sampling_loc, grad_attn_weight}, each allocated
+ * zero-filled with the shape of value, sampling_loc and attn_weight
+ * respectively and then filled by ms_deformable_col2im_cuda.
+ */
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+    // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda().
+    AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    const int im2col_step_ = std::min(batch, im2col_step);
+    // The message arguments are concatenated, not printf-formatted, so the
+    // values are passed as separate pieces.
+    AT_ASSERTM(batch % im2col_step_ == 0,
+               "batch (", batch, ") must be divisible by im2col_step (", im2col_step_, ")");
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+    const int batch_n = im2col_step_;
+    // Element counts of one sample, used to offset the raw pointers per chunk.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        // scalar_type() / data_ptr<T>() replace the deprecated type() / data<T>().
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data_ptr<scalar_t>(),
+                                    value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data_ptr<int64_t>(),
+                                    level_start_index.data_ptr<int64_t>(),
+                                    sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data_ptr<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+        }));
+    }
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}

perception_models/apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_attn_cuda.h ADDED Viewed

	@@ -0,0 +1,30 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+// Declaration of the CUDA forward entry point; returns a single tensor.
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+// Declaration of the CUDA backward entry point; takes the forward inputs plus
+// grad_output and returns a vector of tensors.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);

perception_models/apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_im2col_cuda.cuh ADDED Viewed

	@@ -0,0 +1,1327 @@

+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+// Grid-stride loop: each thread starts at its global index and advances by the
+// total number of threads in the grid until it reaches n.
+#define CUDA_KERNEL_LOOP(i, n)                          \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)
+// Presumably the threads-per-block count used at kernel launch; its use is not
+// visible in this part of the file - TODO confirm.
+const int CUDA_NUM_THREADS = 1024;
+// Number of blocks of num_threads threads needed to cover N items
+// (ceiling of N / num_threads).
+inline int GET_BLOCKS(const int N, const int num_threads)
+{
+  return (N + num_threads - 1) / num_threads;
+}
+// Bilinearly interpolates bottom_data at the fractional position (h, w) for
+// head m and channel c, and returns the interpolated value.
+// bottom_data is indexed as ((row * width + col) * nheads + m) * channels + c.
+// Neighbours outside [0, height - 1] x [0, width - 1] contribute 0.
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  // Integer corners surrounding (h, w).
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  // Fractional distances to the low corner (lh, lw) and their complements (hh, hw).
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  // Element strides for one step along the width and the height.
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+  // v1..v4: the four neighbours (low/low, low/high, high/low, high/high in h/w).
+  // Each is read only when its bounds check passes, otherwise it stays 0.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+  }
+  // Bilinear weights: each corner is weighted by the distances to the opposite corner.
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+// Backward step for one bilinear sample of one (head m, channel c) pair.
+//
+// bottom_data is read at element offset
+//   (row * width + col) * nheads * channels + m * channels + c
+// for the four integer corners around the fractional location (h, w). A floor-side
+// corner is used only when its index is >= 0, a ceil-side corner only when its index
+// is <= height - 1 / width - 1; skipped corners count as zero. (The opposite bounds
+// are not checked here.)
+//
+// Effects:
+//   * grad_value: every used corner receives, via atomicAdd, its bilinear weight
+//     times top_grad * attn_weight.
+//   * *grad_attn_weight: overwritten with top_grad * (interpolated value).
+//   * grad_sampling_loc[0] / [1]: overwritten with the gradient along w (scaled by
+//     width) and along h (scaled by height), each times top_grad * attn_weight.
+// The last three writes are plain stores, not atomics.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value,
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  // Integer corners around (h, w); "top"/"left" are the floor side.
+  const int row_top = floor(h);
+  const int col_left = floor(w);
+  const int row_bot = row_top + 1;
+  const int col_right = col_left + 1;
+  // Fractional offsets from the floor corner and their complements.
+  const scalar_t frac_h = h - row_top;
+  const scalar_t frac_w = w - col_left;
+  const scalar_t rem_h = 1 - frac_h, rem_w = 1 - frac_w;
+  // Bilinear weights of the four corners.
+  const scalar_t wt_tl = rem_h * rem_w, wt_tr = rem_h * frac_w, wt_bl = frac_h * rem_w, wt_br = frac_h * frac_w;
+  // Element offsets: one column step spans all heads and channels.
+  const int col_step = nheads * channels;
+  const int row_step = width * col_step;
+  const int row_top_off = row_top * row_step;
+  const int row_bot_off = row_top_off + row_step;
+  const int col_left_off = col_left * col_step;
+  const int col_right_off = col_left_off + col_step;
+  const int head_chan_off = m * channels + c;
+  const scalar_t scaled_grad = top_grad * attn_weight;
+  // Which sides of the 2x2 neighbourhood may be dereferenced.
+  const bool top_ok = row_top >= 0;
+  const bool left_ok = col_left >= 0;
+  const bool bot_ok = row_bot <= height - 1;
+  const bool right_ok = col_right <= width - 1;
+  // Partial derivatives of the interpolated value w.r.t. h and w.
+  scalar_t d_h = 0, d_w = 0;
+  scalar_t val_tl = 0, val_tr = 0, val_bl = 0, val_br = 0;
+  if (top_ok && left_ok)
+  {
+    const int off = row_top_off + col_left_off + head_chan_off;
+    val_tl = bottom_data[off];
+    d_h -= rem_w * val_tl;
+    d_w -= rem_h * val_tl;
+    atomicAdd(grad_value + off, wt_tl * scaled_grad);
+  }
+  if (top_ok && right_ok)
+  {
+    const int off = row_top_off + col_right_off + head_chan_off;
+    val_tr = bottom_data[off];
+    d_h -= frac_w * val_tr;
+    d_w += rem_h * val_tr;
+    atomicAdd(grad_value + off, wt_tr * scaled_grad);
+  }
+  if (bot_ok && left_ok)
+  {
+    const int off = row_bot_off + col_left_off + head_chan_off;
+    val_bl = bottom_data[off];
+    d_h += rem_w * val_bl;
+    d_w -= frac_h * val_bl;
+    atomicAdd(grad_value + off, wt_bl * scaled_grad);
+  }
+  if (bot_ok && right_ok)
+  {
+    const int off = row_bot_off + col_right_off + head_chan_off;
+    val_br = bottom_data[off];
+    d_h += frac_w * val_br;
+    d_w += frac_h * val_br;
+    atomicAdd(grad_value + off, wt_br * scaled_grad);
+  }
+  const scalar_t val = (wt_tl * val_tl + wt_tr * val_tr + wt_bl * val_bl + wt_br * val_br);
+  grad_attn_weight[0] = top_grad * val;
+  grad_sampling_loc[0] = width * d_w * scaled_grad;
+  grad_sampling_loc[1] = height * d_h * scaled_grad;
+}
+// Backward step for one bilinear sample of one (head m, channel c) pair, with every
+// output accumulated through atomicAdd.
+//
+// bottom_data is read at element offset
+//   (row * width + col) * nheads * channels + m * channels + c
+// for the four integer corners around the fractional location (h, w). A floor-side
+// corner is used only when its index is >= 0, a ceil-side corner only when its index
+// is <= height - 1 / width - 1; skipped corners count as zero. (The opposite bounds
+// are not checked here.)
+//
+// Effects (all via atomicAdd):
+//   * grad_value: every used corner receives its bilinear weight times
+//     top_grad * attn_weight.
+//   * *grad_attn_weight: receives top_grad * (interpolated value).
+//   * grad_sampling_loc[0] / [1]: receive the gradient along w (scaled by width)
+//     and along h (scaled by height), each times top_grad * attn_weight.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value,
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  // Integer corners around (h, w); "top"/"left" are the floor side.
+  const int row_top = floor(h);
+  const int col_left = floor(w);
+  const int row_bot = row_top + 1;
+  const int col_right = col_left + 1;
+  // Fractional offsets from the floor corner and their complements.
+  const scalar_t frac_h = h - row_top;
+  const scalar_t frac_w = w - col_left;
+  const scalar_t rem_h = 1 - frac_h, rem_w = 1 - frac_w;
+  // Bilinear weights of the four corners.
+  const scalar_t wt_tl = rem_h * rem_w, wt_tr = rem_h * frac_w, wt_bl = frac_h * rem_w, wt_br = frac_h * frac_w;
+  // Element offsets: one column step spans all heads and channels.
+  const int col_step = nheads * channels;
+  const int row_step = width * col_step;
+  const int row_top_off = row_top * row_step;
+  const int row_bot_off = row_top_off + row_step;
+  const int col_left_off = col_left * col_step;
+  const int col_right_off = col_left_off + col_step;
+  const int head_chan_off = m * channels + c;
+  const scalar_t scaled_grad = top_grad * attn_weight;
+  // Which sides of the 2x2 neighbourhood may be dereferenced.
+  const bool top_ok = row_top >= 0;
+  const bool left_ok = col_left >= 0;
+  const bool bot_ok = row_bot <= height - 1;
+  const bool right_ok = col_right <= width - 1;
+  // Partial derivatives of the interpolated value w.r.t. h and w.
+  scalar_t d_h = 0, d_w = 0;
+  scalar_t val_tl = 0, val_tr = 0, val_bl = 0, val_br = 0;
+  if (top_ok && left_ok)
+  {
+    const int off = row_top_off + col_left_off + head_chan_off;
+    val_tl = bottom_data[off];
+    d_h -= rem_w * val_tl;
+    d_w -= rem_h * val_tl;
+    atomicAdd(grad_value + off, wt_tl * scaled_grad);
+  }
+  if (top_ok && right_ok)
+  {
+    const int off = row_top_off + col_right_off + head_chan_off;
+    val_tr = bottom_data[off];
+    d_h -= frac_w * val_tr;
+    d_w += rem_h * val_tr;
+    atomicAdd(grad_value + off, wt_tr * scaled_grad);
+  }
+  if (bot_ok && left_ok)
+  {
+    const int off = row_bot_off + col_left_off + head_chan_off;
+    val_bl = bottom_data[off];
+    d_h += rem_w * val_bl;
+    d_w -= frac_h * val_bl;
+    atomicAdd(grad_value + off, wt_bl * scaled_grad);
+  }
+  if (bot_ok && right_ok)
+  {
+    const int off = row_bot_off + col_right_off + head_chan_off;
+    val_br = bottom_data[off];
+    d_h += frac_w * val_br;
+    d_w += frac_h * val_br;
+    atomicAdd(grad_value + off, wt_br * scaled_grad);
+  }
+  const scalar_t val = (wt_tl * val_tl + wt_tr * val_tr + wt_bl * val_bl + wt_br * val_br);
+  atomicAdd(grad_attn_weight, top_grad * val);
+  atomicAdd(grad_sampling_loc, width * d_w * scaled_grad);
+  atomicAdd(grad_sampling_loc + 1, height * d_h * scaled_grad);
+}
+// Forward kernel. For each flat output index handled by CUDA_KERNEL_LOOP it sums,
+// over every level and every sampling point, the bilinearly interpolated value
+// (ms_deform_attn_im2col_bilinear) times its attention weight, and stores the sum
+// in data_col[index]. Samples whose pixel location lies outside
+// (-1, spatial_h) x (-1, spatial_w) are skipped.
+//
+// Per level l, data_spatial_shapes[2 * l] / [2 * l + 1] are read as height / width
+// and data_level_start_index[l] as the level's first row in data_value.
+// batch_size is accepted but not used by the kernel body.
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    const int flat = index;
+    const int c_col = flat % channels;
+    const int sampling_index = flat / channels;
+    const int m_col = sampling_index % num_heads;
+    const int b_col = (sampling_index / num_heads) / num_query;
+    const int qid_stride = num_heads * channels;
+    const int batch_value_offset = b_col * spatial_size * qid_stride;
+    // One weight and one (w, h) pair per (level, point), laid out contiguously.
+    int weight_idx = sampling_index * num_levels * num_point;
+    int loc_idx = weight_idx << 1;
+    scalar_t acc = 0;
+    for (int l_col = 0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int shape_idx = l_col << 1;
+      const int spatial_h = data_spatial_shapes[shape_idx];
+      const int spatial_w = data_spatial_shapes[shape_idx + 1];
+      const scalar_t *data_value_ptr = data_value + (batch_value_offset + level_start_id * qid_stride);
+      for (int p_col = 0; p_col < num_point; ++p_col, ++weight_idx, loc_idx += 2)
+      {
+        const scalar_t loc_w = data_sampling_loc[loc_idx];
+        const scalar_t loc_h = data_sampling_loc[loc_idx + 1];
+        const scalar_t weight = data_attn_weight[weight_idx];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          acc += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+      }
+    }
+    data_col[index] = acc;
+  }
+}
+// Backward kernel: static shared-memory caches sized by the template parameter
+// blockSize, reduced serially by thread 0.
+//
+// Each thread decodes `index` into batch / head / channel plus
+// sampling_index = index / channels, then walks every (level, point) sample:
+//   1. it zeroes its own slots in the two shared caches;
+//   2. if the pixel location is inside (-1, spatial_h) x (-1, spatial_w) it calls
+//      ms_deform_attn_col2im_bilinear, which adds into grad_value and fills the
+//      thread's cache slots;
+//   3. after a barrier, thread 0 sums all blockSize slots and stores the totals to
+//      the grad_sampling_loc / grad_attn_weight entries of this sample.
+//
+// NOTE(review): step 3 overwrites the outputs with a block-wide sum, so it presumably
+// relies on all threads of a block sharing one sampling_index and on
+// blockDim.x == blockSize -- confirm against the launch code.
+// NOTE(review): __syncthreads() sits inside CUDA_KERNEL_LOOP (macro not visible in
+// this chunk); every thread of a block has to run the same number of iterations for
+// the barriers to be reached uniformly.
+//
+// Output positions are tracked with per-iteration local cursors instead of advancing
+// the kernel arguments, so a thread that executes the loop body more than once starts
+// every iteration from the correct base address. batch_size is accepted but unused.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    const unsigned int tid = threadIdx.x;
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Per-iteration output cursors; the kernel arguments stay untouched.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero.
+        *(cache_grad_sampling_loc+(tid << 1)) = 0;
+        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+tid)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+        // All slots must be written before thread 0 reads them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          // `slot` (not `tid`) so the outer thread id is not shadowed.
+          for (unsigned int slot = 1; slot < blockSize; ++slot)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[slot];
+            sid += 2;
+          }
+          *grad_loc_out = _grad_w;
+          *(grad_loc_out + 1) = _grad_h;
+          *grad_weight_out = _grad_a;
+        }
+        // Keep the caches intact until thread 0 has finished reading.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel: static shared-memory caches sized by the template parameter
+// blockSize, reduced with a halving (tree) reduction.
+//
+// Each thread decodes `index` into batch / head / channel plus
+// sampling_index = index / channels, then walks every (level, point) sample:
+//   1. it zeroes its own slots in the two shared caches;
+//   2. if the pixel location is inside (-1, spatial_h) x (-1, spatial_w) it calls
+//      ms_deform_attn_col2im_bilinear, which adds into grad_value and fills the
+//      thread's cache slots;
+//   3. for s = blockSize/2, blockSize/4, ..., 1 the threads with tid < s add slot
+//      tid + s into slot tid, with a barrier after every step;
+//   4. thread 0 stores slot 0 to the grad_sampling_loc / grad_attn_weight entries of
+//      this sample.
+// The halving in step 3 folds in every slot only when blockSize is a power of two.
+//
+// NOTE(review): step 4 overwrites the outputs with a block-wide sum, so it presumably
+// relies on all threads of a block sharing one sampling_index and on
+// blockDim.x == blockSize -- confirm against the launch code.
+// NOTE(review): __syncthreads() sits inside CUDA_KERNEL_LOOP (macro not visible in
+// this chunk); every thread of a block has to run the same number of iterations for
+// the barriers to be reached uniformly.
+//
+// Output positions are tracked with per-iteration local cursors instead of advancing
+// the kernel arguments, so a thread that executes the loop body more than once starts
+// every iteration from the correct base address. batch_size is accepted but unused.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    const unsigned int tid = threadIdx.x;
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Per-iteration output cursors; the kernel arguments stay untouched.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero.
+        *(cache_grad_sampling_loc+(tid << 1)) = 0;
+        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+tid)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+        // All slots must be written before the reduction reads them.
+        __syncthreads();
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          *grad_loc_out = cache_grad_sampling_loc[0];
+          *(grad_loc_out + 1) = cache_grad_sampling_loc[1];
+          *grad_weight_out = cache_grad_attn_weight[0];
+        }
+        // Keep slot 0 intact until thread 0 has stored it.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel: dynamically sized shared-memory caches, reduced serially by
+// thread 0.
+//
+// The extern shared array is viewed as scalar_t: the first 2 * blockDim.x elements
+// cache the per-thread sampling-location gradients, the next blockDim.x elements
+// cache the per-thread attention-weight gradients, so the launch has to provide at
+// least 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
+//
+// Each thread decodes `index` into batch / head / channel plus
+// sampling_index = index / channels, then walks every (level, point) sample:
+//   1. it zeroes its own cache slots;
+//   2. if the pixel location is inside (-1, spatial_h) x (-1, spatial_w) it calls
+//      ms_deform_attn_col2im_bilinear, which adds into grad_value and fills the
+//      thread's cache slots;
+//   3. after a barrier, thread 0 sums all blockDim.x slots and stores the totals to
+//      the grad_sampling_loc / grad_attn_weight entries of this sample.
+//
+// NOTE(review): step 3 overwrites the outputs with a block-wide sum, so it presumably
+// relies on all threads of a block sharing one sampling_index -- confirm against the
+// launch code.
+// NOTE(review): __syncthreads() sits inside CUDA_KERNEL_LOOP (macro not visible in
+// this chunk); every thread of a block has to run the same number of iterations for
+// the barriers to be reached uniformly.
+//
+// Output positions are tracked with per-iteration local cursors instead of advancing
+// the kernel arguments, so a thread that executes the loop body more than once starts
+// every iteration from the correct base address. batch_size is accepted but unused.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    const unsigned int tid = threadIdx.x;
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Per-iteration output cursors; the kernel arguments stay untouched.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero.
+        *(cache_grad_sampling_loc+(tid << 1)) = 0;
+        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+tid)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+        // All slots must be written before thread 0 reads them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          // `slot` (not `tid`) so the outer thread id is not shadowed.
+          for (unsigned int slot = 1; slot < blockDim.x; ++slot)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[slot];
+            sid += 2;
+          }
+          *grad_loc_out = _grad_w;
+          *(grad_loc_out + 1) = _grad_h;
+          *grad_weight_out = _grad_a;
+        }
+        // Keep the caches intact until thread 0 has finished reading.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel: dynamically sized shared-memory caches, reduced with a halving
+// (tree) reduction that also handles odd slot counts.
+//
+// The extern shared array is viewed as scalar_t: the first 2 * blockDim.x elements
+// cache the per-thread sampling-location gradients, the next blockDim.x elements
+// cache the per-thread attention-weight gradients, so the launch has to provide at
+// least 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
+//
+// Each thread decodes `index` into batch / head / channel plus
+// sampling_index = index / channels, then walks every (level, point) sample:
+//   1. it zeroes its own cache slots;
+//   2. if the pixel location is inside (-1, spatial_h) x (-1, spatial_w) it calls
+//      ms_deform_attn_col2im_bilinear, which adds into grad_value and fills the
+//      thread's cache slots;
+//   3. the slots are folded pairwise (s halves each step, spre is the slot count
+//      before the step); when spre is odd, thread 0 also adds the leftover slot
+//      spre - 1;
+//   4. thread 0 stores slot 0 to the grad_sampling_loc / grad_attn_weight entries of
+//      this sample.
+//
+// NOTE(review): step 4 overwrites the outputs with a block-wide sum, so it presumably
+// relies on all threads of a block sharing one sampling_index -- confirm against the
+// launch code.
+// NOTE(review): __syncthreads() sits inside CUDA_KERNEL_LOOP (macro not visible in
+// this chunk); every thread of a block has to run the same number of iterations for
+// the barriers to be reached uniformly.
+//
+// Output positions are tracked with per-iteration local cursors instead of advancing
+// the kernel arguments, so a thread that executes the loop body more than once starts
+// every iteration from the correct base address. batch_size is accepted but unused.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    const unsigned int tid = threadIdx.x;
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Per-iteration output cursors; the kernel arguments stay untouched.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero.
+        *(cache_grad_sampling_loc+(tid << 1)) = 0;
+        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+tid)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+        // All slots must be written before the reduction reads them.
+        __syncthreads();
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            // Odd spre: slot spre - 1 has no partner, so fold it in here.
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          *grad_loc_out = cache_grad_sampling_loc[0];
+          *(grad_loc_out + 1) = cache_grad_sampling_loc[1];
+          *grad_weight_out = cache_grad_attn_weight[0];
+        }
+        // Keep slot 0 intact until thread 0 has stored it.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel: dynamically sized shared-memory caches, halving (tree) reduction
+// per block, block totals accumulated into the outputs with atomicAdd.
+//
+// The extern shared array is viewed as scalar_t: the first 2 * blockDim.x elements
+// cache the per-thread sampling-location gradients, the next blockDim.x elements
+// cache the per-thread attention-weight gradients, so the launch has to provide at
+// least 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
+//
+// Each thread decodes `index` into batch / head / channel plus
+// sampling_index = index / channels, then walks every (level, point) sample:
+//   1. it zeroes its own cache slots;
+//   2. if the pixel location is inside (-1, spatial_h) x (-1, spatial_w) it calls
+//      ms_deform_attn_col2im_bilinear, which adds into grad_value and fills the
+//      thread's cache slots;
+//   3. the slots are folded pairwise (s halves each step, spre is the slot count
+//      before the step); when spre is odd, thread 0 also adds the leftover slot
+//      spre - 1;
+//   4. thread 0 atomically adds slot 0 to the grad_sampling_loc / grad_attn_weight
+//      entries of this sample.
+//
+// NOTE(review): the atomic accumulation in step 4 presumably exists because several
+// blocks contribute to the same sampling_index; the block-wide sum still assumes all
+// threads of one block share a sampling_index, and the outputs are presumably
+// zero-initialised by the caller -- confirm against the launch code.
+// NOTE(review): __syncthreads() sits inside CUDA_KERNEL_LOOP (macro not visible in
+// this chunk); every thread of a block has to run the same number of iterations for
+// the barriers to be reached uniformly.
+//
+// Output positions are tracked with per-iteration local cursors instead of advancing
+// the kernel arguments, so a thread that executes the loop body more than once starts
+// every iteration from the correct base address. batch_size is accepted but unused.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    const unsigned int tid = threadIdx.x;
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Per-iteration output cursors; the kernel arguments stay untouched.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero.
+        *(cache_grad_sampling_loc+(tid << 1)) = 0;
+        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+tid)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+        // All slots must be written before the reduction reads them.
+        __syncthreads();
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            // Odd spre: slot spre - 1 has no partner, so fold it in here.
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          atomicAdd(grad_loc_out, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_loc_out + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_weight_out, cache_grad_attn_weight[0]);
+        }
+        // Keep slot 0 intact until thread 0 has consumed it.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+/*
+ * Backward kernel for multi-scale deformable attention that passes the
+ * sampling-location and attention-weight gradient pointers straight to
+ * ms_deform_attn_col2im_bilinear_gm; this kernel does no shared-memory
+ * staging or block reduction of its own.
+ *
+ * Each work item `index` (bounded by `n` through CUDA_KERNEL_LOOP) is split
+ * into (b_col, q_col, m_col, c_col) = (batch, query, head, channel), with the
+ * channel varying fastest.
+ *
+ * n                      - number of work items.
+ * grad_col               - upstream gradient, read at `index`.
+ * data_value             - value tensor; per-level base offset is
+ *                          (b_col * spatial_size + level_start) * num_heads * channels.
+ * data_spatial_shapes    - per level two entries, read as (height, width).
+ * data_level_start_index - per level start offset into the spatial axis.
+ * data_sampling_loc      - two entries per (level, point), read as (w, h);
+ *                          scaled by the level size and shifted by -0.5.
+ * data_attn_weight       - one entry per (level, point).
+ * batch_size             - not read by this kernel.
+ * grad_value, grad_sampling_loc, grad_attn_weight - gradient outputs written
+ *                          by ms_deform_attn_col2im_bilinear_gm.
+ *                          NOTE(review): presumably accumulated atomically in
+ *                          global memory; confirm in the helper.
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decompose the flat index; channel is the fastest-varying component.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Per-iteration cursors derived from the base pointers. Advancing the
+    // kernel parameters themselves would carry the offset over into a later
+    // iteration of the loop if a thread ever processed more than one index.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the sampling location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Samples outside the open range (-1, size) contribute nothing.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            grad_sampling_loc_ptr, grad_attn_weight_ptr);
+        }
+        // One weight and one (w, h) pair per sampling point.
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += 1;
+        grad_sampling_loc_ptr += 2;
+      }
+    }
+  }
+}
+/*
+ * Host-side launcher for ms_deformable_im2col_gpu_kernel on `stream`.
+ *
+ * The kernel receives batch_size * num_query * num_heads * channels as its
+ * work-item count and is launched with CUDA_NUM_THREADS threads per block and
+ * no dynamic shared memory. A launch error is reported with printf only; it
+ * is not propagated to the caller.
+ */
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes,
+                              const int64_t* data_level_start_index,
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  const int total_outputs = batch_size * num_query * num_heads * channels;
+  const int threads_per_block = CUDA_NUM_THREADS;
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(total_outputs, threads_per_block), threads_per_block, 0, stream>>>(
+      total_outputs,
+      data_value,
+      data_spatial_shapes,
+      data_level_start_index,
+      data_sampling_loc,
+      data_attn_weight,
+      batch_size,
+      spatial_size,
+      num_heads,
+      channels,
+      num_levels,
+      num_query,
+      num_point,
+      data_col);
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess)
+  {
+    return;
+  }
+  printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
+}
+/*
+ * Host-side launcher for the backward (col2im) kernels on `stream`.
+ *
+ * The block size is min(channels, CUDA_NUM_THREADS) and the work-item count is
+ * batch_size * num_query * num_heads * channels. The kernel variant is picked
+ * from `channels`:
+ *   - channels > 1024, multiple of 1024 : shm_reduce_v2_multi_blocks (dynamic shared memory)
+ *   - channels > 1024, otherwise        : gm (no shared memory requested)
+ *   - 1, 2, 4, 8, 16, 32                : shm_blocksize_aware_reduce_v1<scalar_t, channels>
+ *   - 64, 128, 256, 512, 1024           : shm_blocksize_aware_reduce_v2<scalar_t, channels>
+ *   - any other value below 64          : shm_reduce_v1 (dynamic shared memory)
+ *   - any other value from 64 to 1024   : shm_reduce_v2 (dynamic shared memory)
+ * Variants using dynamic shared memory request num_threads * 3 * sizeof(scalar_t) bytes.
+ *
+ * Every variant is launched with the same grid, block, stream and argument
+ * list, so the launch is written once in `launch` and the branches below only
+ * choose the kernel and the shared-memory size.
+ *
+ * A launch error is reported with printf only; it is not propagated.
+ */
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  // Dynamic shared memory requested by the non-templated reduce kernels.
+  const size_t reduce_shm_bytes = num_threads*3*sizeof(scalar_t);
+  // Single launch site shared by all kernel variants (generic lambda, C++14).
+  auto launch = [&](auto kernel, const size_t shm_bytes)
+  {
+    kernel<<<GET_BLOCKS(num_kernels, num_threads), num_threads, shm_bytes, stream>>>(
+        num_kernels,
+        grad_col,
+        data_value,
+        data_spatial_shapes,
+        data_level_start_index,
+        data_sampling_loc,
+        data_attn_weight,
+        batch_size,
+        spatial_size,
+        num_heads,
+        channels,
+        num_levels,
+        num_query,
+        num_point,
+        grad_value,
+        grad_sampling_loc,
+        grad_attn_weight);
+  };
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      launch(ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>, reduce_shm_bytes);
+    }
+    else
+    {
+      launch(ms_deformable_col2im_gpu_kernel_gm<scalar_t>, 0);
+    }
+  }
+  else{
+    switch(channels)
+    {
+      case 1:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>, 0);
+        break;
+      case 2:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>, 0);
+        break;
+      case 4:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>, 0);
+        break;
+      case 8:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>, 0);
+        break;
+      case 16:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>, 0);
+        break;
+      case 32:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>, 0);
+        break;
+      case 64:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>, 0);
+        break;
+      case 128:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>, 0);
+        break;
+      case 256:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>, 0);
+        break;
+      case 512:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>, 0);
+        break;
+      case 1024:
+        launch(ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>, 0);
+        break;
+      default:
+        if (channels < 64)
+        {
+          launch(ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>, reduce_shm_bytes);
+        }
+        else
+        {
+          launch(ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>, reduce_shm_bytes);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+}

perception_models/apps/detection/DETA_pe/models/ops/src/ms_deform_attn.h ADDED Viewed

	@@ -0,0 +1,62 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include "cpu/ms_deform_attn_cpu.h"
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+// Forward entry point for multi-scale deformable attention.
+//
+// Dispatches on the device of `value`: CUDA tensors are forwarded to
+// ms_deform_attn_cuda_forward when the extension was built WITH_CUDA.
+// Raises an error when `value` is a CUDA tensor but the build lacks GPU
+// support, and for every non-CUDA tensor (there is no CPU path here).
+at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    // Query the tensor directly; going through the deprecated Tensor::type()
+    // accessor is unnecessary for a device check.
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+// Backward entry point for multi-scale deformable attention.
+//
+// Dispatches on the device of `value`: CUDA tensors are forwarded to
+// ms_deform_attn_cuda_backward when the extension was built WITH_CUDA, and
+// its vector of gradient tensors is returned unchanged.
+// Raises an error when `value` is a CUDA tensor but the build lacks GPU
+// support, and for every non-CUDA tensor (there is no CPU path here).
+std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // Query the tensor directly; going through the deprecated Tensor::type()
+    // accessor is unnecessary for a device check.
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}

perception_models/apps/detection/DETA_pe/models/ops/src/vision.cpp ADDED Viewed

	@@ -0,0 +1,16 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include "ms_deform_attn.h"
+// Python bindings: register ms_deform_attn_forward and ms_deform_attn_backward
+// on the extension module whose name is supplied by TORCH_EXTENSION_NAME.
+// The third argument of each m.def is the Python-side docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
+}

perception_models/apps/detection/DETA_pe/models/ops/test.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+# Problem sizes shared by every check below. `value` is built as (N, S, M, D)
+# and the sampling locations as (N, Lq, M, L, P, 2).
+# NOTE(review): presumably N=batch, M=heads, D=channels per head, Lq=queries,
+# L=levels, P=points per level -- confirm against MSDeformAttnFunction.
+N, M, D = 1, 2, 2
+Lq, L, P = 2, 2, 2
+# One (H, W) row per level; created on the GPU at import time, so importing
+# this module requires CUDA.
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
+# Start offset of each level: a leading zero followed by the running sum of
+# H*W over all levels except the last.
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
+# Total number of positions over all levels (sum of H*W).
+S = sum([(H*W).item() for H, W in shapes])
+# Fixed seed so the random inputs drawn by the checks are reproducible.
+torch.manual_seed(3)
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
+        check_gradient_numerical(channels, True, True, True)

perception_models/apps/detection/DETA_pe/models/pev1.py ADDED Viewed

	@@ -0,0 +1,686 @@

+import math
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import broadcast_tensors, einsum, nn
+from torch.nn.parameter import Parameter
+from torch.utils.checkpoint import checkpoint
+from .utils_d2 import (
+    add_decomposed_rel_pos,
+    PatchEmbed,
+    window_partition,
+    window_unpartition,
+)
+def get_abs_pos(abs_pos, has_cls_token, hw, tile=False):
+    h, w = hw
+    if has_cls_token:
+        abs_pos = abs_pos[:, 1:]
+    xy_num = abs_pos.shape[1]
+    size = int(math.sqrt(xy_num))
+    assert size * size == xy_num
+    if size != h or size != w:
+        if tile == True:
+            new_abs_pos = abs_pos.reshape(1, size, size, -1).tile(
+                [1, h // size + 1, w // size + 1, 1]
+            )[:, :h, :w, :]
+            return new_abs_pos
+        else:
+            new_abs_pos = F.interpolate(
+                abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
+                size=(h, w),
+                mode="bicubic",
+                align_corners=False,
+            )
+            return new_abs_pos.permute(0, 2, 3, 1)
+    else:
+        return abs_pos.reshape(1, h, w, -1)
+# broadcat, as tortoise-tts was using it
+def broadcat(tensors, dim=-1):
+    broadcasted_tensors = broadcast_tensors(*tensors)
+    return torch.cat(broadcasted_tensors, dim=dim)
+# rotary embedding helper functions
+def rotate_half(x):
+    x = rearrange(x, "... (d r) -> ... d r", r=2)
+    x1, x2 = x.unbind(dim=-1)
+    x = torch.stack((-x2, x1), dim=-1)
+    return rearrange(x, "... d r -> ... (d r)")
+class VisionRotaryEmbeddingFast(nn.Module):
+    def __init__(
+        self,
+        dim,
+        pt_seq_len=16,
+        ft_seq_len=None,
+        custom_freqs=None,
+        freqs_for="lang",
+        theta=10000,
+        max_freq=10,
+        num_freqs=1,
+    ):
+        super().__init__()
+        if custom_freqs:
+            freqs = custom_freqs
+        elif freqs_for == "lang":
+            freqs = 1.0 / (
+                theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
+            )
+        elif freqs_for == "pixel":
+            freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
+        elif freqs_for == "constant":
+            freqs = torch.ones(num_freqs).float()
+        else:
+            raise ValueError(f"unknown modality {freqs_for}")
+        if ft_seq_len is None:
+            ft_seq_len = pt_seq_len
+        t = (
+            torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len + 1
+        )  # + 1 is hacking vev0 pt code
+        freqs = torch.einsum("..., f -> ... f", t, freqs)
+        freqs = repeat(freqs, "... n -> ... (n r)", r=2)
+        # freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)
+        freqs = broadcat(
+            (freqs[None, :, :], freqs[:, None, :]), dim=-1
+        )  # follow vev0 pt code
+        freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
+        freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
+        self.register_buffer("freqs_cos", freqs_cos)
+        self.register_buffer("freqs_sin", freqs_sin)
+        print("======== shape of rope freq", self.freqs_cos.shape, "========")
+    def forward(self, tt):
+        return tt * self.freqs_cos + rotate_half(tt) * self.freqs_sin
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        # ret = super().forward(x.type(torch.float32))
+        ret = F.layer_norm(
+            x.type(torch.float32),
+            self.normalized_shape,
+            self.weight.type(torch.float32),
+            self.bias.type(torch.float32),
+            self.eps,
+        )
+        return ret.type(orig_type)
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob,3):0.3f}"
+class Attention(nn.Module):
+    r"""
+    Implements attention based on Rope
+    """
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+        add_bias_kv: bool = False,
+        kdim: Optional[bool] = None,
+        vdim: Optional[bool] = None,
+        rope=None,
+    ):
+        super(Attention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        if self._qkv_same_embed_dim is False:
+            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
+            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
+            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
+        else:
+            self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
+        if bias:
+            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
+        else:
+            self.register_parameter("in_proj_bias", None)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        if add_bias_kv:
+            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
+            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
+        else:
+            self.bias_k = self.bias_v = None
+        self.rope = rope
+        self.scale = self.head_dim ** (-0.5)
+    def forward(self, query, attn_mask: Optional[torch.Tensor] = None):
+        batch, seq, embed_dim = query.shape
+        proj = torch._C._nn.linear(query, self.in_proj_weight, self.in_proj_bias)
+        # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
+        proj = (
+            proj.unflatten(-1, (3, embed_dim))
+            .unsqueeze(0)
+            .transpose(0, -2)
+            .squeeze(-2)
+            .contiguous()
+        )
+        q_, k_, v_ = proj[0], proj[1], proj[2]
+        # Use "q_" so that we don't accidentally quit in pdb :)
+        q_ = rearrange(q_, "b s (h d) -> b h s d", h=self.num_heads)
+        k_ = rearrange(k_, "b s (h d) -> b h s d", h=self.num_heads)
+        v_ = rearrange(v_, "b s (h d) -> b h s d", h=self.num_heads)
+        ## rope
+        q_ = self.rope(q_).type_as(v_)
+        k_ = self.rope(k_).type_as(v_)
+        attn = (q_ * self.scale) @ k_.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        x_ = attn @ v_
+        x_ = rearrange(x_, "b h s d -> b s (h d)")
+        return torch._C._nn.linear(x_, self.out_proj.weight, self.out_proj.bias)
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: float = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+class ResidualAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        n_head: int,
+        mlp_ratio=4.0,
+        act_layer=nn.GELU,
+        norm_layer=LayerNorm,
+        drop_path=0.0,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        window_size=0,
+        rope=None,
+        input_size=None,
+        attn_mask=None,
+        init_values=0.0,
+    ):
+        super().__init__()
+        self.attn = Attention(embed_dim=d_model, num_heads=n_head, rope=rope)
+        self.ls_1 = (
+            LayerScale(d_model, init_values=init_values)
+            if init_values > 0.0
+            else nn.Identity()
+        )
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict(
+                [
+                    ("c_fc", nn.Linear(d_model, int(d_model * mlp_ratio))),
+                    ("gelu", act_layer()),
+                    ("c_proj", nn.Linear(int(d_model * mlp_ratio), d_model)),
+                ]
+            )
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+        self.ls_2 = (
+            LayerScale(d_model, init_values=init_values)
+            if init_values > 0.0
+            else nn.Identity()
+        )
+        self.window_size = window_size
+    def attention_nhwc(self, x: torch.Tensor):
+        self.attn_mask = (
+            self.attn_mask.to(dtype=x.dtype, device=x.device)
+            if self.attn_mask is not None
+            else None
+        )
+        B, H, W, _ = x.shape
+        x = x.reshape(B, H * W, -1)
+        x = self.attn(x, attn_mask=self.attn_mask)
+        x = x.reshape(B, H, W, -1)
+        return x
+    def forward(self, x: torch.Tensor):
+        shortcut = x
+        x = self.ln_1(x)
+        # Window partition
+        if self.window_size > 0:
+            H, W = x.shape[1], x.shape[2]
+            x, pad_hw = window_partition(x, self.window_size)
+        x = self.attention_nhwc(x)
+        # Reverse window partition
+        if self.window_size > 0:
+            x = window_unpartition(x, self.window_size, pad_hw, (H, W))
+        x = shortcut + self.drop_path(self.ls_1(x))
+        x = x + self.drop_path(self.ls_2(self.mlp(self.ln_2(x))))
+        return x
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        depth: int,
+        num_heads: int,
+        mlp_ratio=4.0,
+        act_layer=nn.GELU,
+        norm_layer=LayerNorm,
+        drop_path_rate=0.0,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        window_size=0,
+        window_block_indexes=(),
+        img_size=1024,
+        patch_size=16,
+        rope_win=None,
+        rope_glb=None,
+        use_act_checkpoint=False,
+        act_checkpoint_ratio=1.0,
+        attn_mask=None,
+        init_values=0.0,
+        return_layer=[-1],
+    ):
+        super().__init__()
+        self.use_act_checkpoint = use_act_checkpoint
+        self.act_checkpoint_ratio = act_checkpoint_ratio
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+        self.resblocks = nn.ModuleList()
+        for i in range(depth):
+            block = ResidualAttentionBlock(
+                embed_dim,
+                num_heads,
+                attn_mask=attn_mask,
+                drop_path=dpr[i],
+                mlp_ratio=mlp_ratio,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=window_size if i in window_block_indexes else 0,
+                rope=rope_win if i in window_block_indexes else rope_glb,
+                input_size=(img_size // patch_size, img_size // patch_size),
+                init_values=init_values,
+            )
+            self.resblocks.append(block)
+        self.return_layer = return_layer
+    def forward(self, x: torch.Tensor):
+        x_list = []
+        for idx, blk in enumerate(self.resblocks):
+            if (
+                self.use_act_checkpoint
+                and (idx / len(self.resblocks)) <= self.act_checkpoint_ratio
+            ):
+                x = checkpoint(blk, x)
+            else:
+                x = blk(x)
+            if idx in self.return_layer or idx == len(self.resblocks) - 1:
+                x_list.append(x)
+        return x, x_list
+class PEv1_simpleFPN(nn.Module):
+    def __init__(
+        self,
+        img_size=1024,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        use_abs_pos=True,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        rope=True,
+        pt_hw_seq_len=16,
+        intp_freq=True,
+        window_size=0,
+        window_block_indexes=(),
+        residual_block_indexes=(),
+        use_act_checkpoint=False,
+        act_checkpoint_ratio=1.0,
+        pretrain_img_size=336,
+        pretrain_use_cls_token=True,
+        out_feature="last_feat",
+        tile_posemb=False,
+        init_values=0.0,
+        tta_rope=False,
+        return_layer=[-1],
+    ):
+        super().__init__()
+        self.pretrain_use_cls_token = pretrain_use_cls_token
+        self.conv1 = nn.Conv2d(
+            in_channels=in_chans,
+            out_channels=embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False,
+        )
+        if use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            num_patches = (pretrain_img_size // patch_size) * (
+                pretrain_img_size // patch_size
+            )
+            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
+            self.positional_embedding = nn.Parameter(
+                torch.zeros(1, num_positions, embed_dim)
+            )
+            print("positional_embedding:", self.positional_embedding.shape)
+            print("positional_embedding:", self.positional_embedding.shape)
+            print("positional_embedding:", self.positional_embedding.shape)
+        else:
+            self.positional_embedding = None
+        self.tile_posemb = tile_posemb
+        self.ln_pre = LayerNorm(embed_dim)
+        half_head_dim = embed_dim // num_heads // 2
+        hw_seq_len = img_size // patch_size
+        self.rope_win = VisionRotaryEmbeddingFast(
+            dim=half_head_dim,
+            pt_seq_len=pt_hw_seq_len,
+            ft_seq_len=window_size if intp_freq else None,
+        )
+        self.rope_glb = VisionRotaryEmbeddingFast(
+            dim=half_head_dim,
+            pt_seq_len=pt_hw_seq_len,
+            ft_seq_len=hw_seq_len if intp_freq else None,
+        )
+        self.transformer = Transformer(
+            embed_dim=embed_dim,
+            depth=depth,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            drop_path_rate=drop_path_rate,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            window_size=window_size,
+            window_block_indexes=window_block_indexes,
+            rope_win=self.rope_win,
+            rope_glb=self.rope_glb,
+            img_size=img_size,
+            patch_size=patch_size,
+            use_act_checkpoint=use_act_checkpoint,
+            act_checkpoint_ratio=act_checkpoint_ratio,
+            init_values=init_values,
+            return_layer=return_layer,
+        )
+        self._out_feature_channels = {out_feature: embed_dim}
+        self._out_feature_strides = {out_feature: patch_size}
+        self._out_features = [out_feature]
+        if self.positional_embedding is not None:
+            nn.init.trunc_normal_(self.positional_embedding, std=0.02)
+        self.return_layer = return_layer
+        # In our method, we don't use backbone feature with stride 4
+        self.fpn1 = nn.Sequential(
+            nn.ConvTranspose2d(embed_dim, embed_dim // 2, kernel_size=2, stride=2),
+        )
+        self.fpn2 = nn.Identity()
+        self.fpn3 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.apply(self._init_weights)
+        strides = [patch_size // 2, patch_size, patch_size * 2]
+        self._out_features = ["p{}".format(int(math.log2(s))) for s in strides]
+        self._out_feature_strides = {
+            "p3": 8,
+            "p4": 16,
+            "p5": 32,
+        }
+        self._out_feature_channels = {
+            "p3": embed_dim // 2,
+            "p4": embed_dim,
+            "p5": embed_dim,
+        }
+        self._size_divisibility = strides[-1]
+        self._square_pad = img_size
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def forward(self, x):
+        x = self.conv1(x)
+        x = x.permute(0, 2, 3, 1)
+        if self.positional_embedding is not None:
+            x = x + get_abs_pos(
+                self.positional_embedding,
+                self.pretrain_use_cls_token,
+                (x.shape[1], x.shape[2]),
+                self.tile_posemb,
+            )
+        x = self.ln_pre(x)
+        x, x_list = self.transformer(x)
+        xp = x.permute(0, 3, 1, 2)  # (b, h, w, c) --> (b, c, h, w)
+        features = []
+        ops = [self.fpn1, self.fpn2, self.fpn3]
+        for i in range(len(ops)):
+            features.append(ops[i](xp))
+        rets = {"p{}".format(u + 3): v for (u, v) in enumerate(features)}
+        return rets
+def get_pev1_and_fpn_backbone(args):
+    if args.lsj_img_size_max > 0:
+        img_size = args.lsj_img_size_max
+    else:
+        img_size = args.lsj_img_size
+    use_act_checkpoint = args.backbone_use_act_checkpoint
+    act_checkpoint_ratio = args.backbone_act_checkpoint_ratio
+    init_values = args.backbone_init_values
+    tile_posemb = args.backbone_tile_posemb
+    tta_rope = args.backbone_tta_rope
+    multi_layer = args.backbone_multi_layer
+    backbone_dp = args.backbone_dp
+    if args.backbone_size == "G":
+        embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5
+        pretrain_img_size, patch_size, window_size = 224, 16, 14
+        window_block_indexes = (
+            list(range(0, 12))
+            + list(range(13, 24))
+            + list(range(25, 36))
+            + list(range(37, 49))
+        )
+        pretrain_use_cls_token = False
+        if multi_layer:
+            return_layer = [12, 24, 36, 49]
+        else:
+            return_layer = [-1]
+    elif args.backbone_size == "Gwin384":
+        embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5
+        pretrain_img_size, patch_size, window_size = 384, 16, 24
+        window_block_indexes = (
+            list(range(0, 12))
+            + list(range(13, 24))
+            + list(range(25, 36))
+            + list(range(37, 49))
+        )
+        pretrain_use_cls_token = False
+        if multi_layer:
+            return_layer = [12, 24, 36, 49]
+        else:
+            return_layer = [-1]
+    elif args.backbone_size == "Gwin512":
+        embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5
+        pretrain_img_size, patch_size, window_size = 512, 16, 32
+        window_block_indexes = (
+            list(range(0, 12))
+            + list(range(13, 24))
+            + list(range(25, 36))
+            + list(range(37, 49))
+        )
+        pretrain_use_cls_token = False
+        if multi_layer:
+            return_layer = [12, 24, 36, 49]
+        else:
+            return_layer = [-1]
+    else:
+        raise ValueError("Unsupported backbone size")
+    if backbone_dp >= 0:
+        dp = backbone_dp
+    assert (
+        depth == args.backbone_layers
+    ), f"backbone depth {depth} and layers {args.backbone_layers}(from config) must be the same"
+    model = PEv1_simpleFPN(
+        use_act_checkpoint=use_act_checkpoint,
+        act_checkpoint_ratio=act_checkpoint_ratio,
+        pretrain_img_size=pretrain_img_size,
+        pretrain_use_cls_token=pretrain_use_cls_token,
+        img_size=img_size,
+        patch_size=patch_size,
+        embed_dim=embed_dim,
+        depth=depth,
+        num_heads=num_heads,
+        drop_path_rate=dp,
+        window_size=window_size,
+        pt_hw_seq_len=16, # Maybe a bug ?
+        mlp_ratio=mlp_ratio,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        window_block_indexes=window_block_indexes,
+        residual_block_indexes=[],
+        use_rel_pos=True,
+        out_feature="last_feat",
+        tile_posemb=tile_posemb,
+        init_values=init_values,
+        tta_rope=tta_rope,
+        return_layer=return_layer,
+    )
+    pretrained_backbone_path = args.backbone_path
+    if pretrained_backbone_path:
+        state_dict = torch.load(pretrained_backbone_path, map_location="cpu")
+        load_info = model.load_state_dict(state_dict["model"], strict=False)
+        print("Missing keys", load_info.missing_keys)
+        print("Unexpected keys", load_info.unexpected_keys)
+    else:
+        print("Skip pretrained backbone loading")
+    return model

perception_models/apps/detection/DETA_pe/models/position_encoding.py ADDED Viewed

	@@ -0,0 +1,97 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+Various positional encodings for the transformer.
+"""
+import math
+import torch
+from torch import nn
+from util.misc import NestedTensor
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+class PositionEmbeddingLearned(nn.Module):
+    """
+    Absolute pos embedding, learned.
+    """
+    def __init__(self, num_pos_feats=256):
+        super().__init__()
+        self.row_embed = nn.Embedding(50, num_pos_feats)
+        self.col_embed = nn.Embedding(50, num_pos_feats)
+        self.reset_parameters()
+    def reset_parameters(self):
+        nn.init.uniform_(self.row_embed.weight)
+        nn.init.uniform_(self.col_embed.weight)
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        h, w = x.shape[-2:]
+        i = torch.arange(w, device=x.device)
+        j = torch.arange(h, device=x.device)
+        x_emb = self.col_embed(i)
+        y_emb = self.row_embed(j)
+        pos = torch.cat([
+            x_emb.unsqueeze(0).repeat(h, 1, 1),
+            y_emb.unsqueeze(1).repeat(1, w, 1),
+        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+        return pos
+def build_position_encoding(args):
+    N_steps = args.hidden_dim // 2
+    if args.position_embedding in ('v2', 'sine'):
+        # TODO find a better way of exposing other arguments
+        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
+    elif args.position_embedding in ('v3', 'learned'):
+        position_embedding = PositionEmbeddingLearned(N_steps)
+    else:
+        raise ValueError(f"not supported {args.position_embedding}")
+    return position_embedding

perception_models/apps/detection/DETA_pe/models/segmentation.py ADDED Viewed

	@@ -0,0 +1,369 @@

+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+"""
+This file provides the definition of the convolutional heads used to predict masks, as well as the losses
+"""
+import io
+from collections import defaultdict
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+import util.box_ops as box_ops
+from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list
+try:
+    from panopticapi.utils import id2rgb, rgb2id
+except ImportError:
+    pass
+class DETRsegm(nn.Module):
+    def __init__(self, detr, freeze_detr=False):
+        super().__init__()
+        self.detr = detr
+        if freeze_detr:
+            for p in self.parameters():
+                p.requires_grad_(False)
+        hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead
+        self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0)
+        self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim)
+    def forward(self, samples: NestedTensor):
+        if not isinstance(samples, NestedTensor):
+            samples = nested_tensor_from_tensor_list(samples)
+        features, pos = self.detr.backbone(samples)
+        bs = features[-1].tensors.shape[0]
+        src, mask = features[-1].decompose()
+        src_proj = self.detr.input_proj(src)
+        hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])
+        outputs_class = self.detr.class_embed(hs)
+        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
+        out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
+        if self.detr.aux_loss:
+            out["aux_outputs"] = [
+                {"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
+            ]
+        # FIXME h_boxes takes the last one computed, keep this in mind
+        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)
+        seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors])
+        outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
+        out["pred_masks"] = outputs_seg_masks
+        return out
+class MaskHeadSmallConv(nn.Module):
+    """
+    Simple convolutional head, using group norm.
+    Upsampling is done using a FPN approach
+    """
+    def __init__(self, dim, fpn_dims, context_dim):
+        super().__init__()
+        inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
+        self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1)
+        self.gn1 = torch.nn.GroupNorm(8, dim)
+        self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1)
+        self.gn2 = torch.nn.GroupNorm(8, inter_dims[1])
+        self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
+        self.gn3 = torch.nn.GroupNorm(8, inter_dims[2])
+        self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
+        self.gn4 = torch.nn.GroupNorm(8, inter_dims[3])
+        self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
+        self.gn5 = torch.nn.GroupNorm(8, inter_dims[4])
+        self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1)
+        self.dim = dim
+        self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
+        self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
+        self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_uniform_(m.weight, a=1)
+                nn.init.constant_(m.bias, 0)
+    def forward(self, x, bbox_mask, fpns):
+        def expand(tensor, length):
+            return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
+        x = torch.cat([expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
+        x = self.lay1(x)
+        x = self.gn1(x)
+        x = F.relu(x)
+        x = self.lay2(x)
+        x = self.gn2(x)
+        x = F.relu(x)
+        cur_fpn = self.adapter1(fpns[0])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0))
+        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay3(x)
+        x = self.gn3(x)
+        x = F.relu(x)
+        cur_fpn = self.adapter2(fpns[1])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0))
+        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay4(x)
+        x = self.gn4(x)
+        x = F.relu(x)
+        cur_fpn = self.adapter3(fpns[2])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0))
+        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay5(x)
+        x = self.gn5(x)
+        x = F.relu(x)
+        x = self.out_lay(x)
+        return x
+class MHAttentionMap(nn.Module):
+    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0, bias=True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+        self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+        self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+        nn.init.zeros_(self.k_linear.bias)
+        nn.init.zeros_(self.q_linear.bias)
+        nn.init.xavier_uniform_(self.k_linear.weight)
+        nn.init.xavier_uniform_(self.q_linear.weight)
+        self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
+    def forward(self, q, k, mask=None):
+        q = self.q_linear(q)
+        k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
+        qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
+        kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
+        weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
+        if mask is not None:
+            weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf"))
+        weights = F.softmax(weights.flatten(2), dim=-1).view_as(weights)
+        weights = self.dropout(weights)
+        return weights
+def dice_loss(inputs, targets, num_boxes):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_boxes
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = -1 (no weighting).
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+    return loss.mean(1).sum() / num_boxes
+class PostProcessSegm(nn.Module):
+    """Attach binary instance masks, resized to each image's original size, to detection results."""
+    def __init__(self, threshold=0.5):
+        """
+        Parameters:
+            threshold: sigmoid probabilities strictly above this value become foreground.
+        """
+        super().__init__()
+        self.threshold = threshold
+    @torch.no_grad()
+    def forward(self, results, outputs, orig_target_sizes, max_target_sizes):
+        """Fill results[i]["masks"] for every image in the batch and return results.
+        Parameters:
+            results: indexable collection of per-image dicts; each one receives a "masks" entry.
+            outputs: dict holding "pred_masks" (presumably (batch, queries, 1, H, W) -- dim 2 is squeezed; TODO confirm).
+            orig_target_sizes: per-image (height, width) tensor giving the final mask size.
+            max_target_sizes: per-image (height, width) tensor giving the valid region inside the upsampled
+                              batch; its per-column maximum is the upsampling size.
+        Returns:
+            The same results object, with uint8 masks added.
+        """
+        assert len(orig_target_sizes) == len(max_target_sizes)
+        batch_h, batch_w = max_target_sizes.max(0)[0].tolist()
+        mask_logits = F.interpolate(
+            outputs["pred_masks"].squeeze(2),
+            size=(batch_h, batch_w),
+            mode="bilinear",
+            align_corners=False,
+        )
+        binary_masks = (mask_logits.sigmoid() > self.threshold).cpu()
+        per_image = zip(binary_masks, max_target_sizes, orig_target_sizes)
+        for idx, (image_masks, valid_size, final_size) in enumerate(per_image):
+            valid_h, valid_w = valid_size[0], valid_size[1]
+            # Crop to the valid region first, then rescale to the original image size.
+            cropped = image_masks[:, :valid_h, :valid_w].unsqueeze(1)
+            results[idx]["masks"] = cropped
+            resized = F.interpolate(cropped.float(), size=tuple(final_size.tolist()), mode="nearest")
+            results[idx]["masks"] = resized.byte()
+        return results
+class PostProcessPanoptic(nn.Module):
+    """This class converts the output of the model to the final panoptic result, in the format expected by the
+    coco panoptic API """
+    def __init__(self, is_thing_map, threshold=0.85):
+        """
+        Parameters:
+           is_thing_map: This is a mapping whose keys are the class ids, and the values a boolean indicating
+                          whether the class is a thing (True) or a stuff (False) class
+           threshold: confidence threshold: segments whose confidence does not exceed this will be deleted
+        """
+        super().__init__()
+        self.threshold = threshold
+        self.is_thing_map = is_thing_map
+    def forward(self, outputs, processed_sizes, target_sizes=None):
+        """ This function computes the panoptic prediction from the model's predictions.
+        Parameters:
+            outputs: This is a dict coming directly from the model. The keys "pred_logits", "pred_masks" and
+                     "pred_boxes" are read here.
+            processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the
+                             model, ie the size after data augmentation but before batching.
+            target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size
+                          of each prediction. If left to None, it will default to the processed_sizes
+        Returns:
+            A list with one dict per image, holding "png_string" (the PNG-encoded segmentation image) and
+            "segments_info" (a list of dicts with the keys "id", "isthing", "category_id" and "area").
+            """
+        if target_sizes is None:
+            target_sizes = processed_sizes
+        assert len(processed_sizes) == len(target_sizes)
+        out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"]
+        assert len(out_logits) == len(raw_masks) == len(target_sizes)
+        preds = []
+        def to_tuple(tup):
+            # Sizes may be given as tuples/lists or as tensors; normalise to a plain tuple.
+            if isinstance(tup, (tuple, list)):
+                return tuple(tup)
+            return tuple(tup.cpu().tolist())
+        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
+            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
+        ):
+            # we filter empty queries and detection below threshold
+            # (the softmax/max is computed once and reused for both the filter and the kept values)
+            scores, labels = cur_logits.softmax(-1).max(-1)
+            keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold)
+            cur_scores = scores[keep]
+            cur_classes = labels[keep]
+            cur_masks = cur_masks[keep]
+            cur_masks = interpolate(cur_masks[None], to_tuple(size), mode="bilinear").squeeze(0)
+            cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep])
+            h, w = cur_masks.shape[-2:]
+            assert len(cur_boxes) == len(cur_classes)
+            # It may be that we have several predicted masks for the same stuff class.
+            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+            cur_masks = cur_masks.flatten(1)
+            stuff_equiv_classes = defaultdict(list)
+            for k, label in enumerate(cur_classes):
+                if not self.is_thing_map[label.item()]:
+                    stuff_equiv_classes[label.item()].append(k)
+            def get_ids_area(masks, scores, dedup=False):
+                # This helper function creates the final panoptic segmentation image
+                # It also returns the area of the masks that appears on the image
+                m_id = masks.transpose(0, 1).softmax(-1)
+                if m_id.shape[-1] == 0:
+                    # We didn't detect any mask :(
+                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
+                else:
+                    # Each pixel is assigned to the mask with the highest value at that pixel
+                    m_id = m_id.argmax(-1).view(h, w)
+                if dedup:
+                    # Merge the masks corresponding to the same stuff class
+                    for equiv in stuff_equiv_classes.values():
+                        if len(equiv) > 1:
+                            for eq_id in equiv:
+                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
+                final_h, final_w = to_tuple(target_size)
+                # Round-trip through an RGB image so the id map is resized with nearest-neighbour sampling
+                seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy()))
+                seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)
+                np_seg_img = (
+                    torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy()
+                )
+                m_id = torch.from_numpy(rgb2id(np_seg_img))
+                area = [m_id.eq(i).sum().item() for i in range(len(scores))]
+                return area, seg_img
+            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
+            if cur_classes.numel() > 0:
+                # We now filter empty masks as long as we find some
+                while True:
+                    filtered_small = torch.as_tensor(
+                        [a <= 4 for a in area], dtype=torch.bool, device=keep.device
+                    )
+                    if filtered_small.any().item():
+                        cur_scores = cur_scores[~filtered_small]
+                        cur_classes = cur_classes[~filtered_small]
+                        cur_masks = cur_masks[~filtered_small]
+                        # Removing masks changes the per-pixel argmax, so ids and areas are recomputed
+                        area, seg_img = get_ids_area(cur_masks, cur_scores)
+                    else:
+                        break
+            else:
+                cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device)
+            segments_info = []
+            for i, a in enumerate(area):
+                cat = cur_classes[i].item()
+                segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a})
+            del cur_classes
+            with io.BytesIO() as out:
+                seg_img.save(out, format="PNG")
+                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+            preds.append(predictions)
+        return preds