returnzeros committed on Jun 6, 2025

Commit

4d8e7a6

verified ·

1 Parent(s): 6927973

Upload 108 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
ACT_DP_multitask/README.md +16 -0
ACT_DP_multitask/base.yaml +71 -0
ACT_DP_multitask/detr/LICENSE +201 -0
ACT_DP_multitask/detr/README.md +9 -0
ACT_DP_multitask/detr/__pycache__/main.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/__pycache__/main.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/detr.egg-info/PKG-INFO +17 -0
ACT_DP_multitask/detr/detr.egg-info/SOURCES.txt +37 -0
ACT_DP_multitask/detr/detr.egg-info/dependency_links.txt +1 -0
ACT_DP_multitask/detr/detr.egg-info/top_level.txt +2 -0
ACT_DP_multitask/detr/main.py +763 -0
ACT_DP_multitask/detr/models/__init__.py +60 -0
ACT_DP_multitask/detr/models/__pycache__/__init__.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/__init__.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/__init__.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/backbone.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/backbone.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/backbone.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/detr_vae.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/detr_vae.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/detr_vae.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/detr_vae_nfp.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/detr_vae_nfp.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/detr_vae_nfp.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/position_encoding.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/position_encoding.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/position_encoding.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/resnet_film.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/transformer.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/transformer.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/transformer.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/vision_transformer.cpython-310.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/vision_transformer.cpython-37.pyc +0 -0
ACT_DP_multitask/detr/models/__pycache__/vision_transformer.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/backbone.py +209 -0
ACT_DP_multitask/detr/models/detr_vae.py +0 -0
ACT_DP_multitask/detr/models/detr_vae_nfp.py +523 -0
ACT_DP_multitask/detr/models/mask_former/__init__.py +19 -0
ACT_DP_multitask/detr/models/mask_former/__pycache__/__init__.cpython-38.pyc +0 -0
ACT_DP_multitask/detr/models/mask_former/config.py +85 -0
ACT_DP_multitask/detr/models/mask_former/mask_former_model.py +304 -0
ACT_DP_multitask/detr/models/mask_former/modeling/__init__.py +5 -0
ACT_DP_multitask/detr/models/mask_former/modeling/backbone/__init__.py +1 -0
ACT_DP_multitask/detr/models/mask_former/modeling/backbone/swin.py +768 -0
ACT_DP_multitask/detr/models/mask_former/modeling/criterion.py +187 -0
ACT_DP_multitask/detr/models/mask_former/modeling/heads/__init__.py +1 -0
ACT_DP_multitask/detr/models/mask_former/modeling/heads/mask_former_head.py +119 -0
ACT_DP_multitask/detr/models/mask_former/modeling/heads/per_pixel_baseline.py +243 -0
ACT_DP_multitask/detr/models/mask_former/modeling/heads/pixel_decoder.py +294 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ACT_DP_multitask/detr/models/mr_mg/media/model.gif filter=lfs diff=lfs merge=lfs -text

ACT_DP_multitask/README.md ADDED Viewed

	@@ -0,0 +1,16 @@

+### Install
+```
+cd policy/ACT-DP-TP
+cd detr
+pip install -e .
+cd ..
+cd Cosmos-Tokenizer
+pip install -e .
+#upload policy/ACT-DP-TP/Cosmos-Tokenizer/pretrained_ckpts
+```
+### Command
+```
+#data_dir: policy/ACT-DP-TP/data_zarr
+cd policy/ACT-DP-TP
+bash scripts/act_dp_tp/train.sh bottle_adjust 300 20 20 0
+```

ACT_DP_multitask/base.yaml ADDED Viewed

	@@ -0,0 +1,71 @@

+common:
+  # The number of historical images
+  img_history_size: 2
+  # The number of future actions to predict
+  action_chunk_size: 64
+  # The number of cameras to be used in the model
+  num_cameras: 3
+  # Dimension for state/action, we use the same space for both state and action
+  # This MUST match the state dimension defined in configs/state_vec.py
+  state_dim: 128
+dataset:
+  # We will extract the data from raw dataset
+  # and store them in the disk buffer by producer
+  # When training, we will read the data
+  # randomly from the buffer by consumer
+  # The producer will replace the data which has been
+  # read by the consumer with new data
+  # The path to the buffer (at least 400GB)
+  buf_path: /path/to/buffer
+  # The number of chunks in the buffer
+  buf_num_chunks: 512
+  # The number of samples (step rather than episode) in each chunk
+  buf_chunk_size: 512
+  # We will filter out episodes whose length is less than `epsd_len_thresh_low`
+  epsd_len_thresh_low: 32
+  # For those more than `epsd_len_thresh_high`,
+  # we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
+  # to better balance the training datasets
+  epsd_len_thresh_high: 2048
+  # How to fit the image size
+  image_aspect_ratio: pad
+  # Maximum number of language tokens
+  tokenizer_max_length: 1024
+model:
+  # Config for condition adaptors
+  lang_adaptor: mlp2x_gelu
+  img_adaptor: mlp2x_gelu
+  state_adaptor: mlp3x_gelu
+  lang_token_dim: 4096
+  img_token_dim: 1152
+  # Dim of action or proprioception vector
+  # A `state` refers to an action or a proprioception vector
+  state_token_dim: 128
+  # Config for RDT structure
+  rdt:
+    # 1B: num_head 32 hidden_size 2048
+    hidden_size: 2048
+    depth: 28
+    num_heads: 32
+    cond_pos_embed_type: multimodal
+  # For noise scheduler
+  noise_scheduler:
+    type: ddpm
+    num_train_timesteps: 1000
+    num_inference_timesteps: 5
+    beta_schedule: squaredcos_cap_v2  # Critical choice
+    prediction_type: sample
+    clip_sample: False
+  # For EMA (params averaging)
+  # We do not use EMA currently
+  ema:
+    update_after_step: 0
+    inv_gamma: 1.0
+    power: 0.75
+    min_value: 0.0
+    max_value: 0.9999

ACT_DP_multitask/detr/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2020 - present, Facebook, Inc
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

ACT_DP_multitask/detr/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+This part of the codebase is modified from DETR https://github.com/facebookresearch/detr under APACHE 2.0.
+    @article{Carion2020EndtoEndOD,
+      title={End-to-End Object Detection with Transformers},
+      author={Nicolas Carion and Francisco Massa and Gabriel Synnaeve and Nicolas Usunier and Alexander Kirillov and Sergey Zagoruyko},
+      journal={ArXiv},
+      year={2020},
+      volume={abs/2005.12872}
+    }

ACT_DP_multitask/detr/__pycache__/main.cpython-310.pyc ADDED Viewed

Binary file (12.9 kB). View file

ACT_DP_multitask/detr/__pycache__/main.cpython-37.pyc ADDED Viewed

Binary file (15.9 kB). View file

ACT_DP_multitask/detr/detr.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,17 @@

+Metadata-Version: 2.2
+Name: detr
+Version: 0.0.0
+License: MIT License
+License-File: LICENSE
+Dynamic: description
+Dynamic: license
+This part of the codebase is modified from DETR https://github.com/facebookresearch/detr under APACHE 2.0.
+    @article{Carion2020EndtoEndOD,
+      title={End-to-End Object Detection with Transformers},
+      author={Nicolas Carion and Francisco Massa and Gabriel Synnaeve and Nicolas Usunier and Alexander Kirillov and Sergey Zagoruyko},
+      journal={ArXiv},
+      year={2020},
+      volume={abs/2005.12872}
+    }

ACT_DP_multitask/detr/detr.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,37 @@

+LICENSE
+README.md
+setup.py
+detr.egg-info/PKG-INFO
+detr.egg-info/SOURCES.txt
+detr.egg-info/dependency_links.txt
+detr.egg-info/top_level.txt
+models/__init__.py
+models/backbone.py
+models/detr_vae.py
+models/detr_vae_nfp.py
+models/position_encoding.py
+models/transformer.py
+models/vision_transformer.py
+models/mask_former/__init__.py
+models/mask_former/config.py
+models/mask_former/mask_former_model.py
+models/mask_former/test_time_augmentation.py
+models/mask_former/modeling/__init__.py
+models/mask_former/modeling/criterion.py
+models/mask_former/modeling/matcher.py
+models/mask_former/modeling/backbone/__init__.py
+models/mask_former/modeling/backbone/swin.py
+models/mask_former/modeling/heads/__init__.py
+models/mask_former/modeling/heads/mask_former_head.py
+models/mask_former/modeling/heads/per_pixel_baseline.py
+models/mask_former/modeling/heads/pixel_decoder.py
+models/mask_former/modeling/transformer/__init__.py
+models/mask_former/modeling/transformer/position_encoding.py
+models/mask_former/modeling/transformer/transformer.py
+models/mask_former/modeling/transformer/transformer_predictor.py
+models/mask_former/utils/__init__.py
+models/mask_former/utils/misc.py
+util/__init__.py
+util/box_ops.py
+util/misc.py
+util/plot_utils.py

ACT_DP_multitask/detr/detr.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

ACT_DP_multitask/detr/detr.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ models
2	+ util

ACT_DP_multitask/detr/main.py ADDED Viewed

	@@ -0,0 +1,763 @@

+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+from pathlib import Path
+import os
+import numpy as np
+import torch
+from .models import *
+import IPython
+e = IPython.embed
+def get_args_parser():
+    parser = argparse.ArgumentParser("Set transformer detector", add_help=False)
+    parser.add_argument("--ckpt_path", type=str, default='policy/ACT_DP_multitask/checkpoints/real_fintune_50_2000/act_dp')
+    parser.add_argument("--eval_ckpts", default=0, type=int, help="eval_ckpts")
+    parser.add_argument("--eval_video_log", action="store_true")
+    parser.add_argument("--action_interval", default=1, type=int)
+    parser.add_argument("--lr", default=1e-4, type=float)  # will be overridden
+    parser.add_argument("--lr_backbone", default=1e-5, type=float)  # will be overridden
+    parser.add_argument(
+        "--lr_schedule_type", default="constant", type=str, help="lr_schedule_type"
+    )
+    parser.add_argument(
+        "--num_episodes", type=int, help="num_epochs", default=0, required=False
+    )
+    parser.add_argument("--batch_size", default=2, type=int)  # not used
+    parser.add_argument(
+        "--samples_per_epoch",
+        default=1,
+        type=int,
+        help="samples_per_epoch",
+        required=False,
+    )
+    parser.add_argument("--weight_decay", default=1e-4, type=float)
+    parser.add_argument("--epochs", default=300, type=int)  # not used
+    parser.add_argument("--lr_drop", default=200, type=int)  # not used
+    parser.add_argument(
+        "--clip_max_norm",
+        default=0.1,
+        type=float,  # not used
+        help="gradient clipping max norm",
+    )
+    parser.add_argument("--norm_type", default="meanstd", type=str, help="norm_type")
+    parser.add_argument(
+        "--num_train_steps", default=50, type=int, help="num_train_steps"
+    )
+    parser.add_argument(
+        "--num_inference_steps", default=10, type=int, help="num_inference_steps"
+    )
+    parser.add_argument(
+        "--schedule_type", default="DDIM", type=str, help="scheduler_type"
+    )
+    parser.add_argument(
+        "--imitate_weight", default=1, type=int, help="imitate Weight", required=False
+    )
+    parser.add_argument(
+        "--prediction_type", default="sample", type=str, help="prediction_type"
+    )
+    parser.add_argument(
+        "--beta_schedule", default="squaredcos_cap_v2", type=str, help="prediction_type"
+    )
+    parser.add_argument(
+        "--diffusion_timestep_type",
+        default="cat",
+        type=str,
+        help="diffusion_timestep_type, cat or add, how to combine timestep",
+    )
+    parser.add_argument(
+        "--condition_type",
+        default="cross_attention",
+        type=str,
+        help="diffusion_condition_type, cross_attention or adaLN, how to combine observation condition",
+    )
+    parser.add_argument("--attention_type", default="v0", help="decoder attention type")
+    parser.add_argument(
+        "--causal_mask", action="store_true", help="use causal mask for diffusion"
+    )
+    parser.add_argument("--loss_type", default="l2", type=str, help="loss_type")
+    parser.add_argument(
+        "--disable_vae_latent",
+        action="store_true",
+        help="Use VAE latent space by default",
+    )
+    parser.add_argument(
+        "--disable_resnet",
+        action="store_true",
+        help="Use resnet to encode obs image  by default",
+    )
+    parser.add_argument(
+        "--disable_scale",
+        action="store_true",
+        help="scale model up",
+    )
+    parser.add_argument(
+        "--inference_num_queries",
+        default=0,
+        type=int,
+        help="inference_num_queries",
+        required=False,
+    )  # predict_frame
+    parser.add_argument(
+        "--disable_resize", action="store_true", help="if resize jpeg image"
+    )
+    parser.add_argument(
+        "--share_decoder", action="store_true", help="jpeg and action share decoder"
+    )
+    parser.add_argument(
+        "--resize_rate",
+        default=1,
+        type=int,
+        help="resize rate for pixel prediction",
+        required=False,
+    )
+    parser.add_argument(
+        "--image_downsample_rate",
+        default=1,
+        type=int,
+        help="image_downsample_rate",
+        required=False,
+    )
+    parser.add_argument(
+        "--temporal_downsample_rate",
+        default=1,
+        type=int,
+        help="temporal_downsample_rate",
+        required=False,
+    )
+    # Model parameters external
+    parser.add_argument("--test_num", default=50, type=int, help="test_num")
+    parser.add_argument("--save_episode", action="store_true")
+    parser.add_argument(
+        "--depth_mode",
+        default="None",
+        type=str,
+        help="use depth/depth+coordinate/None. ALL/Single/None",
+    )
+    parser.add_argument(
+        "--pc_mode", default="pc_camera", type=str, help="pc_world/pc_camera"
+    )
+    parser.add_argument(
+        "--disable_multi_view", action="store_true", help="Use multi-view rgb images"
+    )
+    # * Backbone
+    parser.add_argument(
+        "--backbone",
+        default="resnet18",
+        type=str,  # will be overridden
+        help="Name of the convolutional backbone to use",
+    )
+    parser.add_argument(
+        "--dilation",
+        action="store_true",
+        help="If true, we replace stride with dilation in the last convolutional block (DC5)",
+    )
+    parser.add_argument(
+        "--position_embedding",
+        default="sine",
+        type=str,
+        choices=("sine", "learned"),
+        help="Type of positional embedding to use on top of the image features",
+    )
+    parser.add_argument(
+        "--camera_names",
+        default=[],
+        type=list,  # will be overridden
+        help="A list of camera names",
+    )
+    # * Transformer
+    parser.add_argument(
+        "--enc_layers",
+        default=4,
+        type=int,  # will be overridden
+        help="Number of encoding layers in the transformer",
+    )
+    parser.add_argument(
+        "--dec_layers",
+        default=6,
+        type=int,  # will be overridden
+        help="Number of decoding layers in the transformer",
+    )
+    parser.add_argument(
+        "--dim_feedforward",
+        default=2048,
+        type=int,  # will be overridden
+        help="Intermediate size of the feedforward layers in the transformer blocks",
+    )
+    parser.add_argument(
+        "--hidden_dim",
+        default=256,
+        type=int,  # will be overridden
+        help="Size of the embeddings (dimension of the transformer)",
+    )
+    parser.add_argument(
+        "--dropout", default=0.1, type=float, help="Dropout applied in the transformer"
+    )
+    parser.add_argument(
+        "--nheads",
+        default=8,
+        type=int,  # will be overridden
+        help="Number of attention heads inside the transformer's attentions",
+    )
+    parser.add_argument(
+        "--num_queries",
+        default=400,
+        type=int,  # will be overridden
+        help="Number of query slots",
+    )
+    parser.add_argument("--pre_norm", action="store_true")
+    # # * Segmentation
+    parser.add_argument(
+        "--masks",
+        action="store_true",
+        help="Train segmentation head if the flag is provided",
+    )
+    # repeat args in imitate_episodes just to avoid error. Will not be used
+    parser.add_argument("--eval", action="store_true")
+    parser.add_argument("--onscreen_render", action="store_true")
+    parser.add_argument(
+        "--ckpt_dir", action="store", type=str, help="ckpt_dir", required=False
+    )
+    parser.add_argument(
+        "--policy_class",
+        action="store",
+        type=str,
+        help="policy_class, capitalize",
+        required=False,
+    )
+    parser.add_argument(
+        "--task_name", action="store", type=str, help="task_name", required=False
+    )
+    parser.add_argument("--seed", action="store", type=int, help="seed", required=False)
+    parser.add_argument(
+        "--num_epochs", action="store", type=int, help="num_epochs", required=False
+    )
+    parser.add_argument(
+        "--kl_weight", action="store", type=int, help="KL Weight", required=False
+    )
+    parser.add_argument(
+        "--save_epoch",
+        action="store",
+        type=int,
+        help="save_epoch",
+        default=500,
+        required=False,
+    )
+    parser.add_argument(
+        "--chunk_size", action="store", type=int, help="chunk_size", required=False
+    )
+    parser.add_argument(
+        "--history_step", default=0, type=int, help="history_step", required=False
+    )
+    parser.add_argument(
+        "--predict_frame", default=0, type=int, help="predict_frame", required=False
+    )
+    # add image_width and image_height
+    parser.add_argument(
+        "--image_width", default=320, type=int, help="image_width", required=False
+    )
+    parser.add_argument(
+        "--image_height", default=240, type=int, help="image_height", required=False
+    )
+    parser.add_argument(
+        "--predict_only_last", action="store_true"
+    )  # only predict the last #predict_frame frame
+    parser.add_argument("--temporal_agg", action="store_true")
+    # visual tokenizer
+    parser.add_argument(
+        "--tokenizer_model_type",
+        default="DV",
+        type=str,
+        help="tokenizer_model_type, DV,CV,DI,CI",
+    )
+    parser.add_argument(
+        "--tokenizer_model_temporal_rate",
+        default=8,
+        type=int,
+        help="tokenizer_model_temporal_rate, 4,8",
+    )
+    parser.add_argument(
+        "--tokenizer_model_spatial_rate",
+        default=16,
+        type=int,
+        help="tokenizer_model_spatial_rate, 8,16",
+    )
+    parser.add_argument(
+        "--tokenizer_model_name",
+        default="Cosmos-Tokenizer-DV4x8x8",
+        type=str,
+        help="tokenizer_model_name",
+    )
+    parser.add_argument(
+        "--prediction_weight",
+        default=1,
+        type=float,
+        help="pred token Weight",
+        required=False,
+    )
+    parser.add_argument(
+        "--token_dim", default=6, type=int, help="token_dim", required=False
+    )  # token_pe_type
+    parser.add_argument(
+        "--patch_size", default=5, type=int, help="patch_size", required=False
+    )  # token_pe_type
+    parser.add_argument(
+        "--token_pe_type",
+        default="learned",
+        type=str,
+        help="token_pe_type",
+        required=False,
+    )
+    parser.add_argument("--nf", action="store_true")
+    parser.add_argument("--pretrain", action="store_true", required=False)
+    parser.add_argument("--is_wandb", action="store_true")
+    parser.add_argument("--mae", action="store_true")
+    # parser.add_argument('--seg', action='store_true')
+    # parser.add_argument('--seg_next', action='store_true')
+    # parameters for distributed training
+    parser.add_argument(
+        "--resume",
+        default="",
+        type=str,
+        metavar="PATH",
+        help="path to latest checkpoint (default: none)",
+    )
+    parser.add_argument(
+        "--world-size",
+        default=-1,
+        type=int,
+        help="number of nodes for distributed training",
+    )
+    parser.add_argument(
+        "--rank", default=-1, type=int, help="node rank for distributed training"
+    )
+    parser.add_argument(
+        "--dist-url",
+        default="tcp://224.66.41.62:23456",
+        type=str,
+        help="url used to set up distributed training",
+    )
+    parser.add_argument(
+        "--dist-backend", default="nccl", type=str, help="distributed backend"
+    )
+    # parser.add_argument(
+    #     "--seed", default=None, type=int, help="seed for initializing training. "
+    # )
+    parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.")
+    parser.add_argument(
+        "--multiprocessing-distributed",
+        action="store_true",
+        help="Use multi-processing distributed training to launch "
+        "N processes per node, which has N GPUs. This is the "
+        "fastest way to use PyTorch for either single node or "
+        "multi node data parallel training",
+    )
+    parser.add_argument(
+        "-j",
+        "--workers",
+        default=32,
+        type=int,
+        metavar="N",
+        help="number of data loading workers (default: 32)",
+    )
+    return parser
+def build_ACT_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    if args_override["segmentation"]:
+        model = build_ACT_Seg_model(args)
+    else:
+        model = build_ACT_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_ACTDiffusion_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    # print('args',args) # get
+    model = build_ACTDiffusion_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_ACTDiffusion_tactile_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    # print('args',args) # get
+    model = build_ACTDiffusion_tactile_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_diffusion_tp_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    # print('args',args) # get
+    model = build_ACTDiffusion_tp_model(args)
+    model.cuda()
+    return model  # , optimizer
+def build_diffusion_pp_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    # print('args',args) # get
+    model = build_ACTDiffusion_pp_model(args)
+    model.cuda()
+    return model
+# discard
+def build_ACT_NF_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_ACT_NF_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_ACT_Dino_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_ACT_dino_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_ACT_jpeg_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_ACT_jpeg_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_ACT_jpeg_diffusion_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_ACT_jpeg_diffusion_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_ACT_jpeg_diffusion_seperate_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_ACT_jpeg_diffusion_seperate_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_nf_diffusion_seperate_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_nf_diffusion_seperate_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer
+def build_CNNMLP_model_and_optimizer(args_override):
+    parser = argparse.ArgumentParser(
+        "DETR training and evaluation script", parents=[get_args_parser()]
+    )
+    args = parser.parse_args()
+    for k, v in args_override.items():
+        setattr(args, k, v)
+    model = build_CNNMLP_model(args)
+    model.cuda()
+    param_dicts = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" not in n and p.requires_grad
+            ]
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "backbone" in n and p.requires_grad
+            ],
+            "lr": args.lr_backbone,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        param_dicts, lr=args.lr, weight_decay=args.weight_decay
+    )
+    return model, optimizer

ACT_DP_multitask/detr/models/__init__.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .detr_vae import build as build_vae
+from .detr_vae import build_seg as build_vae_seg
+from .detr_vae_nfp import build as build_vae_nfp
+from .detr_vae import build_cnnmlp as build_cnnmlp
+from .detr_vae import build_dino as build_dino
+from .detr_vae import build_jpeg as build_jpeg
+from .detr_vae import build_jpeg_diffusion as build_jpeg_diffusion
+from .detr_vae import build_jpeg_diffusion_seperate as build_jpeg_diffusion_seperate
+from .detr_vae import build_nf_diffusion_seperate as build_nf_diffusion_seperate
+from .detr_vae import build_diffusion as build_diffusion
+from .detr_vae import build_diffusion_tp as build_diffusion_tp
+from .detr_vae import build_diffusion_tp_with_dual_visual_token as build_diffusion_tp_with_dual_visual_token
+from .detr_vae import build_diffusion_pp as build_diffusion_pp
+from .detr_vae import build_diffusion_tactile as build_diffusion_tactile
+def build_ACT_model(args):
+    return build_vae(args)
+def build_CNNMLP_model(args):
+    return build_cnnmlp(args)
+def build_ACTDiffusion_model(args):
+    return build_diffusion(args)
+def build_ACTDiffusion_tactile_model(args):
+    return build_diffusion_tactile(args)
+def build_ACTDiffusion_tp_model(args):
+    if args.diffusion_timestep_type  == 'vis_cat': # HARDCODE whether use tokenizer feature for decoder & action prediction
+        print('Using dual visual token for decoder and action prediction')
+        return build_diffusion_tp_with_dual_visual_token(args)
+    else:
+        return build_diffusion_tp(args)
+def build_ACTDiffusion_pp_model(args):
+    return build_diffusion_pp(args)
+# discard
+def build_ACT_NF_model(args):
+    return build_vae_nfp(args)
+def build_ACT_Seg_model(args):
+    return build_vae_seg(args)
+def build_ACT_dino_model(args):
+    return build_dino(args)
+def build_ACT_jpeg_model(args):
+    return build_jpeg(args)
+def build_ACT_jpeg_diffusion_model(args):
+    return build_jpeg_diffusion(args)
+def build_ACT_jpeg_diffusion_seperate_model(args):
+    return build_jpeg_diffusion_seperate(args)
+def build_nf_diffusion_seperate_model(args):
+    return build_nf_diffusion_seperate(args)

ACT_DP_multitask/detr/models/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (2.47 kB). View file

ACT_DP_multitask/detr/models/__pycache__/__init__.cpython-37.pyc ADDED Viewed

Binary file (2.55 kB). View file

ACT_DP_multitask/detr/models/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (2.2 kB). View file

ACT_DP_multitask/detr/models/__pycache__/backbone.cpython-310.pyc ADDED Viewed

Binary file (6.66 kB). View file

ACT_DP_multitask/detr/models/__pycache__/backbone.cpython-37.pyc ADDED Viewed

Binary file (4.32 kB). View file

ACT_DP_multitask/detr/models/__pycache__/backbone.cpython-38.pyc ADDED Viewed

Binary file (4.35 kB). View file

ACT_DP_multitask/detr/models/__pycache__/detr_vae.cpython-310.pyc ADDED Viewed

Binary file (50.1 kB). View file

ACT_DP_multitask/detr/models/__pycache__/detr_vae.cpython-37.pyc ADDED Viewed

Binary file (57.9 kB). View file

ACT_DP_multitask/detr/models/__pycache__/detr_vae.cpython-38.pyc ADDED Viewed

Binary file (40.8 kB). View file

ACT_DP_multitask/detr/models/__pycache__/detr_vae_nfp.cpython-310.pyc ADDED Viewed

Binary file (13.3 kB). View file

ACT_DP_multitask/detr/models/__pycache__/detr_vae_nfp.cpython-37.pyc ADDED Viewed

Binary file (15.1 kB). View file

ACT_DP_multitask/detr/models/__pycache__/detr_vae_nfp.cpython-38.pyc ADDED Viewed

Binary file (13.4 kB). View file

ACT_DP_multitask/detr/models/__pycache__/position_encoding.cpython-310.pyc ADDED Viewed

Binary file (3.61 kB). View file

ACT_DP_multitask/detr/models/__pycache__/position_encoding.cpython-37.pyc ADDED Viewed

Binary file (3.55 kB). View file

ACT_DP_multitask/detr/models/__pycache__/position_encoding.cpython-38.pyc ADDED Viewed

Binary file (3.56 kB). View file

ACT_DP_multitask/detr/models/__pycache__/resnet_film.cpython-310.pyc ADDED Viewed

Binary file (13.5 kB). View file

ACT_DP_multitask/detr/models/__pycache__/transformer.cpython-310.pyc ADDED Viewed

Binary file (39 kB). View file

ACT_DP_multitask/detr/models/__pycache__/transformer.cpython-37.pyc ADDED Viewed

Binary file (40.5 kB). View file

ACT_DP_multitask/detr/models/__pycache__/transformer.cpython-38.pyc ADDED Viewed

Binary file (24.5 kB). View file

ACT_DP_multitask/detr/models/__pycache__/vision_transformer.cpython-310.pyc ADDED Viewed

Binary file (13.1 kB). View file

ACT_DP_multitask/detr/models/__pycache__/vision_transformer.cpython-37.pyc ADDED Viewed

Binary file (13.5 kB). View file

ACT_DP_multitask/detr/models/__pycache__/vision_transformer.cpython-38.pyc ADDED Viewed

Binary file (13.2 kB). View file

ACT_DP_multitask/detr/models/backbone.py ADDED Viewed

	@@ -0,0 +1,209 @@

+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Backbone modules.
+"""
+from collections import OrderedDict
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+from typing import Dict, List
+from typing import Any, Dict, List, Mapping, Optional
+from ..util.misc import NestedTensor, is_main_process
+from .position_encoding import build_position_encoding
+from .resnet_film import resnet18 as resnet18_film
+from .resnet_film import resnet34 as resnet34_film
+import IPython
+e = IPython.embed
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt,
+    without which any other policy_models than torchvision.policy_models.resnet[18,34,50,101]
+    produce nans.
+    """
+    def __init__(self, n):
+        super(FrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        num_batches_tracked_key = prefix + 'num_batches_tracked'
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+class BackboneBase(nn.Module):
+    def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
+        super().__init__()
+        # for name, parameter in backbone.named_parameters(): # only train later layers # TODO do we want this?
+        #     if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+        #         parameter.requires_grad_(False)
+        if return_interm_layers:
+            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
+        else:
+            return_layers = {'layer4': "0"}
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+    def forward(self, tensor):
+        xs = self.body(tensor)
+        return xs
+        # out: Dict[str, NestedTensor] = {}
+        # for name, x in xs.items():
+        #     m = tensor_list.mask
+        #     assert m is not None
+        #     mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
+        #     out[name] = NestedTensor(x, mask)
+        # return out
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm."""
+    def __init__(self, name: str,
+                 train_backbone: bool,
+                 return_interm_layers: bool,
+                 dilation: bool):
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) # pretrained # TODO do we want frozen batch_norm??
+        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
+        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
+# ====  ResNet Backbone ====
+class ResNetFilmBackbone(nn.Module):
+    def __init__(self, embedding_name: str, pretrained: bool = False,
+                 film_config: Optional[Mapping[str, Any]] = None):
+        super().__init__()
+        self._pretrained = pretrained
+        weights = 'IMAGENET1K_V1' if pretrained else None
+        if embedding_name in ('resnet34_film', 'resnet34'):
+            backbone = resnet34_film(weights=weights, film_config=film_config, pretrained=pretrained)
+            embedding_dim = 512
+        elif embedding_name in ('resnet18_film', 'resnet18'):
+            backbone = resnet18_film(weights=weights, film_config=film_config, pretrained=pretrained)
+            embedding_dim = 512
+        else:
+            raise NotImplementedError
+        self.resnet_film_model = backbone
+        self._embedding_dim = embedding_dim
+        self.resnet_film_model.fc = nn.Identity()
+        self.resnet_film_model.avgpool = nn.Identity()
+        self.num_channels = self._embedding_dim
+        # FiLM config
+        self.film_config = film_config
+        if film_config is not None and film_config['use']:
+            film_models = []
+            for layer_idx, num_blocks in enumerate(self.resnet_film_model.layers):
+                if layer_idx in film_config['use_in_layers']:
+                    num_planes = self.resnet_film_model.film_planes[layer_idx]
+                    film_model_layer = nn.Linear(
+                        film_config['task_embedding_dim'], num_blocks * 2 * num_planes)
+                else:
+                    film_model_layer = None
+                film_models.append(film_model_layer)
+            self.film_models = nn.ModuleList(film_models)
+    def forward(self, x, texts: Optional[List[str]] = None, task_emb: Optional[torch.Tensor] = None, **kwargs):
+        film_outputs = None
+        if self.film_config is not None and self.film_config['use']:
+            film_outputs = []
+            for layer_idx, num_blocks in enumerate(self.resnet_film_model.layers):
+                if self.film_config['use'] and self.film_models[layer_idx] is not None:
+                    film_features = self.film_models[layer_idx](task_emb)
+                else:
+                    film_features = None
+                film_outputs.append(film_features)
+        return self.resnet_film_model(x, film_features=film_outputs, flatten=False)
+    @property
+    def embed_dim(self):
+        return self._embedding_dim
+# class Joiner(nn.Sequential):
+#     def __init__(self, backbone, position_embedding):
+#         super().__init__(backbone, position_embedding)
+#     def forward(self, tensor_list: NestedTensor, task_emb:NestedTensor):
+#         xs = self[0](tensor_list)
+#         out: List[NestedTensor] = []
+#         pos = []
+#         for name, x in xs.items():
+#             out.append(x)
+#             # position encoding
+#             pos.append(self[1](x).to(x.dtype))
+#         return out, pos
+class Joiner(nn.Sequential):
+    def __init__(self, backbone, position_embedding):
+        super().__init__(backbone, position_embedding)
+    def forward(self, tensor_list: NestedTensor, task_emb: Optional[Any] = None):
+        if task_emb is not None:
+            xs = self[0](tensor_list, task_emb=task_emb)
+            # Make a dictionary out of the last layer outputs since we don't have IntermediateLayerGetter
+            xs = {'0': xs}
+        else:
+            xs = self[0](tensor_list)
+        out: List[NestedTensor] = []
+        pos = []
+        for name, x in xs.items():
+            out.append(x)
+            # position encoding
+            pos.append(self[1](x).to(x.dtype))
+        return out, pos
+def build_backbone(args):
+    position_embedding = build_position_encoding(args)
+    train_backbone = args.lr_backbone > 0
+    return_interm_layers = args.masks
+    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
+    model = Joiner(backbone, position_embedding)
+    model.num_channels = backbone.num_channels
+    return model
+def build_film_backbone(args):
+    position_embedding = build_position_encoding(args)
+    film_config = {
+        'use': True,
+        'use_in_layers': [1, 2, 3],
+        'task_embedding_dim': 512,
+        'film_planes': [64, 128, 256, 512],
+    }
+    backbone = ResNetFilmBackbone(args.backbone, film_config=film_config)
+    model = Joiner(backbone, position_embedding)
+    model.num_channels = backbone.num_channels
+    return model

ACT_DP_multitask/detr/models/detr_vae.py ADDED Viewed

The diff for this file is too large to render. See raw diff

ACT_DP_multitask/detr/models/detr_vae_nfp.py ADDED Viewed

	@@ -0,0 +1,523 @@

+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+DETR model and criterion classes.
+"""
+import torch
+from torch import nn
+from torch.autograd import Variable
+from .backbone import build_backbone
+from .transformer import build_transformer, TransformerEncoder, TransformerEncoderLayer
+from .vision_transformer import Block, get_2d_sincos_pos_embed, get_2d_sincos_pos_embed_v2
+from .mr_mg.policy.model.vision_transformer import vit_base
+import numpy as np
+import IPython
+e = IPython.embed
+def reparametrize(mu, logvar):
+    std = logvar.div(2).exp()
+    eps = Variable(std.data.new(std.size()).normal_())
+    return mu + std * eps
+def get_sinusoid_encoding_table(n_position, d_hid):
+    def get_position_angle_vec(position):
+        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    return torch.FloatTensor(sinusoid_table).unsqueeze(0)
+class DETRVAE(nn.Module):
+    """ This is the DETR module that performs object detection """
+    def __init__(self, backbones, transformer, encoder, state_dim, num_queries, camera_names):
+        """ Initializes the model.
+        Parameters:
+            backbones: list of backbone modules (see backbone.py), or None. When given,
+                       backbones[0].num_channels sizes the 1x1 input projection; when None,
+                       state-only projections (robot state + env state) are created instead.
+            transformer: torch module of the transformer architecture. See transformer.py.
+                         Its d_model defines hidden_dim for every layer built here.
+            encoder: module stored as self.encoder; forward() calls it on the
+                     [CLS] + qpos + action-sequence embeddings to obtain the latent.
+            state_dim: output width of action_head.
+            num_queries: number of learned query embeddings; also sizes the
+                         sinusoidal pos_table (1 + 1 + num_queries positions).
+            camera_names: stored as self.camera_names; forward() iterates over it.
+        NOTE(review): several projections below hard-code an input width of 14
+        (and 7 for env state) instead of deriving it from state_dim - confirm
+        this matches the robot's qpos/action dimensionality.
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.camera_names = camera_names
+        self.transformer = transformer
+        self.encoder = encoder
+        hidden_dim = transformer.d_model
+        # output heads applied to hidden_dim-wide features
+        self.action_head = nn.Linear(hidden_dim, state_dim)
+        self.is_pad_head = nn.Linear(hidden_dim, 1)
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+        if backbones is not None:
+            # image path: 1x1 conv maps backbone channels to hidden_dim
+            self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1)
+            self.backbones = nn.ModuleList(backbones)
+            self.input_proj_robot_state = nn.Linear(14, hidden_dim)
+        else:
+            # input_dim = 14 + 7 # robot_state + env_state
+            self.input_proj_robot_state = nn.Linear(14, hidden_dim)
+            self.input_proj_env_state = nn.Linear(7, hidden_dim)
+            self.pos = torch.nn.Embedding(2, hidden_dim)
+            self.backbones = None
+        # encoder extra parameters
+        self.latent_dim = 32 # final size of latent z # TODO tune
+        self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding
+        self.encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding
+        self.encoder_joint_proj = nn.Linear(14, hidden_dim)  # project qpos to embedding
+        self.latent_proj = nn.Linear(hidden_dim, self.latent_dim*2) # project hidden state to latent mu and logvar (forward() splits the output in half)
+        self.register_buffer('pos_table', get_sinusoid_encoding_table(1+1+num_queries, hidden_dim)) # [CLS], qpos, a_seq
+        # decoder extra parameters
+        self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim) # project latent sample to embedding
+        self.additional_pos_embed = nn.Embedding(2, hidden_dim) # learned position embedding for proprio and latent
+        # settings for next frame prediction
+        self.patch_size = 16
+        # self.image_size = 224
+        # self.img_h, self.img_w = 128, 160
+        self.img_h, self.img_w = 224, 224
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
+        # self.n_patch = (self.image_size//self.patch_size)**2
+        self.k = 1 # number of next frames
+        # total patch tokens = patches per frame * number of predicted frames
+        self.n_patch = (self.img_h//self.patch_size)*(self.img_w//self.patch_size)*(self.k)
+        # fixed (requires_grad=False) table; filled with 2D sin-cos values below
+        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, self.n_patch, hidden_dim), requires_grad=False)  # (1, n_patch, h)
+        self.patch_embed = nn.Embedding(self.n_patch, hidden_dim)
+        self.decoder_embed = nn.Linear(hidden_dim, hidden_dim, bias=True)
+        decoder_depth = 2 # hardcode
+        # presumably Block(dim, num_heads=16, mlp_ratio=4, ...) - confirm against vision_transformer.Block
+        self.decoder_blocks = nn.ModuleList([
+            Block(hidden_dim, 16, 4, qkv_bias=True, qk_scale=None, norm_layer=nn.LayerNorm)
+            for i in range(decoder_depth)])
+        self.decoder_norm = nn.LayerNorm(hidden_dim)
+        # one output vector of patch_size**2 * 3 values per patch token
+        self.decoder_pred = nn.Linear(hidden_dim, self.patch_size**2 * 3, bias=True) # decoder to patch
+        # decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], (self.image_size//self.patch_size), cls_token=False)
+        decoder_pos_embed = get_2d_sincos_pos_embed_v2(self.decoder_pos_embed.shape[-1], (self.img_h//self.patch_size, self.img_w//self.patch_size))
+        # tile the single-frame table k times along the token axis, then copy it in place
+        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0).repeat(1,self.k,1))
+        # fwd_params = sum(p.numel() for p in self.decoder_blocks.parameters() if p.requires_grad)
+    def forward(self, qpos, image, env_state, actions=None, is_pad=None):
+        """Predict an action chunk and next-frame image patches from the current observation.
+        Training mode is selected by passing ``actions``; otherwise the latent is all zeros.
+        Parameters:
+            qpos: (batch, qpos_dim) robot state.
+            image: the original note describes this as (batch, num_cam, channel, height, width).
+                NOTE(review): in training the code below treats index 0 along dim 1 as the
+                current observation and the remaining entries as next-frame targets -- confirm
+                the real layout with the data loader.
+            env_state: only read when ``self.backbones`` is None.
+            actions: (batch, seq, action_dim), or None at inference.
+            is_pad: (batch, seq) padding mask for ``actions``; required when ``actions`` is given.
+        Returns:
+            a_hat, is_pad_hat, [mu, logvar], [obs_preds, obs_targets]. ``mu``/``logvar`` are None
+            and ``obs_targets`` is all zeros when ``actions`` is None.
+        """
+        is_training = actions is not None # train or val
+        bs, _ = qpos.shape
+        ### Obtain latent z from action sequence
+        if is_training:
+            # project action sequence to embedding dim, and concat with a CLS token
+            action_embed = self.encoder_action_proj(actions) # (bs, seq, hidden_dim)
+            qpos_embed = self.encoder_joint_proj(qpos)  # (bs, hidden_dim)
+            qpos_embed = torch.unsqueeze(qpos_embed, axis=1)  # (bs, 1, hidden_dim)
+            cls_embed = self.cls_embed.weight # (1, hidden_dim)
+            cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim)
+            encoder_input = torch.cat([cls_embed, qpos_embed, action_embed], axis=1) # (bs, seq+2, hidden_dim): [CLS], qpos, actions
+            encoder_input = encoder_input.permute(1, 0, 2) # (seq+2, bs, hidden_dim)
+            # do not mask the cls token or the qpos token
+            cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding
+            is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1)  # (bs, seq+2)
+            # obtain position embedding
+            pos_embed = self.pos_table.clone().detach()
+            pos_embed = pos_embed.permute(1, 0, 2)  # sequence axis first, to match encoder_input
+            # query model
+            encoder_output = self.encoder(encoder_input, pos=pos_embed, src_key_padding_mask=is_pad)
+            encoder_output = encoder_output[0] # take cls output only
+            latent_info = self.latent_proj(encoder_output)
+            # first latent_dim columns are the mean, the rest the log-variance
+            mu = latent_info[:, :self.latent_dim]
+            logvar = latent_info[:, self.latent_dim:]
+            latent_sample = reparametrize(mu, logvar)
+            latent_input = self.latent_out_proj(latent_sample)
+        else:
+            # inference: no action sequence to encode, so use an all-zero latent
+            mu = logvar = None
+            latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device)
+            latent_input = self.latent_out_proj(latent_sample)
+        if self.backbones is not None:
+            # Image observation features and position embeddings
+            all_cam_features = []
+            all_cam_pos = []
+            if is_training:
+                # everything after index 0 on dim 1 becomes the next-frame target
+                next_frame_images = image[:,1:]
+                image = image[:,:1]
+            # NOTE(review): after the slice above dim 1 has length 1, so image[:, cam_id] with
+            # cam_id >= 1 would be out of range in training -- confirm the intended layout.
+            for cam_id, cam_name in enumerate(self.camera_names):
+                features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED? every camera goes through backbone 0
+                features = features[0] # take the last layer feature
+                pos = pos[0]
+                all_cam_features.append(self.input_proj(features))
+                all_cam_pos.append(pos)
+            # proprioception features
+            proprio_input = self.input_proj_robot_state(qpos)
+            # fold camera dimension into width dimension
+            src = torch.cat(all_cam_features, axis=3)
+            pos = torch.cat(all_cam_pos, axis=3)
+            # action queries first, then the patch queries used for next-frame prediction
+            query_embed = torch.cat([self.query_embed.weight, self.patch_embed.weight], axis=0)
+            hs = self.transformer(src, None, query_embed, pos, latent_input, proprio_input, self.additional_pos_embed.weight)[0]
+            # hs = self.transformer(src, None, self.query_embed.weight, pos, latent_input, proprio_input, self.additional_pos_embed.weight)[0]
+        else:
+            # NOTE(review): this branch only passes the action queries to the transformer and never
+            # defines next_frame_images, so the training branch at the end of this method would
+            # raise NameError here -- presumably this state-only path is unused; confirm.
+            qpos = self.input_proj_robot_state(qpos)
+            env_state = self.input_proj_env_state(env_state)
+            transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2
+            hs = self.transformer(transformer_input, None, self.query_embed.weight, self.pos.weight)[0]
+        # the first num_queries output tokens feed the action and padding heads
+        a_hat = self.action_head(hs[:,:self.num_queries])
+        is_pad_hat = self.is_pad_head(hs[:,:self.num_queries])
+        # next frame prediction
+        mask_token = self.mask_token
+        mask_tokens = mask_token.repeat(bs, self.n_patch, 1)
+        mask_tokens = mask_tokens + self.decoder_pos_embed.repeat(bs, 1, 1)
+        # the remaining output tokens (those after the action queries) condition the pixel decoder
+        obs_pred = self.decoder_embed(hs[:,self.num_queries:])
+        obs_pred_ = torch.cat([obs_pred, mask_tokens], dim=1)
+        for blk in self.decoder_blocks:
+            obs_pred_ = blk(obs_pred_)
+        obs_pred_ = self.decoder_norm(obs_pred_)
+        obs_preds = self.decoder_pred(obs_pred_)
+        # drop the first n_patch tokens; presumably these are the conditioning tokens so that
+        # only the mask-token positions remain -- TODO confirm token counts against __init__
+        obs_preds = obs_preds[:,self.n_patch:]
+        if is_training:
+            # next_frame_images = image[:,1:]
+            # NOTE(review): the source resolution 224x224 is hard-coded in this reshape.
+            next_frame_images = nn.functional.interpolate(next_frame_images.reshape(bs, self.k*3, 224, 224), size=(self.img_h, self.img_w))
+            p = self.patch_size
+            h_p = self.img_h // p
+            w_p = self.img_w // p
+            # split each of the k frames into non-overlapping p x p patches and flatten each patch
+            obs_targets = next_frame_images.reshape(shape=(bs, self.k, 3, h_p, p, w_p, p))
+            obs_targets = obs_targets.permute(0,1,3,5,4,6,2)
+            obs_targets = obs_targets.reshape(shape=(bs, h_p*w_p*self.k, (p**2)*3))
+        else:
+            # no ground truth at inference; return a zero placeholder shaped like the prediction
+            obs_targets = torch.zeros_like(obs_preds)
+        return a_hat, is_pad_hat, [mu, logvar], [obs_preds, obs_targets]
+class DETRVAE_MAE(nn.Module):
+    """Action-chunk VAE policy with an MAE ViT image encoder and a next-frame patch decoder.
+    The action sequence is encoded into a latent, a transformer decodes action queries and
+    patch queries, and a small block decoder turns the patch tokens into pixel patches.
+    """
+    def __init__(self, backbones, transformer, encoder, state_dim, num_queries, camera_names):
+        """ Initializes the model.
+        Parameters:
+            backbones: list of backbone modules, or None. When given, they are registered and
+                       ``input_proj`` is sized from ``backbones[0].num_channels``; forward() as
+                       written uses ``self.model_mae`` for image features instead.
+            transformer: torch module of the transformer architecture; must expose ``d_model``.
+            encoder: module used to encode [CLS], qpos and the action sequence into the latent.
+            state_dim: output size of the action head.
+            num_queries: number of action queries (length of the predicted action chunk).
+            camera_names: iterated in forward() to index the image tensor.
+        NOTE(review): construction loads MAE weights from a hard-coded relative path, so the
+        process must be started from a directory containing ``checkpoints/pretrained``.
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.camera_names = camera_names
+        self.transformer = transformer
+        self.encoder = encoder
+        hidden_dim = transformer.d_model
+        self.action_head = nn.Linear(hidden_dim, state_dim)
+        self.is_pad_head = nn.Linear(hidden_dim, 1)
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+        # self.model_mae = vits.__dict__['vit_base'](patch_size=16, num_classes=0)
+        self.model_mae = vit_base(patch_size=16, num_classes=0)
+        mae_ckpt = 'checkpoints/pretrained/mae_pretrain_vit_base.pth'
+        # NOTE(review): torch.load unpickles the file; only point this at a trusted checkpoint.
+        checkpoint = torch.load(mae_ckpt, map_location='cpu')
+        self.model_mae.load_state_dict(checkpoint['model'], strict=True)
+        print('Load MAE pretrained model')
+        # The MAE encoder stays trainable; the freeze below is disabled.
+        # for name, p in self.model_mae.named_parameters():
+        #     p.requires_grad = False
+        if backbones is not None:
+            self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1)
+            self.backbones = nn.ModuleList(backbones)
+            self.input_proj_robot_state = nn.Linear(14, hidden_dim)
+        else:
+            # input_dim = 14 + 7 # robot_state + env_state
+            self.input_proj_robot_state = nn.Linear(14, hidden_dim)
+            self.input_proj_env_state = nn.Linear(7, hidden_dim)
+            self.pos = torch.nn.Embedding(2, hidden_dim)
+            self.backbones = None
+        # encoder extra parameters
+        self.latent_dim = 32 # final size of latent z # TODO tune
+        self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding
+        self.encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding
+        self.encoder_joint_proj = nn.Linear(14, hidden_dim)  # project qpos to embedding
+        self.latent_proj = nn.Linear(hidden_dim, self.latent_dim*2) # project hidden state to latent mean and log-variance (split in forward)
+        self.register_buffer('pos_table', get_sinusoid_encoding_table(1+1+num_queries, hidden_dim)) # [CLS], qpos, a_seq
+        # decoder extra parameters
+        self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim) # project latent sample to embedding
+        self.additional_pos_embed = nn.Embedding(2, hidden_dim) # learned position embedding for proprio and latent
+        # settings for next frame prediction
+        self.patch_size = 16
+        self.img_h, self.img_w = 224, 224
+        self.n_patch = (self.img_h//self.patch_size)*(self.img_w//self.patch_size)
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
+        # fixed (non-trainable) positional embedding, filled in at the end of __init__
+        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, self.n_patch, hidden_dim), requires_grad=False)  # (1, n_patch, h)
+        self.patch_embed = nn.Embedding(self.n_patch, hidden_dim)
+        self.decoder_embed = nn.Linear(hidden_dim, hidden_dim, bias=True)
+        decoder_depth = 2 # hardcode
+        # presumably Block(dim, num_heads=16, mlp_ratio=4, ...) -- TODO confirm against Block's signature
+        self.decoder_blocks = nn.ModuleList([
+            Block(hidden_dim, 16, 4, qkv_bias=True, qk_scale=None, norm_layer=nn.LayerNorm)
+            for i in range(decoder_depth)])
+        self.decoder_norm = nn.LayerNorm(hidden_dim)
+        self.decoder_pred = nn.Linear(hidden_dim, self.patch_size**2 * 3, bias=True) # decoder to patch
+        # the helper returns a numpy array, which is copied into the frozen parameter
+        decoder_pos_embed = get_2d_sincos_pos_embed_v2(self.decoder_pos_embed.shape[-1], (self.img_h//self.patch_size, self.img_w//self.patch_size))
+        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))
+    def forward(self, qpos, image, env_state, actions=None, is_pad=None):
+        """Predict an action chunk and next-frame patches; training mode iff ``actions`` is given.
+        Parameters:
+            qpos: (batch, qpos_dim) robot state.
+            image: the original note describes this as (batch, num_cam, channel, height, width).
+                NOTE(review): in training, index 0 along dim 1 is used as the observation and
+                index 1 as the next-frame target -- confirm the layout with the data loader.
+            env_state: only read when ``self.backbones`` is None.
+            actions: (batch, seq, action_dim), or None at inference.
+            is_pad: (batch, seq) padding mask for ``actions``; required when ``actions`` is given.
+        Returns:
+            a_hat, is_pad_hat, [mu, logvar], [obs_preds, obs_targets]. ``mu``/``logvar`` are None
+            and ``obs_targets`` is all zeros when ``actions`` is None.
+        """
+        is_training = actions is not None # train or val
+        bs, _ = qpos.shape
+        ### Obtain latent z from action sequence
+        if is_training:
+            # project action sequence to embedding dim, and concat with a CLS token
+            action_embed = self.encoder_action_proj(actions) # (bs, seq, hidden_dim)
+            qpos_embed = self.encoder_joint_proj(qpos)  # (bs, hidden_dim)
+            qpos_embed = torch.unsqueeze(qpos_embed, axis=1)  # (bs, 1, hidden_dim)
+            cls_embed = self.cls_embed.weight # (1, hidden_dim)
+            cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim)
+            encoder_input = torch.cat([cls_embed, qpos_embed, action_embed], axis=1) # (bs, seq+2, hidden_dim): [CLS], qpos, actions
+            encoder_input = encoder_input.permute(1, 0, 2) # (seq+2, bs, hidden_dim)
+            # do not mask the cls token or the qpos token
+            cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding
+            is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1)  # (bs, seq+2)
+            # obtain position embedding
+            pos_embed = self.pos_table.clone().detach()
+            pos_embed = pos_embed.permute(1, 0, 2)  # sequence axis first, to match encoder_input
+            # query model
+            encoder_output = self.encoder(encoder_input, pos=pos_embed, src_key_padding_mask=is_pad)
+            encoder_output = encoder_output[0] # take cls output only
+            latent_info = self.latent_proj(encoder_output)
+            # first latent_dim columns are the mean, the rest the log-variance
+            mu = latent_info[:, :self.latent_dim]
+            logvar = latent_info[:, self.latent_dim:]
+            latent_sample = reparametrize(mu, logvar)
+            latent_input = self.latent_out_proj(latent_sample)
+        else:
+            # inference: no action sequence to encode, so use an all-zero latent
+            mu = logvar = None
+            latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device)
+            latent_input = self.latent_out_proj(latent_sample)
+        if self.backbones is not None:
+            # Image observation features and position embeddings
+            all_cam_features = []
+            all_cam_pos = []
+            if is_training:
+                # everything after index 0 on dim 1 becomes the next-frame target
+                next_frame_images = image[:,1:]
+                image = image[:,:1]
+            # NOTE(review): each iteration overwrites the previous MAE outputs, so only the last
+            # entry of camera_names reaches the transformer -- confirm single-camera use is intended.
+            for cam_id, cam_name in enumerate(self.camera_names):
+                # features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED
+                # features = features[0] # take the last layer feature
+                # pos = pos[0]
+                # all_cam_features.append(self.input_proj(features))
+                # all_cam_pos.append(pos)
+                obs_embedings, patch_embedings, pos_mae = self.model_mae(image[:,cam_id])
+            # proprioception features
+            proprio_input = self.input_proj_robot_state(qpos)
+            # fold camera dimension into width dimension
+            # src = torch.cat(all_cam_features, axis=3)
+            # pos = torch.cat(all_cam_pos, axis=3)
+            # action queries first, then the patch queries used for next-frame prediction
+            query_embed = torch.cat([self.query_embed.weight, self.patch_embed.weight], axis=0)
+            # pos_mae[0,1:] skips the first positional entry (presumably the CLS slot -- TODO confirm)
+            hs = self.transformer(patch_embedings, None, query_embed, pos_mae[0,1:], latent_input, proprio_input, self.additional_pos_embed.weight)[0]
+            # hs = self.transformer(src, None, self.query_embed.weight, pos, latent_input, proprio_input, self.additional_pos_embed.weight)[0]
+        else:
+            # NOTE(review): this branch never defines next_frame_images, so the training branch at
+            # the end of this method would raise NameError here -- presumably unused; confirm.
+            qpos = self.input_proj_robot_state(qpos)
+            env_state = self.input_proj_env_state(env_state)
+            transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2
+            hs = self.transformer(transformer_input, None, self.query_embed.weight, self.pos.weight)[0]
+        # the first num_queries output tokens feed the action and padding heads
+        a_hat = self.action_head(hs[:,:self.num_queries])
+        is_pad_hat = self.is_pad_head(hs[:,:self.num_queries])
+        # next frame prediction
+        mask_token = self.mask_token
+        mask_tokens = mask_token.repeat(bs, self.n_patch, 1)
+        mask_tokens = mask_tokens + self.decoder_pos_embed.repeat(bs, 1, 1)
+        # the remaining output tokens (those after the action queries) condition the pixel decoder
+        obs_pred = self.decoder_embed(hs[:,self.num_queries:])
+        obs_pred_ = torch.cat([obs_pred, mask_tokens], dim=1)
+        for blk in self.decoder_blocks:
+            obs_pred_ = blk(obs_pred_)
+        obs_pred_ = self.decoder_norm(obs_pred_)
+        obs_preds = self.decoder_pred(obs_pred_)
+        # drop the first n_patch tokens; presumably the conditioning tokens, leaving the
+        # n_patch mask-token positions -- TODO confirm the transformer's output length
+        obs_preds = obs_preds[:,self.n_patch:]
+        if is_training:
+            # next_frame_images = image[:,1:]
+            # next_frame_images = nn.functional.interpolate(next_frame_images[:,0], size=(self.img_h, self.img_w))
+            # no resizing here: the reshape below requires frames of exactly img_h x img_w
+            next_frame_images = next_frame_images[:,0]
+            p = self.patch_size
+            h_p = self.img_h // p
+            w_p = self.img_w // p
+            # split the frame into non-overlapping p x p patches and flatten each patch
+            obs_targets = next_frame_images.reshape(shape=(bs, 3, h_p, p, w_p, p))
+            obs_targets = obs_targets.permute(0,2,4,3,5,1)
+            obs_targets = obs_targets.reshape(shape=(bs, h_p*w_p, (p**2)*3))
+        else:
+            # no ground truth at inference; return a zero placeholder shaped like the prediction
+            obs_targets = torch.zeros_like(obs_preds)
+        return a_hat, is_pad_hat, [mu, logvar], [obs_preds, obs_targets]
+class CNNMLP(nn.Module):
+    def __init__(self, backbones, state_dim, camera_names):
+        """ Initializes the model.
+        Parameters:
+            backbones: torch module of the backbone to be used. See backbone.py
+            transformer: torch module of the transformer architecture. See transformer.py
+            state_dim: robot state dimension of the environment
+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
+                         DETR can detect in a single image. For COCO, we recommend 100 queries.
+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+        """
+        super().__init__()
+        self.camera_names = camera_names
+        self.action_head = nn.Linear(1000, state_dim) # TODO add more
+        if backbones is not None:
+            self.backbones = nn.ModuleList(backbones)
+            backbone_down_projs = []
+            for backbone in backbones:
+                down_proj = nn.Sequential(
+                    nn.Conv2d(backbone.num_channels, 128, kernel_size=5),
+                    nn.Conv2d(128, 64, kernel_size=5),
+                    nn.Conv2d(64, 32, kernel_size=5)
+                )
+                backbone_down_projs.append(down_proj)
+            self.backbone_down_projs = nn.ModuleList(backbone_down_projs)
+            mlp_in_dim = 768 * len(backbones) + 14
+            self.mlp = mlp(input_dim=mlp_in_dim, hidden_dim=1024, output_dim=14, hidden_depth=2)
+        else:
+            raise NotImplementedError
+    def forward(self, qpos, image, env_state, actions=None):
+        """
+        qpos: batch, qpos_dim
+        image: batch, num_cam, channel, height, width
+        env_state: None
+        actions: batch, seq, action_dim
+        """
+        is_training = actions is not None # train or val
+        bs, _ = qpos.shape
+        # Image observation features and position embeddings
+        all_cam_features = []
+        for cam_id, cam_name in enumerate(self.camera_names):
+            features, pos = self.backbones[cam_id](image[:, cam_id])
+            features = features[0] # take the last layer feature
+            pos = pos[0] # not used
+            all_cam_features.append(self.backbone_down_projs[cam_id](features))
+        # flatten everything
+        flattened_features = []
+        for cam_feature in all_cam_features:
+            flattened_features.append(cam_feature.reshape([bs, -1]))
+        flattened_features = torch.cat(flattened_features, axis=1) # 768 each
+        features = torch.cat([flattened_features, qpos], axis=1) # qpos: 14
+        a_hat = self.mlp(features)
+        return a_hat
+def mlp(input_dim, hidden_dim, output_dim, hidden_depth):
+    if hidden_depth == 0:
+        mods = [nn.Linear(input_dim, output_dim)]
+    else:
+        mods = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)]
+        for i in range(hidden_depth - 1):
+            mods += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)]
+        mods.append(nn.Linear(hidden_dim, output_dim))
+    trunk = nn.Sequential(*mods)
+    return trunk
+def build_encoder(args):
+    d_model = args.hidden_dim # 256
+    dropout = args.dropout # 0.1
+    nhead = args.nheads # 8
+    dim_feedforward = args.dim_feedforward # 2048
+    num_encoder_layers = args.enc_layers # 4 # TODO shared with VAE decoder
+    normalize_before = args.pre_norm # False
+    activation = "relu"
+    encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
+                                            dropout, activation, normalize_before)
+    encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+    encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+    return encoder
+def build(args):
+    state_dim = 14 # TODO hardcode
+    # From state
+    # backbone = None # from state for now, no need for conv nets
+    # From image
+    backbones = []
+    backbone = build_backbone(args)
+    backbones.append(backbone)
+    transformer = build_transformer(args)
+    encoder = build_encoder(args)
+    if not args.mae:
+        model = DETRVAE(
+            backbones,
+            transformer,
+            encoder,
+            state_dim=state_dim,
+            num_queries=args.num_queries,
+            camera_names=args.camera_names,
+        )
+    else:
+        model = DETRVAE_MAE(
+            backbones,
+            transformer,
+            encoder,
+            state_dim=state_dim,
+            num_queries=args.num_queries,
+            camera_names=args.camera_names,
+        )
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print("number of parameters: %.2fM" % (n_parameters/1e6,))
+    return model
+def build_cnnmlp(args):
+    state_dim = 14 # TODO hardcode
+    # From state
+    # backbone = None # from state for now, no need for conv nets
+    # From image
+    backbones = []
+    for _ in args.camera_names:
+        backbone = build_backbone(args)
+        backbones.append(backbone)
+    model = CNNMLP(
+        backbones,
+        state_dim=state_dim,
+        camera_names=args.camera_names,
+    )
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print("number of parameters: %.2fM" % (n_parameters/1e6,))
+    return model

ACT_DP_multitask/detr/models/mask_former/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from . import data  # register all new datasets
+from . import modeling
+# config
+from .config import add_mask_former_config
+# dataset loading
+from .data.dataset_mappers.detr_panoptic_dataset_mapper import DETRPanopticDatasetMapper
+from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
+    MaskFormerPanopticDatasetMapper,
+)
+from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
+    MaskFormerSemanticDatasetMapper,
+)
+# models
+from .mask_former_model import MaskFormer
+from .test_time_augmentation import SemanticSegmentorWithTTA

ACT_DP_multitask/detr/models/mask_former/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (725 Bytes). View file

ACT_DP_multitask/detr/models/mask_former/config.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.config import CfgNode as CN
+def add_mask_former_config(cfg):
+    """
+    Add config for MASK_FORMER.
+    Mutates ``cfg`` in place and returns nothing. New ``MODEL.MASK_FORMER``,
+    ``MODEL.MASK_FORMER.TEST`` and ``MODEL.SWIN`` nodes are created here; the ``INPUT``,
+    ``INPUT.CROP``, ``SOLVER`` and ``MODEL.SEM_SEG_HEAD`` nodes must already exist on ``cfg``.
+    """
+    # data config
+    # select the dataset mapper
+    cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
+    # Color augmentation
+    cfg.INPUT.COLOR_AUG_SSD = False
+    # We retry random cropping until no single category in semantic segmentation GT occupies more
+    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
+    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
+    # Pad image and segmentation GT in dataset mapper.
+    cfg.INPUT.SIZE_DIVISIBILITY = -1
+    # solver config
+    # weight decay on embedding
+    cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
+    # optimizer
+    cfg.SOLVER.OPTIMIZER = "ADAMW"
+    cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
+    # mask_former model config
+    cfg.MODEL.MASK_FORMER = CN()
+    # loss
+    cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
+    cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
+    cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
+    cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
+    # transformer config
+    cfg.MODEL.MASK_FORMER.NHEADS = 8
+    cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
+    cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
+    cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
+    cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
+    cfg.MODEL.MASK_FORMER.PRE_NORM = False
+    cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
+    cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
+    cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
+    cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
+    # mask_former inference config
+    cfg.MODEL.MASK_FORMER.TEST = CN()
+    cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
+    cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
+    cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
+    cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
+    # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
+    # you can use this config to override
+    cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
+    # pixel decoder config
+    cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
+    # adding transformer in pixel decoder
+    cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
+    # pixel decoder
+    cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
+    # swin transformer backbone
+    cfg.MODEL.SWIN = CN()
+    cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
+    cfg.MODEL.SWIN.PATCH_SIZE = 4
+    cfg.MODEL.SWIN.EMBED_DIM = 96
+    # DEPTHS, NUM_HEADS and OUT_FEATURES each list four entries (presumably one per stage)
+    cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
+    cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
+    cfg.MODEL.SWIN.WINDOW_SIZE = 7
+    cfg.MODEL.SWIN.MLP_RATIO = 4.0
+    cfg.MODEL.SWIN.QKV_BIAS = True
+    cfg.MODEL.SWIN.QK_SCALE = None
+    cfg.MODEL.SWIN.DROP_RATE = 0.0
+    cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
+    cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
+    cfg.MODEL.SWIN.APE = False
+    cfg.MODEL.SWIN.PATCH_NORM = True
+    cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]

ACT_DP_multitask/detr/models/mask_former/mask_former_model.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import Tuple
+import torch
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.data import MetadataCatalog
+from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
+from detectron2.modeling.backbone import Backbone
+from detectron2.modeling.postprocessing import sem_seg_postprocess
+from detectron2.structures import ImageList
+from .modeling.criterion import SetCriterion
+from .modeling.matcher import HungarianMatcher
+@META_ARCH_REGISTRY.register()
+class MaskFormer(nn.Module):
+    """
+    Main class for mask classification semantic segmentation architectures.
+    """
+    @configurable
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        sem_seg_head: nn.Module,
+        criterion: nn.Module,
+        num_queries: int,
+        panoptic_on: bool,
+        object_mask_threshold: float,
+        overlap_threshold: float,
+        metadata,
+        size_divisibility: int,
+        sem_seg_postprocess_before_inference: bool,
+        pixel_mean: Tuple[float],
+        pixel_std: Tuple[float],
+    ):
+        """
+        Args:
+            backbone: a backbone module, must follow detectron2's backbone interface
+            sem_seg_head: a module that predicts semantic segmentation from backbone features
+            criterion: a module that defines the loss
+            num_queries: int, number of queries
+            panoptic_on: bool, whether to output panoptic segmentation prediction
+            object_mask_threshold: float, threshold to filter query based on classification score
+                for panoptic segmentation inference
+            overlap_threshold: overlap threshold used in general inference for panoptic segmentation
+            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
+                segmentation inference
+            size_divisibility: Some backbones require the input height and width to be divisible by a
+                specific integer. We can use this to override such requirement.
+            sem_seg_postprocess_before_inference: whether to resize the prediction back
+                to original input size before semantic segmentation inference or after.
+                For high-resolution dataset like Mapillary, resizing predictions before
+                inference will cause OOM error.
+            pixel_mean, pixel_std: list or tuple with #channels element, representing
+                the per-channel mean and std to be used to normalize the input image
+        """
+        super().__init__()
+        self.backbone = backbone
+        self.sem_seg_head = sem_seg_head
+        self.criterion = criterion
+        self.num_queries = num_queries
+        self.overlap_threshold = overlap_threshold
+        self.panoptic_on = panoptic_on
+        self.object_mask_threshold = object_mask_threshold
+        self.metadata = metadata
+        if size_divisibility < 0:
+            # use backbone size_divisibility if not set
+            size_divisibility = self.backbone.size_divisibility
+        self.size_divisibility = size_divisibility
+        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
+        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
+    @classmethod
+    def from_config(cls, cfg):
+        backbone = build_backbone(cfg)
+        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
+        # Loss parameters:
+        deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
+        no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
+        dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
+        mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT
+        # building criterion
+        matcher = HungarianMatcher(
+            cost_class=1,
+            cost_mask=mask_weight,
+            cost_dice=dice_weight,
+        )
+        weight_dict = {"loss_ce": 1, "loss_mask": mask_weight, "loss_dice": dice_weight}
+        if deep_supervision:
+            dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
+            aux_weight_dict = {}
+            for i in range(dec_layers - 1):
+                aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+            weight_dict.update(aux_weight_dict)
+        losses = ["labels", "masks"]
+        criterion = SetCriterion(
+            sem_seg_head.num_classes,
+            matcher=matcher,
+            weight_dict=weight_dict,
+            eos_coef=no_object_weight,
+            losses=losses,
+        )
+        return {
+            "backbone": backbone,
+            "sem_seg_head": sem_seg_head,
+            "criterion": criterion,
+            "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES,
+            "panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON,
+            "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
+            "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
+            "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
+            "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
+            "sem_seg_postprocess_before_inference": (
+                cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
+                or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON
+            ),
+            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+            "pixel_std": cfg.MODEL.PIXEL_STD,
+        }
+    @property
+    def device(self):
+        return self.pixel_mean.device
+    def forward(self, batched_inputs):
+        """
+        Normalise and batch the images, run backbone and head, then either compute the
+        weighted losses (training) or post-process per-image predictions (inference).
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+                   * "image": Tensor, image in (C, H, W) format.
+                   * "instances": per-region ground truth
+                   * Other information that's included in the original dicts, such as:
+                     "height", "width" (int): the output resolution of the model (may be different
+                     from input resolution), used in inference.
+        Returns:
+            In training mode: dict mapping loss names to loss values, already multiplied by
+            their weights; losses without an entry in the criterion's ``weight_dict`` are dropped.
+            Otherwise, list[dict]:
+                each dict has the results for one image. The dict contains the following keys:
+                * "sem_seg":
+                    A Tensor that represents the
+                    per-pixel segmentation predicted by the head.
+                    The prediction has shape KxHxW that represents the logits of
+                    each class for each pixel.
+                * "panoptic_seg" (only when ``self.panoptic_on``):
+                    A tuple that represent panoptic output
+                    panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
+                    segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+                        Each dict contains keys "id", "category_id", "isthing".
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        # per-channel normalisation; the buffers are (C, 1, 1) and broadcast over H and W
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.size_divisibility)
+        features = self.backbone(images.tensor)
+        outputs = self.sem_seg_head(features)
+        if self.training:
+            # mask classification target
+            if "instances" in batched_inputs[0]:
+                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+                targets = self.prepare_targets(gt_instances, images)
+            else:
+                # NOTE(review): None is passed straight to the criterion below; whether it
+                # accepts None is not visible here -- confirm.
+                targets = None
+            # bipartite matching-based loss
+            losses = self.criterion(outputs, targets)
+            # iterate over a snapshot of the keys because entries may be popped in the loop
+            for k in list(losses.keys()):
+                if k in self.criterion.weight_dict:
+                    # augmented assignment: scales the stored loss value by its weight
+                    losses[k] *= self.criterion.weight_dict[k]
+                else:
+                    # remove this loss if not specified in `weight_dict`
+                    losses.pop(k)
+            return losses
+        else:
+            mask_cls_results = outputs["pred_logits"]
+            mask_pred_results = outputs["pred_masks"]
+            # upsample masks to the padded batch resolution
+            mask_pred_results = F.interpolate(
+                mask_pred_results,
+                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
+                mode="bilinear",
+                align_corners=False,
+            )
+            processed_results = []
+            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
+                mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
+            ):
+                # fall back to the image's size in the batch when no output size is given
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+                if self.sem_seg_postprocess_before_inference:
+                    mask_pred_result = sem_seg_postprocess(
+                        mask_pred_result, image_size, height, width
+                    )
+                # semantic segmentation inference
+                r = self.semantic_inference(mask_cls_result, mask_pred_result)
+                if not self.sem_seg_postprocess_before_inference:
+                    r = sem_seg_postprocess(r, image_size, height, width)
+                processed_results.append({"sem_seg": r})
+                # panoptic segmentation inference
+                if self.panoptic_on:
+                    panoptic_r = self.panoptic_inference(mask_cls_result, mask_pred_result)
+                    processed_results[-1]["panoptic_seg"] = panoptic_r
+            return processed_results
+    def prepare_targets(self, targets, images):
+        h, w = images.tensor.shape[-2:]
+        new_targets = []
+        for targets_per_image in targets:
+            # pad gt
+            gt_masks = targets_per_image.gt_masks
+            padded_masks = torch.zeros((gt_masks.shape[0], h, w), dtype=gt_masks.dtype, device=gt_masks.device)
+            padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
+            new_targets.append(
+                {
+                    "labels": targets_per_image.gt_classes,
+                    "masks": padded_masks,
+                }
+            )
+        return new_targets
+    def semantic_inference(self, mask_cls, mask_pred):
+        mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
+        mask_pred = mask_pred.sigmoid()
+        semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
+        return semseg
+    def panoptic_inference(self, mask_cls, mask_pred):
+        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+        mask_pred = mask_pred.sigmoid()
+        keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
+        cur_scores = scores[keep]
+        cur_classes = labels[keep]
+        cur_masks = mask_pred[keep]
+        cur_mask_cls = mask_cls[keep]
+        cur_mask_cls = cur_mask_cls[:, :-1]
+        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+        h, w = cur_masks.shape[-2:]
+        panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
+        segments_info = []
+        current_segment_id = 0
+        if cur_masks.shape[0] == 0:
+            # We didn't detect any mask :(
+            return panoptic_seg, segments_info
+        else:
+            # take argmax
+            cur_mask_ids = cur_prob_masks.argmax(0)
+            stuff_memory_list = {}
+            for k in range(cur_classes.shape[0]):
+                pred_class = cur_classes[k].item()
+                isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
+                mask = cur_mask_ids == k
+                mask_area = mask.sum().item()
+                original_area = (cur_masks[k] >= 0.5).sum().item()
+                if mask_area > 0 and original_area > 0:
+                    if mask_area / original_area < self.overlap_threshold:
+                        continue
+                    # merge stuff regions
+                    if not isthing:
+                        if int(pred_class) in stuff_memory_list.keys():
+                            panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
+                            continue
+                        else:
+                            stuff_memory_list[int(pred_class)] = current_segment_id + 1
+                    current_segment_id += 1
+                    panoptic_seg[mask] = current_segment_id
+                    segments_info.append(
+                        {
+                            "id": current_segment_id,
+                            "isthing": bool(isthing),
+                            "category_id": int(pred_class),
+                        }
+                    )
+            return panoptic_seg, segments_info

ACT_DP_multitask/detr/models/mask_former/modeling/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from .backbone.swin import D2SwinTransformer
+from .heads.mask_former_head import MaskFormerHead
+from .heads.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
+from .heads.pixel_decoder import BasePixelDecoder

ACT_DP_multitask/detr/models/mask_former/modeling/backbone/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.

ACT_DP_multitask/detr/models/mask_former/modeling/backbone/swin.py ADDED Viewed

	@@ -0,0 +1,768 @@

+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu, Yutong Lin, Yixuan Wei
+# --------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
+class Mlp(nn.Module):
+    """Multilayer perceptron."""
+    def __init__(
+        self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+class WindowAttention(nn.Module):
+    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both shifted and non-shifted windows: ``forward`` takes an optional
+    additive mask that is applied to the attention logits before the softmax.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+    def __init__(
+        self,
+        dim,
+        window_size,
+        num_heads,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # any falsy qk_scale (None or 0) falls back to head_dim ** -0.5
+        self.scale = qk_scale or head_dim ** -0.5
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
+        )  # 2*Wh-1 * 2*Ww-1, nH
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        # scale the row offset by the number of possible column offsets so that
+        # the sum below maps each (row, col) offset pair to one table row
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        # registered as a buffer, not a parameter: it is a fixed lookup index
+        self.register_buffer("relative_position_index", relative_position_index)
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        trunc_normal_(self.relative_position_bias_table, std=0.02)
+        self.softmax = nn.Softmax(dim=-1)
+    def forward(self, x, mask=None):
+        """Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        Returns:
+            Tensor with the same (num_windows*B, N, C) shape as ``x``.
+        """
+        B_, N, C = x.shape
+        # one projection produces q, k and v; after the permute: (3, B_, nH, N, C // nH)
+        qkv = (
+            self.qkv(x)
+            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)  # B_, nH, N, N
+        # look up the learned bias for every (query token, key token) pair
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)
+        ].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1
+        ).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+        if mask is not None:
+            nW = mask.shape[0]
+            # un-fold the batch axis so the per-window mask broadcasts over batch and heads
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        # merge the heads back into the channel dimension
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class SwinTransformerBlock(nn.Module):
+    """Swin Transformer Block.
+    Windowed self attention followed by an MLP, each wrapped in a residual
+    connection with stochastic depth. The spatial size of the input is not a
+    ``forward`` argument: it is read from ``self.H`` / ``self.W``, which must be
+    set before every call.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        window_size=7,
+        shift_size=0,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        # stochastic depth is only instantiated when its rate is positive
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
+        )
+        # spatial size of the next input; assigned by the caller before forward()
+        self.H = None
+        self.W = None
+    def forward(self, x, mask_matrix):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C), where H and W are taken
+                from ``self.H`` and ``self.W``.
+            mask_matrix: Attention mask for cyclic shift; only used when
+                ``shift_size > 0``.
+        Returns:
+            Tensor of size (B, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        # F.pad lists the last dimension first: (C, C, W left, W right, H top, H bottom)
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size
+        )  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(
+            -1, self.window_size * self.window_size, C
+        )  # nW*B, window_size*window_size, C
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+        # crop away the rows/columns that were added by the padding above
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+        x = x.view(B, H * W, C)
+        # residual connection around the attention, then around the FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class PatchMerging(nn.Module):
+    """Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+    def forward(self, x, H, W):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        x = x.view(B, H, W, C)
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+class BasicLayer(nn.Module):
+    """A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+    def __init__(
+        self,
+        dim,
+        depth,
+        num_heads,
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        downsample=None,
+        use_checkpoint=False,
+    ):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList(
+            [
+                SwinTransformerBlock(
+                    dim=dim,
+                    num_heads=num_heads,
+                    window_size=window_size,
+                    shift_size=0 if (i % 2 == 0) else window_size // 2,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop,
+                    attn_drop=attn_drop,
+                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+    def forward(self, x, H, W):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (
+            slice(0, -self.window_size),
+            slice(-self.window_size, -self.shift_size),
+            slice(-self.shift_size, None),
+        )
+        w_slices = (
+            slice(0, -self.window_size),
+            slice(-self.window_size, -self.shift_size),
+            slice(-self.shift_size, None),
+        )
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+        mask_windows = window_partition(
+            img_mask, self.window_size
+        )  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
+            attn_mask == 0, float(0.0)
+        )
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+class PatchEmbed(nn.Module):
+    """Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, H, W = x.size()
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is not None:
+            Wh, Ww = x.size(2), x.size(3)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
+        return x
+class SwinTransformer(nn.Module):
+    """Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute postion embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+    def __init__(
+        self,
+        pretrain_img_size=224,
+        patch_size=4,
+        in_chans=3,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.2,
+        norm_layer=nn.LayerNorm,
+        ape=False,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        use_checkpoint=False,
+    ):
+        super().__init__()
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None,
+        )
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1],
+            ]
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
+            )
+            trunc_normal_(self.absolute_pos_embed, std=0.02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2 ** i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                use_checkpoint=use_checkpoint,
+            )
+            self.layers.append(layer)
+        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        self.num_features = num_features
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f"norm{i_layer}"
+            self.add_module(layer_name, layer)
+        self._freeze_stages()
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=0.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
+            )
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+        outs = {}
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f"norm{i}")
+                x_out = norm_layer(x_out)
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs["res{}".format(i + 2)] = out
+        return outs
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+@BACKBONE_REGISTRY.register()
+class D2SwinTransformer(SwinTransformer, Backbone):
+    def __init__(self, cfg, input_shape):
+        pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE
+        patch_size = cfg.MODEL.SWIN.PATCH_SIZE
+        in_chans = 3
+        embed_dim = cfg.MODEL.SWIN.EMBED_DIM
+        depths = cfg.MODEL.SWIN.DEPTHS
+        num_heads = cfg.MODEL.SWIN.NUM_HEADS
+        window_size = cfg.MODEL.SWIN.WINDOW_SIZE
+        mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO
+        qkv_bias = cfg.MODEL.SWIN.QKV_BIAS
+        qk_scale = cfg.MODEL.SWIN.QK_SCALE
+        drop_rate = cfg.MODEL.SWIN.DROP_RATE
+        attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE
+        drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE
+        norm_layer = nn.LayerNorm
+        ape = cfg.MODEL.SWIN.APE
+        patch_norm = cfg.MODEL.SWIN.PATCH_NORM
+        super().__init__(
+            pretrain_img_size,
+            patch_size,
+            in_chans,
+            embed_dim,
+            depths,
+            num_heads,
+            window_size,
+            mlp_ratio,
+            qkv_bias,
+            qk_scale,
+            drop_rate,
+            attn_drop_rate,
+            drop_path_rate,
+            norm_layer,
+            ape,
+            patch_norm,
+        )
+        self._out_features = cfg.MODEL.SWIN.OUT_FEATURES
+        self._out_feature_strides = {
+            "res2": 4,
+            "res3": 8,
+            "res4": 16,
+            "res5": 32,
+        }
+        self._out_feature_channels = {
+            "res2": self.num_features[0],
+            "res3": self.num_features[1],
+            "res4": self.num_features[2],
+            "res5": self.num_features[3],
+        }
+    def forward(self, x):
+        """
+        Args:
+            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
+        Returns:
+            dict[str->Tensor]: names and the corresponding features
+        """
+        assert (
+            x.dim() == 4
+        ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
+        outputs = {}
+        y = super().forward(x)
+        for k in y.keys():
+            if k in self._out_features:
+                outputs[k] = y[k]
+        return outputs
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+    @property
+    def size_divisibility(self):
+        return 32

ACT_DP_multitask/detr/models/mask_former/modeling/criterion.py ADDED Viewed

	@@ -0,0 +1,187 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+"""
+MaskFormer criterion.
+"""
+import torch
+import torch.nn.functional as F
+from torch import nn
+from detectron2.utils.comm import get_world_size
+from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list
+def dice_loss(inputs, targets, num_masks):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(-1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_masks
+def sigmoid_focal_loss(inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = -1 (no weighting).
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+    return loss.mean(1).sum() / num_masks
+class SetCriterion(nn.Module):
+    """This class computes the loss for DETR.
+    The process happens in two steps:
+        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
+        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
+    """
+    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
+        """Create the criterion.
+        Parameters:
+            num_classes: number of object categories, omitting the special no-object category
+            matcher: module able to compute a matching between targets and proposals
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+            eos_coef: relative classification weight applied to the no-object category
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.weight_dict = weight_dict
+        self.eos_coef = eos_coef
+        self.losses = losses
+        empty_weight = torch.ones(self.num_classes + 1)
+        empty_weight[-1] = self.eos_coef
+        self.register_buffer("empty_weight", empty_weight)
+    def loss_labels(self, outputs, targets, indices, num_masks):
+        """Classification loss (NLL)
+        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
+        """
+        assert "pred_logits" in outputs
+        src_logits = outputs["pred_logits"]
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_o
+        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
+        losses = {"loss_ce": loss_ce}
+        return losses
+    def loss_masks(self, outputs, targets, indices, num_masks):
+        """Compute the losses related to the masks: the focal loss and the dice loss.
+        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
+        """
+        assert "pred_masks" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        src_masks = outputs["pred_masks"]
+        src_masks = src_masks[src_idx]
+        masks = [t["masks"] for t in targets]
+        # TODO use valid to mask invalid areas due to padding in loss
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.to(src_masks)
+        target_masks = target_masks[tgt_idx]
+        # upsample predictions to the target size
+        src_masks = F.interpolate(
+            src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
+        )
+        src_masks = src_masks[:, 0].flatten(1)
+        target_masks = target_masks.flatten(1)
+        target_masks = target_masks.view(src_masks.shape)
+        losses = {
+            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_masks),
+            "loss_dice": dice_loss(src_masks, target_masks, num_masks),
+        }
+        return losses
+    def _get_src_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+    def _get_tgt_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+    def get_loss(self, loss, outputs, targets, indices, num_masks):
+        loss_map = {"labels": self.loss_labels, "masks": self.loss_masks}
+        assert loss in loss_map, f"do you really want to compute {loss} loss?"
+        return loss_map[loss](outputs, targets, indices, num_masks)
+    def forward(self, outputs, targets):
+        """This performs the loss computation.
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        num_masks = sum(len(t["labels"]) for t in targets)
+        num_masks = torch.as_tensor(
+            [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
+        )
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_masks)
+        num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_masks))
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "aux_outputs" in outputs:
+            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
+                indices = self.matcher(aux_outputs, targets)
+                for loss in self.losses:
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+        return losses

ACT_DP_multitask/detr/models/mask_former/modeling/heads/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.

ACT_DP_multitask/detr/models/mask_former/modeling/heads/mask_former_head.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from copy import deepcopy
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer.transformer_predictor import TransformerPredictor
+from .pixel_decoder import build_pixel_decoder
+@SEM_SEG_HEADS_REGISTRY.register()
+class MaskFormerHead(nn.Module):
+    _version = 2
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
+                    newk = k.replace(prefix, prefix + "pixel_decoder.")
+                    # logger.debug(f"{k} ==> {newk}")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        num_classes: int,
+        pixel_decoder: nn.Module,
+        loss_weight: float = 1.0,
+        ignore_value: int = -1,
+        # extra parameters
+        transformer_predictor: nn.Module,
+        transformer_in_feature: str,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            num_classes: number of classes to predict
+            pixel_decoder: the pixel decoder module
+            loss_weight: loss weight
+            ignore_value: category id to be ignored during training.
+            transformer_predictor: the transformer decoder that makes prediction
+            transformer_in_feature: input feature name to the transformer_predictor
+        """
+        super().__init__()
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+        self.ignore_value = ignore_value
+        self.common_stride = 4
+        self.loss_weight = loss_weight
+        self.pixel_decoder = pixel_decoder
+        self.predictor = transformer_predictor
+        self.transformer_in_feature = transformer_in_feature
+        self.num_classes = num_classes
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        return {
+            "input_shape": {
+                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+            },
+            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
+            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
+            "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
+            "transformer_predictor": TransformerPredictor(
+                cfg,
+                cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+                if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder"
+                else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels,
+                mask_classification=True,
+            ),
+        }
+    def forward(self, features):
+        return self.layers(features)
+    def layers(self, features):
+        mask_features, transformer_encoder_features = self.pixel_decoder.forward_features(features)
+        if self.transformer_in_feature == "transformer_encoder":
+            assert (
+                transformer_encoder_features is not None
+            ), "Please use the TransformerEncoderPixelDecoder."
+            predictions = self.predictor(transformer_encoder_features, mask_features)
+        else:
+            predictions = self.predictor(features[self.transformer_in_feature], mask_features)
+        return predictions

ACT_DP_multitask/detr/models/mask_former/modeling/heads/per_pixel_baseline.py ADDED Viewed

	@@ -0,0 +1,243 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer.transformer_predictor import TransformerPredictor
+from .pixel_decoder import build_pixel_decoder
+@SEM_SEG_HEADS_REGISTRY.register()
+class PerPixelBaselineHead(nn.Module):
+    _version = 2
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            logger = logging.getLogger(__name__)
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
+                    newk = k.replace(prefix, prefix + "pixel_decoder.")
+                    # logger.warning(f"{k} ==> {newk}")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        num_classes: int,
+        pixel_decoder: nn.Module,
+        loss_weight: float = 1.0,
+        ignore_value: int = -1,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            num_classes: number of classes to predict
+            pixel_decoder: the pixel decoder module
+            loss_weight: loss weight
+            ignore_value: category id to be ignored during training.
+        """
+        super().__init__()
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+        self.ignore_value = ignore_value
+        self.common_stride = 4
+        self.loss_weight = loss_weight
+        self.pixel_decoder = pixel_decoder
+        self.predictor = Conv2d(
+            self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0
+        )
+        weight_init.c2_msra_fill(self.predictor)
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        return {
+            "input_shape": {
+                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+            },
+            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
+            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
+        }
+    def forward(self, features, targets=None):
+        """
+        Returns:
+            In training, returns (None, dict of losses)
+            In inference, returns (CxHxW logits, {})
+        """
+        x = self.layers(features)
+        if self.training:
+            return None, self.losses(x, targets)
+        else:
+            x = F.interpolate(
+                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+            )
+            return x, {}
+    def layers(self, features):
+        x, _ = self.pixel_decoder.forward_features(features)
+        x = self.predictor(x)
+        return x
+    def losses(self, predictions, targets):
+        predictions = predictions.float()  # https://github.com/pytorch/pytorch/issues/48163
+        predictions = F.interpolate(
+            predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+        )
+        loss = F.cross_entropy(
+            predictions, targets, reduction="mean", ignore_index=self.ignore_value
+        )
+        losses = {"loss_sem_seg": loss * self.loss_weight}
+        return losses
+@SEM_SEG_HEADS_REGISTRY.register()
+class PerPixelBaselinePlusHead(PerPixelBaselineHead):
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
+                    newk = k.replace(prefix, prefix + "pixel_decoder.")
+                    logger.debug(f"{k} ==> {newk}")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        # extra parameters
+        transformer_predictor: nn.Module,
+        transformer_in_feature: str,
+        deep_supervision: bool,
+        # inherit parameters
+        num_classes: int,
+        pixel_decoder: nn.Module,
+        loss_weight: float = 1.0,
+        ignore_value: int = -1,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            transformer_predictor: the transformer decoder that makes prediction
+            transformer_in_feature: input feature name to the transformer_predictor
+            deep_supervision: whether or not to add supervision to the output of
+                every transformer decoder layer
+            num_classes: number of classes to predict
+            pixel_decoder: the pixel decoder module
+            loss_weight: loss weight
+            ignore_value: category id to be ignored during training.
+        """
+        super().__init__(
+            input_shape,
+            num_classes=num_classes,
+            pixel_decoder=pixel_decoder,
+            loss_weight=loss_weight,
+            ignore_value=ignore_value,
+        )
+        del self.predictor
+        self.predictor = transformer_predictor
+        self.transformer_in_feature = transformer_in_feature
+        self.deep_supervision = deep_supervision
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = super().from_config(cfg, input_shape)
+        ret["transformer_in_feature"] = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE
+        if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
+            in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        else:
+            in_channels = input_shape[ret["transformer_in_feature"]].channels
+        ret["transformer_predictor"] = TransformerPredictor(
+            cfg, in_channels, mask_classification=False
+        )
+        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
+        return ret
+    def forward(self, features, targets=None):
+        """
+        Returns:
+            In training, returns (None, dict of losses)
+            In inference, returns (CxHxW logits, {})
+        """
+        x, aux_outputs = self.layers(features)
+        if self.training:
+            if self.deep_supervision:
+                losses = self.losses(x, targets)
+                for i, aux_output in enumerate(aux_outputs):
+                    losses["loss_sem_seg" + f"_{i}"] = self.losses(
+                        aux_output["pred_masks"], targets
+                    )["loss_sem_seg"]
+                return None, losses
+            else:
+                return None, self.losses(x, targets)
+        else:
+            x = F.interpolate(
+                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+            )
+            return x, {}
+    def layers(self, features):
+        mask_features, transformer_encoder_features = self.pixel_decoder.forward_features(features)
+        if self.transformer_in_feature == "transformer_encoder":
+            assert (
+                transformer_encoder_features is not None
+            ), "Please use the TransformerEncoderPixelDecoder."
+            predictions = self.predictor(transformer_encoder_features, mask_features)
+        else:
+            predictions = self.predictor(features[self.transformer_in_feature], mask_features)
+        if self.deep_supervision:
+            return predictions["pred_masks"], predictions["aux_outputs"]
+        else:
+            return predictions["pred_masks"], None

ACT_DP_multitask/detr/models/mask_former/modeling/heads/pixel_decoder.py ADDED Viewed

	@@ -0,0 +1,294 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer.position_encoding import PositionEmbeddingSine
+from ..transformer.transformer import TransformerEncoder, TransformerEncoderLayer
+def build_pixel_decoder(cfg, input_shape):
+    """
+    Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`.
+    """
+    name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME
+    model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
+    forward_features = getattr(model, "forward_features", None)
+    if not callable(forward_features):
+        raise ValueError(
+            "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
+            f"Please implement forward_features for {name} to only return mask features."
+        )
+    return model
+@SEM_SEG_HEADS_REGISTRY.register()
+class BasePixelDecoder(nn.Module):
+    """FPN-style pixel decoder producing ``mask_dim``-channel mask features.
+    NOTE(review): ``@configurable`` and ``from_config`` are commented out below, so this
+    class takes (input_shape, conv_dim, mask_dim, norm) directly, while
+    ``build_pixel_decoder`` calls registered classes as ``cls(cfg, input_shape)``.
+    Registering this base class by name therefore looks incompatible; confirm intent.
+    """
+    # @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        # *,
+        conv_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            conv_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__()
+        # Sort features by increasing stride (highest resolution first).
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
+        feature_channels = [v.channels for k, v in input_shape]
+        lateral_convs = []
+        output_convs = []
+        # Convs get a bias only when norm is the empty string.
+        # NOTE(review): with the default norm=None this is False; confirm that is intended.
+        use_bias = norm == ""
+        for idx, in_channels in enumerate(feature_channels):
+            if idx == len(self.in_features) - 1:
+                # Largest-stride feature: a single 3x3 conv and no lateral connection.
+                output_norm = get_norm(norm, conv_dim)
+                output_conv = Conv2d(
+                    in_channels,
+                    conv_dim,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=use_bias,
+                    norm=output_norm,
+                    activation=F.relu,
+                )
+                weight_init.c2_xavier_fill(output_conv)
+                self.add_module("layer_{}".format(idx + 1), output_conv)
+                lateral_convs.append(None)
+                output_convs.append(output_conv)
+            else:
+                # Other features: 1x1 lateral conv to conv_dim, then a 3x3 output conv.
+                lateral_norm = get_norm(norm, conv_dim)
+                output_norm = get_norm(norm, conv_dim)
+                lateral_conv = Conv2d(
+                    in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
+                )
+                output_conv = Conv2d(
+                    conv_dim,
+                    conv_dim,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=use_bias,
+                    norm=output_norm,
+                    activation=F.relu,
+                )
+                weight_init.c2_xavier_fill(lateral_conv)
+                weight_init.c2_xavier_fill(output_conv)
+                # The convs are registered via add_module; the lists below are plain Python lists.
+                self.add_module("adapter_{}".format(idx + 1), lateral_conv)
+                self.add_module("layer_{}".format(idx + 1), output_conv)
+                lateral_convs.append(lateral_conv)
+                output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+        self.mask_dim = mask_dim
+        # Final 3x3 conv projecting the fused features to mask_dim channels.
+        self.mask_features = Conv2d(
+            conv_dim,
+            mask_dim,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+        weight_init.c2_xavier_fill(self.mask_features)
+    # @classmethod
+    # def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+    #     ret = {}
+    #     ret["input_shape"] = {
+    #         k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+    #     }
+    #     ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+    #     ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+    #     ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
+    #     return ret
+    def forward_features(self, features):
+        """Fuse the input features top-down and return ``(mask_features, None)``.
+        Args:
+            features: dict mapping each name in ``self.in_features`` to its feature map.
+        Returns:
+            Tuple of the ``mask_dim``-channel mask features and None (this decoder
+            produces no transformer-encoder features).
+        """
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if lateral_conv is None:
+                # First iteration (largest stride): initializes y for the later levels.
+                y = output_conv(x)
+            else:
+                cur_fpn = lateral_conv(x)
+                # Following FPN implementation, we use nearest upsampling here
+                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
+                y = output_conv(y)
+        return self.mask_features(y), None
+    def forward(self, features, targets=None):
+        """Log a warning and delegate to ``forward_features``; ``targets`` is ignored."""
+        logger = logging.getLogger(__name__)
+        logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.")
+        return self.forward_features(features)
+class TransformerEncoderOnly(nn.Module):
+    def __init__(
+        self,
+        d_model=512,
+        nhead=8,
+        num_encoder_layers=6,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        encoder_layer = TransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+        self._reset_parameters()
+        self.d_model = d_model
+        self.nhead = nhead
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    def forward(self, src, mask, pos_embed):
+        # flatten NxCxHxW to HWxNxC
+        bs, c, h, w = src.shape
+        src = src.flatten(2).permute(2, 0, 1)
+        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
+        if mask is not None:
+            mask = mask.flatten(1)
+        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
+        return memory.permute(1, 2, 0).view(bs, c, h, w)
+@SEM_SEG_HEADS_REGISTRY.register()
+class TransformerEncoderPixelDecoder(BasePixelDecoder):
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        transformer_dropout: float,
+        transformer_nheads: int,
+        transformer_dim_feedforward: int,
+        transformer_enc_layers: int,
+        transformer_pre_norm: bool,
+        conv_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            transformer_dropout: dropout probability in transformer
+            transformer_nheads: number of heads in transformer
+            transformer_dim_feedforward: dimension of feedforward network
+            transformer_enc_layers: number of transformer encoder layers
+            transformer_pre_norm: whether to use pre-layernorm or not
+            conv_dims: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+        in_channels = feature_channels[len(self.in_features) - 1]
+        self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1)
+        weight_init.c2_xavier_fill(self.input_proj)
+        self.transformer = TransformerEncoderOnly(
+            d_model=conv_dim,
+            dropout=transformer_dropout,
+            nhead=transformer_nheads,
+            dim_feedforward=transformer_dim_feedforward,
+            num_encoder_layers=transformer_enc_layers,
+            normalize_before=transformer_pre_norm,
+        )
+        N_steps = conv_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+        # update layer
+        use_bias = norm == ""
+        output_norm = get_norm(norm, conv_dim)
+        output_conv = Conv2d(
+            conv_dim,
+            conv_dim,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias,
+            norm=output_norm,
+            activation=F.relu,
+        )
+        weight_init.c2_xavier_fill(output_conv)
+        delattr(self, "layer_{}".format(len(self.in_features)))
+        self.add_module("layer_{}".format(len(self.in_features)), output_conv)
+        self.output_convs[0] = output_conv
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = super().from_config(cfg, input_shape)
+        ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
+        ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
+        ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
+        ret[
+            "transformer_enc_layers"
+        ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS  # a separate config
+        ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
+        return ret
+    def forward_features(self, features):
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if lateral_conv is None:
+                transformer = self.input_proj(x)
+                pos = self.pe_layer(x)
+                transformer = self.transformer(transformer, None, pos)
+                y = output_conv(transformer)
+                # save intermediate feature as input to Transformer decoder
+                transformer_encoder_features = transformer
+            else:
+                cur_fpn = lateral_conv(x)
+                # Following FPN implementation, we use nearest upsampling here
+                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
+                y = output_conv(y)
+        return self.mask_features(y), transformer_encoder_features
+    def forward(self, features, targets=None):
+        logger = logging.getLogger(__name__)
+        logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.")
+        return self.forward_features(features)