init

Files changed (4) hide show

README.md +2 -0
model.config.json +650 -0
model.safetensors +3 -0
training_log.txt +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,5 @@
 ---
 license: mit
 ---

 ---
 license: mit
 ---
+This is the single-task model for the RoboTwin "block_stack_three" task, using Grounding-DINO-Tiny as the base model.

model.config.json ADDED Viewed

	@@ -0,0 +1,650 @@

+{
+    "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
+    "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
+    "backbone": {
+        "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
+        "embed_dims": 96,
+        "depths": [
+            2,
+            2,
+            6,
+            2
+        ],
+        "num_heads": [
+            3,
+            6,
+            12,
+            24
+        ],
+        "window_size": 7,
+        "mlp_ratio": 4,
+        "qkv_bias": true,
+        "qk_scale": null,
+        "drop_rate": 0.0,
+        "attn_drop_rate": 0.0,
+        "out_indices": [
+            1,
+            2,
+            3
+        ],
+        "with_cp": true,
+        "convert_weights": false
+    },
+    "decoder": {
+        "type": "robo_orchard_lab.models.sem_modules.action_decoder:SEMActionDecoder",
+        "img_cross_attn": {
+            "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
+            "embed_dims": 256,
+            "num_heads": 8,
+            "max_position_embeddings": 32
+        },
+        "norm_layer": {
+            "type": "torch.nn.modules.normalization:RMSNorm",
+            "normalized_shape": 256
+        },
+        "ffn": {
+            "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
+            "embed_dims": 256,
+            "feedforward_channels": 2048,
+            "act_cfg": {
+                "type": "torch.nn.modules.activation:SiLU",
+                "inplace": true
+            }
+        },
+        "head": {
+            "type": "robo_orchard_lab.models.sem_modules.layers:UpsampleHead",
+            "upsample_sizes": [
+                16,
+                32,
+                64
+            ],
+            "input_dim": 256,
+            "dims": [
+                128,
+                64,
+                8
+            ],
+            "norm": {
+                "type": "torch.nn.modules.normalization:RMSNorm",
+                "normalized_shape": 256
+            },
+            "act": {
+                "type": "torch.nn.modules.activation:SiLU",
+                "inplace": true
+            },
+            "norm_act_idx": [
+                0,
+                1,
+                2
+            ]
+        },
+        "training_noise_scheduler": {
+            "type": "diffusers.schedulers.scheduling_ddpm:DDPMScheduler",
+            "num_train_timesteps": 1000,
+            "beta_schedule": "squaredcos_cap_v2",
+            "prediction_type": "sample",
+            "clip_sample": false
+        },
+        "test_noise_scheduler": {
+            "type": "diffusers.schedulers.scheduling_dpmsolver_multistep:DPMSolverMultistepScheduler",
+            "num_train_timesteps": 1000,
+            "beta_schedule": "squaredcos_cap_v2",
+            "prediction_type": "sample"
+        },
+        "num_inference_timesteps": 10,
+        "joint_self_attn": {
+            "type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention",
+            "embed_dims": 256,
+            "num_heads": 8
+        },
+        "temp_cross_attn": {
+            "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
+            "embed_dims": 256,
+            "num_heads": 8,
+            "max_position_embeddings": 32
+        },
+        "text_cross_attn": {
+            "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
+            "embed_dims": 256,
+            "num_heads": 8,
+            "max_position_embeddings": 256
+        },
+        "pred_steps": 64,
+        "timestep_norm_layer": {
+            "type": "robo_orchard_lab.models.sem_modules.layers:AdaRMSNorm",
+            "normalized_shape": 256,
+            "condition_dims": 256,
+            "zero": true
+        },
+        "operation_order": [
+            "t_norm",
+            "joint_self_attn",
+            "gate_msa",
+            "norm",
+            "temp_cross_attn",
+            "norm",
+            "img_cross_attn",
+            "norm",
+            null,
+            null,
+            "scale_shift",
+            "ffn",
+            "gate_mlp",
+            "t_norm",
+            "joint_self_attn",
+            "gate_msa",
+            "norm",
+            "temp_cross_attn",
+            "norm",
+            "img_cross_attn",
+            "norm",
+            null,
+            null,
+            "scale_shift",
+            "ffn",
+            "gate_mlp",
+            "t_norm",
+            "joint_self_attn",
+            "gate_msa",
+            "norm",
+            "temp_cross_attn",
+            "norm",
+            "img_cross_attn",
+            "norm",
+            null,
+            null,
+            "scale_shift",
+            "ffn",
+            "gate_mlp",
+            "t_norm",
+            "joint_self_attn",
+            "gate_msa",
+            "norm",
+            "temp_cross_attn",
+            "norm",
+            "img_cross_attn",
+            "norm",
+            null,
+            null,
+            "scale_shift",
+            "ffn",
+            "gate_mlp",
+            "t_norm",
+            "joint_self_attn",
+            "gate_msa",
+            "norm",
+            "temp_cross_attn",
+            "norm",
+            "img_cross_attn",
+            "norm",
+            null,
+            null,
+            "scale_shift",
+            "ffn",
+            "gate_mlp",
+            "t_norm",
+            "joint_self_attn",
+            "gate_msa",
+            "norm",
+            "temp_cross_attn",
+            "norm",
+            "img_cross_attn",
+            "norm",
+            null,
+            null,
+            "scale_shift",
+            "ffn",
+            "gate_mlp"
+        ],
+        "feature_level": [
+            1,
+            2
+        ],
+        "act_cfg": {
+            "type": "torch.nn.modules.activation:SiLU",
+            "inplace": true
+        },
+        "robot_encoder": {
+            "type": "robo_orchard_lab.models.sem_modules.robot_state_encoder:SEMRobotStateEncoder",
+            "embed_dims": 256,
+            "chunk_size": 1,
+            "joint_self_attn": {
+                "type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention",
+                "embed_dims": 256,
+                "num_heads": 8
+            },
+            "norm_layer": {
+                "type": "torch.nn.modules.normalization:RMSNorm",
+                "normalized_shape": 256
+            },
+            "ffn": {
+                "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
+                "embed_dims": 256,
+                "feedforward_channels": 2048,
+                "act_cfg": {
+                    "type": "torch.nn.modules.activation:SiLU",
+                    "inplace": true
+                }
+            },
+            "temp_self_attn": {
+                "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
+                "embed_dims": 256,
+                "num_heads": 8,
+                "max_position_embeddings": 32
+            },
+            "act_cfg": {
+                "type": "torch.nn.modules.activation:SiLU",
+                "inplace": true
+            },
+            "operation_order": [
+                "norm",
+                "joint_self_attn",
+                null,
+                null,
+                "norm",
+                "ffn",
+                "norm",
+                "joint_self_attn",
+                null,
+                null,
+                "norm",
+                "ffn",
+                "norm",
+                "joint_self_attn",
+                null,
+                null,
+                "norm",
+                "ffn",
+                "norm",
+                "joint_self_attn",
+                null,
+                null,
+                "norm",
+                "ffn",
+                "norm"
+            ],
+            "state_dims": 8
+        },
+        "state_loss_weights": [
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                2.0,
+                2.0,
+                2.0,
+                0.2,
+                0.2,
+                0.2,
+                0.2
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                2.0,
+                2.0,
+                2.0,
+                0.2,
+                0.2,
+                0.2,
+                0.2
+            ]
+        ],
+        "fk_loss_weight": [
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                2.0,
+                2.0,
+                2.0,
+                0.2,
+                0.2,
+                0.2,
+                0.2
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                0.1,
+                0.1,
+                0.1,
+                0.1
+            ],
+            [
+                1.0,
+                2.0,
+                2.0,
+                2.0,
+                0.2,
+                0.2,
+                0.2,
+                0.2
+            ]
+        ],
+        "state_dims": 8
+    },
+    "neck": {
+        "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
+        "in_channels": [
+            192,
+            384,
+            768
+        ],
+        "kernel_size": 1,
+        "out_channels": 256,
+        "act_cfg": null,
+        "bias": true,
+        "norm_cfg": {
+            "type": "torch.nn.modules.normalization:GroupNorm",
+            "num_groups": 32
+        },
+        "num_outs": 3
+    },
+    "text_encoder": null,
+    "feature_enhancer": null,
+    "spatial_enhancer": {
+        "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
+        "embed_dims": 256,
+        "feature_3d_dim": 32,
+        "num_depth_layers": 2,
+        "min_depth": 0.01,
+        "max_depth": 1.2,
+        "num_depth": 128,
+        "with_feature_3d": true,
+        "loss_depth_weight": 1.0
+    },
+    "data_preprocessor": {
+        "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
+        "mean": [
+            123.675,
+            116.28,
+            103.53
+        ],
+        "std": [
+            58.395,
+            57.12,
+            57.375
+        ],
+        "channel_flip": false,
+        "unsqueeze_depth_channel": true,
+        "batch_transforms": [
+            {
+                "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
+                "min_depth": 0.01,
+                "max_depth": 1.2,
+                "num_depth": 128,
+                "origin_stride": 2,
+                "valid_threshold": 0.5,
+                "stride": [
+                    8,
+                    16,
+                    32
+                ]
+            }
+        ]
+    },
+    "backbone_3d": {
+        "type": "robo_orchard_lab.models.modules.resnet:ResNet",
+        "depth": 34,
+        "in_channels": 1,
+        "base_channels": 4,
+        "num_stages": 4,
+        "out_indices": [
+            1,
+            2,
+            3
+        ],
+        "bn_eval": true,
+        "with_cp": true,
+        "style": "pytorch"
+    },
+    "neck_3d": {
+        "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
+        "in_channels": [
+            8,
+            16,
+            32
+        ],
+        "kernel_size": 1,
+        "out_channels": 32,
+        "act_cfg": null,
+        "bias": true,
+        "norm_cfg": {
+            "type": "torch.nn.modules.normalization:GroupNorm",
+            "num_groups": 4
+        },
+        "num_outs": 3
+    },
+    "input_2d": "imgs",
+    "input_3d": "depths",
+    "embed_dims": 256,
+    "pre_spatial_enhancer": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d5372e1ff59d339a7da28a6352f96dcfb3b0f4fed558f6371af8bfa88ea4e29
+size 198329632

training_log.txt ADDED Viewed

The diff for this file is too large to render. See raw diff