mgovind7 committed on
Commit
e792dae
·
1 Parent(s): 2569eda

Add UniLACT model weights of 3 stages for calvin

Browse files
unilact_finetuned_on_calvin/config.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: unilact.src.models.unilact.UniLACT
2
+ model_lang:
3
+ _target_: transformers.T5EncoderModel.from_pretrained
4
+ pretrained_model_name_or_path: t5-base
5
+ model_vision:
6
+ _target_: unilact.src.models.mae_model.MaeEncoder
7
+ use_obs_feature: true
8
+ pretrained_model_name_or_path: facebook/vit-mae-large
9
+ model_causal_transformer:
10
+ _target_: unilact.src.models.trajectory_gpt2.GPT2Model
11
+ config:
12
+ _target_: unilact.src.models.trajectory_gpt2.GPT2Config
13
+ vocab_size: 1
14
+ n_embd: 768
15
+ n_layer: 12
16
+ n_head: 12
17
+ activation_function: relu
18
+ dropout: 0.1
19
+ n_positions: 1024
20
+ act_dim: 7
21
+ hidden_size: 768
22
+ sequence_length: 2
23
+ chunk_size: 5
24
+ per_latent_motion_len: 8
25
+ latent_motion_codebook_size: 128
26
+ latent_motion_pred: true
27
+ act_pred: true
28
+ img_feat_dim: 1024
29
+ patch_feat_dim: 1024
30
+ lang_feat_dim: 768
31
+ mask_latent_motion_probability: 0.5
32
+ freeze_lang: true
33
+ freeze_vision: true
34
+ use_latent_motion_pos_embedding: true
35
+ pred_tokens_modality: unified
36
+ output_modality_tokens: same-modal
37
+
unilact_finetuned_on_calvin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c7319ea0a4ab81e93467da59003b8926884180d5f936eb86fe2c7b467463944
3
+ size 364206482
unilact_pretrained_on_calvin/config.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: unilact.src.models.unilact.UniLACT
2
+ model_lang:
3
+ _target_: transformers.T5EncoderModel.from_pretrained
4
+ pretrained_model_name_or_path: t5-base
5
+ model_vision:
6
+ _target_: unilact.src.models.mae_model.MaeEncoder
7
+ use_obs_feature: true
8
+ pretrained_model_name_or_path: facebook/vit-mae-large
9
+ model_causal_transformer:
10
+ _target_: unilact.src.models.trajectory_gpt2.GPT2Model
11
+ config:
12
+ _target_: unilact.src.models.trajectory_gpt2.GPT2Config
13
+ vocab_size: 1
14
+ n_embd: 768
15
+ n_layer: 12
16
+ n_head: 12
17
+ activation_function: relu
18
+ dropout: 0.1
19
+ n_positions: 1024
20
+ act_dim: 7
21
+ hidden_size: 768
22
+ sequence_length: 2
23
+ chunk_size: 5
24
+ per_latent_motion_len: 8
25
+ latent_motion_codebook_size: 128
26
+ latent_motion_pred: true
27
+ act_pred: true
28
+ img_feat_dim: 1024
29
+ patch_feat_dim: 1024
30
+ lang_feat_dim: 768
31
+ mask_latent_motion_probability: 0.5
32
+ freeze_lang: true
33
+ freeze_vision: true
34
+ use_latent_motion_pos_embedding: true
35
+ pred_tokens_modality: unified
36
+ output_modality_tokens: cross-modal
unilact_pretrained_on_calvin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28af0ea019db6ad36e6c109582afd97f95f0b14f819b097182ea1431a1e0e064
3
+ size 364172042
unilarn_trained_on_calvin/config.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: unilarn.src.models.unilarn.UniLARN
2
+ codebook_dim: 32
3
+ commit_loss_w: 1.0
4
+ recon_loss_w: 1.0
5
+ recon_depth_loss_w: 1.0
6
+ perceptual_loss_w: 1.0
7
+ image_encoder:
8
+ _target_: transformers.ViTMAEModel.from_pretrained
9
+ pretrained_model_name_or_path: facebook/vit-mae-large
10
+ m_former:
11
+ _target_: unilarn.src.models.m_former.MFormer
12
+ add_pooling_layer: false
13
+ config:
14
+ _target_: transformers.ViTConfig
15
+ query_num: 8
16
+ input_hidden_size: 1024
17
+ num_patches: 197
18
+ attention_probs_dropout_prob: 0.0
19
+ hidden_act: gelu
20
+ hidden_dropout_prob: 0.0
21
+ hidden_size: 768
22
+ initializer_range: 0.02
23
+ intermediate_size: 3072
24
+ layer_norm_eps: 1.0e-12
25
+ model_type: vit
26
+ num_attention_heads: 12
27
+ num_hidden_layers: 4
28
+ qkv_bias: true
29
+ m_former_depth:
30
+ _target_: unilarn.src.models.m_former.MFormer
31
+ add_pooling_layer: false
32
+ config:
33
+ _target_: transformers.ViTConfig
34
+ query_num: 8
35
+ input_hidden_size: 1024
36
+ num_patches: 197
37
+ attention_probs_dropout_prob: 0.0
38
+ hidden_act: gelu
39
+ hidden_dropout_prob: 0.0
40
+ hidden_size: 768
41
+ initializer_range: 0.02
42
+ intermediate_size: 3072
43
+ layer_norm_eps: 1.0e-12
44
+ model_type: vit
45
+ num_attention_heads: 12
46
+ num_hidden_layers: 4
47
+ qkv_bias: true
48
+ vector_quantizer:
49
+ _target_: unilarn.src.models.vector_quantizer.VectorQuantizer2
50
+ n_e: 128
51
+ e_dim: 32
52
+ beta: 0.25
53
+ remap: null
54
+ sane_index_shape: true
55
+ legacy: false
56
+ vector_quantizer_uni:
57
+ _target_: unilarn.src.models.vector_quantizer.VectorQuantizer2
58
+ n_e: 128
59
+ e_dim: 32
60
+ beta: 0.25
61
+ remap: null
62
+ sane_index_shape: true
63
+ legacy: false
64
+ decoder:
65
+ _target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder
66
+ config:
67
+ _target_: transformers.ViTConfig
68
+ query_num: 8
69
+ attention_probs_dropout_prob: 0.0
70
+ hidden_act: gelu
71
+ hidden_dropout_prob: 0.0
72
+ hidden_size: 768
73
+ image_size: 224
74
+ initializer_range: 0.02
75
+ intermediate_size: 3072
76
+ layer_norm_eps: 1.0e-12
77
+ model_type: vit
78
+ num_attention_heads: 12
79
+ num_channels: 3
80
+ num_hidden_layers: 12
81
+ patch_size: 16
82
+ qkv_bias: true
83
+ encoder_stride: 16
84
+ num_patches: 196
85
+ depth_decoder:
86
+ _target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder
87
+ config:
88
+ _target_: transformers.ViTConfig
89
+ query_num: 8
90
+ attention_probs_dropout_prob: 0.0
91
+ hidden_act: gelu
92
+ hidden_dropout_prob: 0.0
93
+ hidden_size: 768
94
+ image_size: 224
95
+ initializer_range: 0.02
96
+ intermediate_size: 3072
97
+ layer_norm_eps: 1.0e-12
98
+ model_type: vit
99
+ num_attention_heads: 12
100
+ num_channels: 3
101
+ num_hidden_layers: 12
102
+ patch_size: 16
103
+ qkv_bias: true
104
+ encoder_stride: 16
105
+ num_patches: 196
unilarn_trained_on_calvin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5c64ce3b2833687157a598d37e362aa80a578445ee14d69ad827b6a7524c294
3
+ size 967328706