binhpham committed on
Commit
809ddf2
·
verified ·
1 Parent(s): 5033eb2

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.json +150 -120
  2. model.safetensors +2 -2
config.json CHANGED
@@ -1,123 +1,153 @@
1
  {
2
- "type": "molmoact2",
3
- "n_obs_steps": 1,
4
- "input_features": {
5
- "observation.state": {
6
- "type": "STATE",
7
- "shape": [
8
- 6
9
- ]
10
- },
11
- "observation.images.arm_camera": {
12
- "type": "VISUAL",
13
- "shape": [
14
- 3,
15
- 480,
16
- 640
17
- ]
18
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  },
20
- "output_features": {
21
- "action": {
22
- "type": "ACTION",
23
- "shape": [
24
- 6
25
- ]
26
- }
27
- },
28
- "device": "cuda",
29
- "use_amp": false,
30
- "use_peft": false,
31
- "push_to_hub": false,
32
- "repo_id": null,
33
- "private": null,
34
- "tags": null,
35
- "license": null,
36
- "pretrained_path": "/home/binhpham/outputs/molmoact2-leslider_steps10000_bs8_lr1e-5_chunk30/checkpoints/005000/pretrained_model",
37
- "checkpoint_path": "allenai/MolmoAct2-SO100_101",
38
- "checkpoint_revision": null,
39
- "checkpoint_force_download": false,
40
- "trust_remote_code": true,
41
- "chunk_size": 30,
42
- "n_action_steps": 30,
43
- "action_mode": "continuous",
44
- "inference_action_mode": null,
45
- "discrete_action_tokenizer": "allenai/MolmoAct2-FAST-Tokenizer",
46
- "discrete_generation_max_steps": null,
47
- "norm_tag": null,
48
- "setup_type": "single so100/so101 robotic arm in molmoact2",
49
- "control_mode": "absolute joint pose",
50
- "image_keys": [
51
- "observation.images.arm_camera"
52
- ],
53
- "normalize_language": true,
54
- "add_setup_tokens": true,
55
- "add_control_tokens": true,
56
- "normalize_gripper": false,
57
- "num_state_tokens": 256,
58
- "max_sequence_length": null,
59
- "expected_max_action_dim": 32,
60
- "num_flow_timesteps": 8,
61
- "flow_matching_cutoff": 1.0,
62
- "flow_matching_time_offset": 0.001,
63
- "flow_matching_time_scale": 0.999,
64
- "flow_matching_beta_alpha": 1.0,
65
- "flow_matching_beta_beta": 1.5,
66
- "num_inference_steps": null,
67
- "mask_action_dim_padding": true,
68
- "enable_inference_cuda_graph": true,
69
- "per_episode_seed": false,
70
- "eval_seed": null,
71
- "rtc_config": null,
72
- "enable_lora_vlm": false,
73
- "lora_rank": 64,
74
- "lora_alpha": 16,
75
- "lora_dropout": 0.05,
76
- "lora_bias": "none",
77
- "enable_lora_action_expert": false,
78
- "enable_knowledge_insulation": false,
79
- "freeze_embedding": true,
80
- "train_action_expert_only": false,
81
- "gradient_checkpointing": true,
82
- "model_dtype": "bfloat16",
83
- "softmax_auxiliary_loss": true,
84
- "softmax_auxiliary_loss_scale": 0.0001,
85
- "discrete_loss_token_weighting": "root_subsegments_root_tokens",
86
- "optimizer_lr": 1e-05,
87
- "optimizer_vit_lr": 5e-06,
88
- "optimizer_connector_lr": 5e-06,
89
- "optimizer_action_expert_lr": 5e-05,
90
- "optimizer_betas": [
91
- 0.9,
92
- 0.95
93
  ],
94
- "optimizer_eps": 1e-06,
95
- "optimizer_weight_decay": 0.0,
96
- "optimizer_grad_clip_norm": 1.0,
97
- "scheduler_warmup_steps": 200,
98
- "scheduler_decay_steps": null,
99
- "scheduler_decay_lr": 1e-06,
100
- "normalization_mapping": {
101
- "ACTION": "MEAN_STD",
102
- "STATE": "MEAN_STD",
103
- "VISUAL": "IDENTITY"
104
- },
105
- "dataset_feature_names": {
106
- "action": [
107
- "shoulder_pan.pos",
108
- "shoulder_lift.pos",
109
- "elbow_flex.pos",
110
- "wrist_flex.pos",
111
- "wrist_roll.pos",
112
- "gripper.pos"
113
- ],
114
- "observation.state": [
115
- "shoulder_pan.pos",
116
- "shoulder_lift.pos",
117
- "elbow_flex.pos",
118
- "wrist_flex.pos",
119
- "wrist_roll.pos",
120
- "gripper.pos"
121
- ]
122
- }
123
- }
 
1
  {
2
+ "action_end_token_id": 151933,
3
+ "action_expert_config": {
4
+ "attn_dropout": 0.0,
5
+ "causal_attn": false,
6
+ "context_layer_norm": true,
7
+ "dropout": 0.0,
8
+ "ffn_multiple_of": 256,
9
+ "hidden_size": 768,
10
+ "mlp_ratio": 4.0,
11
+ "model_type": "molmoact2_action_expert",
12
+ "num_heads": 8,
13
+ "num_layers": 36,
14
+ "qk_norm": true,
15
+ "qk_norm_eps": 1e-06,
16
+ "rope": true,
17
+ "timestep_embed_dim": 256
18
+ },
19
+ "action_expert_depth_gate": false,
20
+ "action_expert_depth_gate_init_bias": -4.0,
21
+ "action_expert_depth_gate_per_layer": false,
22
+ "action_mode": "both",
23
+ "max_action_horizon": 30,
24
+ "action_output_token_id": 151931,
25
+ "action_start_token_id": 151932,
26
+ "action_token_start_id": 151934,
27
+ "adapter_config": {
28
+ "attention_dropout": 0.0,
29
+ "attn_implementation": "sdpa",
30
+ "float32_attention": true,
31
+ "head_dim": 72,
32
+ "hidden_act": "silu",
33
+ "hidden_size": 1152,
34
+ "image_feature_dropout": 0.0,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": 9728,
37
+ "model_type": "molmoact2",
38
+ "num_attention_heads": 16,
39
+ "num_key_value_heads": 16,
40
+ "pooling_attention_mask": true,
41
+ "residual_dropout": 0.0,
42
+ "text_hidden_size": 2560,
43
+ "vit_layers": [
44
+ -3,
45
+ -9
46
+ ]
47
+ },
48
+ "add_action_expert": true,
49
+ "add_control_tokens": true,
50
+ "add_setup_tokens": true,
51
+ "architectures": [
52
+ "MolmoAct2ForConditionalGeneration"
53
+ ],
54
+ "auto_map": {
55
+ "AutoConfig": "configuration_molmoact2.MolmoAct2Config",
56
+ "AutoModelForImageTextToText": "modeling_molmoact2.MolmoAct2ForConditionalGeneration"
57
+ },
58
+ "depth_end_token_id": null,
59
+ "depth_mode": 2,
60
+ "depth_output_token_id": null,
61
+ "depth_start_token_id": null,
62
+ "depth_token_start_id": null,
63
+ "dtype": "float32",
64
+ "enable_depth_reasoning": false,
65
+ "flow_matching_beta_alpha": 1.0,
66
+ "flow_matching_beta_beta": 1.5,
67
+ "flow_matching_cutoff": 1.0,
68
+ "flow_matching_num_steps": 10,
69
+ "flow_matching_time_offset": 0.001,
70
+ "flow_matching_time_scale": 0.999,
71
+ "frame_end_token_id": 154632,
72
+ "frame_start_token_id": 154631,
73
+ "image_col_id": 154627,
74
+ "image_end_token_id": 154625,
75
+ "image_high_res_id": 154626,
76
+ "image_low_res_id": 154630,
77
+ "image_patch_id": 154626,
78
+ "image_start_token_id": 154624,
79
+ "initializer_range": 0.02,
80
+ "low_res_image_start_token_id": 154628,
81
+ "mask_action_dim_padding": true,
82
+ "max_action_dim": 32,
83
+ "model_type": "molmoact2",
84
+ "n_obs_steps": 1,
85
+ "norm_stats_filename": "norm_stats.json",
86
+ "num_action_tokens": 2048,
87
+ "num_depth_codes": 100,
88
+ "num_depth_tokens": 0,
89
+ "num_state_tokens": 256,
90
+ "state_end_token_id": 151674,
91
+ "state_format": "discrete",
92
+ "state_start_token_id": 151673,
93
+ "state_token_start_id": 151675,
94
+ "text_config": {
95
+ "additional_vocab_size": 128,
96
+ "attention_dropout": 0.0,
97
+ "attn_implementation": "sdpa",
98
+ "embedding_dropout": 0.0,
99
+ "head_dim": 128,
100
+ "hidden_act": "silu",
101
+ "hidden_size": 2560,
102
+ "initializer_range": 0.02,
103
+ "intermediate_size": 9728,
104
+ "layer_norm_eps": 1e-06,
105
+ "max_position_embeddings": 16384,
106
+ "model_type": "molmoact2_text",
107
+ "norm_after": false,
108
+ "num_attention_heads": 32,
109
+ "num_hidden_layers": 36,
110
+ "num_key_value_heads": 8,
111
+ "qk_norm_type": "qwen3",
112
+ "qkv_bias": false,
113
+ "residual_dropout": 0.0,
114
+ "rope_parameters": {
115
+ "rope_theta": 5000000.0,
116
+ "rope_type": "default"
117
  },
118
+ "rope_scaling_layers": null,
119
+ "rope_theta": 5000000.0,
120
+ "tie_word_embeddings": false,
121
+ "use_cache": true,
122
+ "use_qk_norm": true,
123
+ "vocab_size": 154624
124
+ },
125
+ "tie_word_embeddings": false,
126
+ "transformers_version": "5.3.0",
127
+ "use_frame_special_tokens": true,
128
+ "vit_config": {
129
+ "attention_dropout": 0.0,
130
+ "attn_implementation": "sdpa",
131
+ "float32_attention": true,
132
+ "head_dim": 72,
133
+ "hidden_act": "gelu_pytorch_tanh",
134
+ "hidden_size": 1152,
135
+ "image_default_input_size": [
136
+ 378,
137
+ 378
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  ],
139
+ "image_num_pos": 729,
140
+ "image_patch_size": 14,
141
+ "initializer_range": 0.02,
142
+ "intermediate_size": 4304,
143
+ "layer_norm_eps": 1e-06,
144
+ "model_type": "molmoact2",
145
+ "num_attention_heads": 16,
146
+ "num_hidden_layers": 27,
147
+ "num_key_value_heads": 16,
148
+ "residual_dropout": 0.0
149
+ },
150
+ "bos_token_id": 151645,
151
+ "eos_token_id": 151645,
152
+ "pad_token_id": 151643
153
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5129ddfec4817fbcfd58a41dcc5543b4eafbbe0ba6b867012c6dc955c31bfe34
3
- size 10884573720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d34a2c69ef43bb3f346bacffff8866e8e6f6653f4cfa493aee26b75afdead19
3
+ size 10884565968