euijinrnd commited on
Commit
fab3b27
·
verified ·
1 Parent(s): 61b234b

Add files using upload-large-folder tool

Browse files
Files changed (34) hide show
  1. 1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/config.json +315 -0
  2. 1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/dataset_statistics.json +634 -0
  3. 1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/model.safetensors +3 -0
  4. 1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/singlevla_config/config.json +227 -0
  5. 1e-4/twinvla-scratch-1e-4-aloha_handover_box/config.json +314 -0
  6. 1e-4/twinvla-scratch-1e-4-aloha_handover_box/dataset_statistics.json +634 -0
  7. 1e-4/twinvla-scratch-1e-4-aloha_handover_box/model.safetensors +3 -0
  8. 1e-4/twinvla-scratch-1e-4-aloha_handover_box/singlevla_config/config.json +227 -0
  9. 1e-4/twinvla-scratch-1e-4-aloha_lift_box/config.json +314 -0
  10. 1e-4/twinvla-scratch-1e-4-aloha_lift_box/dataset_statistics.json +634 -0
  11. 1e-4/twinvla-scratch-1e-4-aloha_lift_box/model.safetensors +3 -0
  12. 1e-4/twinvla-scratch-1e-4-aloha_lift_box/singlevla_config/config.json +227 -0
  13. 1e-4/twinvla-scratch-1e-4-aloha_shoes_table/config.json +314 -0
  14. 1e-4/twinvla-scratch-1e-4-aloha_shoes_table/dataset_statistics.json +634 -0
  15. 1e-4/twinvla-scratch-1e-4-aloha_shoes_table/model.safetensors +3 -0
  16. 1e-4/twinvla-scratch-1e-4-aloha_shoes_table/singlevla_config/config.json +227 -0
  17. 2e-5/twinvla-aloha_shoes_table/config.json +317 -0
  18. 2e-5/twinvla-aloha_shoes_table/dataset_statistics.json +634 -0
  19. 2e-5/twinvla-aloha_shoes_table/model.safetensors +3 -0
  20. 2e-5/twinvla-aloha_shoes_table/singlevla_config/config.json +230 -0
  21. 2e-5/twinvla-scratch-aloha_dish_drainer/config.json +314 -0
  22. 2e-5/twinvla-scratch-aloha_dish_drainer/dataset_statistics.json +634 -0
  23. 2e-5/twinvla-scratch-aloha_dish_drainer/model.safetensors +3 -0
  24. 2e-5/twinvla-scratch-aloha_dish_drainer/singlevla_config/config.json +227 -0
  25. 2e-5/twinvla-scratch-aloha_handover_box/config.json +314 -0
  26. 2e-5/twinvla-scratch-aloha_handover_box/dataset_statistics.json +634 -0
  27. 2e-5/twinvla-scratch-aloha_handover_box/model.safetensors +3 -0
  28. 2e-5/twinvla-scratch-aloha_handover_box/singlevla_config/config.json +227 -0
  29. 2e-5/twinvla-scratch-aloha_handover_box/training_states.pth +3 -0
  30. 2e-5/twinvla-scratch-aloha_lift_box/config.json +314 -0
  31. 2e-5/twinvla-scratch-aloha_lift_box/dataset_statistics.json +634 -0
  32. 2e-5/twinvla-scratch-aloha_lift_box/model.safetensors +3 -0
  33. 2e-5/twinvla-scratch-aloha_lift_box/singlevla_config/config.json +227 -0
  34. 2e-5/twinvla-scratch-aloha_lift_box/training_states.pth +3 -0
1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/config.json ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/scratch2/jellyho/rebuttal/tabletop-v3/twinvla-scratch-1e-4-aloha_dish_drainer",
3
+ "action_dim": 10,
4
+ "action_head": "DiT",
5
+ "action_len": 20,
6
+ "architectures": [
7
+ "Eagle2_1BTwinVLA"
8
+ ],
9
+ "attn_reweighting": true,
10
+ "denoiser": "FM",
11
+ "dit_scratch": false,
12
+ "global_normalization": true,
13
+ "hz_interpolate": null,
14
+ "interpolate_gripper": false,
15
+ "knowledge_insulation": false,
16
+ "model_path": null,
17
+ "model_type": "Eagle2_1BTwinVLA",
18
+ "modeling": "denoising",
19
+ "normalization": "quantile",
20
+ "num_readouts": 1,
21
+ "readout_token_as_eos": true,
22
+ "share_decoder": true,
23
+ "share_embed_tokens": true,
24
+ "share_vision": true,
25
+ "singlevla_config": {
26
+ "_attn_implementation_autoset": false,
27
+ "_attn_implementation_internal": null,
28
+ "_commit_hash": null,
29
+ "_name_or_path": "/scratch2/jellyho/rebuttal/tabletop-v3/twinvla-scratch-1e-4-aloha_dish_drainer/singlevla_config",
30
+ "action_dim": 10,
31
+ "action_head": "DiT",
32
+ "action_head_hidden_dim": 1024,
33
+ "action_len": 20,
34
+ "add_cross_attention": false,
35
+ "aggregation": "None",
36
+ "architectures": [
37
+ "Eagle2_1BVLA"
38
+ ],
39
+ "auto_map": {},
40
+ "bad_words_ids": null,
41
+ "begin_suppress_tokens": null,
42
+ "bos_token_id": null,
43
+ "chunk_size_feed_forward": 0,
44
+ "cross_attention_hidden_size": null,
45
+ "decoder_start_token_id": null,
46
+ "denoiser": "FM",
47
+ "diffusion_batch": 32,
48
+ "dit_size": "DiT-B",
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": false,
51
+ "downsample_ratio": 0.5,
52
+ "dynamic_image_size": true,
53
+ "early_stopping": false,
54
+ "efficient_loss": true,
55
+ "enable_cfg": true,
56
+ "encoder_no_repeat_ngram_size": 0,
57
+ "eos_token_id": null,
58
+ "exponential_decay_length_penalty": null,
59
+ "finetuning_task": null,
60
+ "force_image_size": 448,
61
+ "forced_bos_token_id": null,
62
+ "forced_eos_token_id": null,
63
+ "global_normalization": true,
64
+ "id2label": {
65
+ "0": "LABEL_0",
66
+ "1": "LABEL_1"
67
+ },
68
+ "image_size": 448,
69
+ "is_decoder": false,
70
+ "is_encoder_decoder": false,
71
+ "keep_aspect_ratio": false,
72
+ "knowledge_insulation": false,
73
+ "label2id": {
74
+ "LABEL_0": 0,
75
+ "LABEL_1": 1
76
+ },
77
+ "length_penalty": 1.0,
78
+ "llm_config": {
79
+ "_attn_implementation_autoset": true,
80
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
81
+ "add_cross_attention": false,
82
+ "architectures": [
83
+ "Qwen2ForCausalLM"
84
+ ],
85
+ "attention_dropout": 0.0,
86
+ "auto_map": {
87
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
88
+ "AutoModel": "modeling_qwen2.Qwen2Model",
89
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
90
+ },
91
+ "bad_words_ids": null,
92
+ "begin_suppress_tokens": null,
93
+ "bos_token_id": 151643,
94
+ "chunk_size_feed_forward": 0,
95
+ "cross_attention_hidden_size": null,
96
+ "decoder_start_token_id": null,
97
+ "diversity_penalty": 0.0,
98
+ "do_sample": false,
99
+ "early_stopping": false,
100
+ "encoder_no_repeat_ngram_size": 0,
101
+ "eos_token_id": 151645,
102
+ "exponential_decay_length_penalty": null,
103
+ "finetuning_task": null,
104
+ "forced_bos_token_id": null,
105
+ "forced_eos_token_id": null,
106
+ "hidden_act": "silu",
107
+ "hidden_size": 896,
108
+ "id2label": {
109
+ "0": "LABEL_0",
110
+ "1": "LABEL_1"
111
+ },
112
+ "initializer_range": 0.02,
113
+ "intermediate_size": 4864,
114
+ "is_decoder": false,
115
+ "is_encoder_decoder": false,
116
+ "label2id": {
117
+ "LABEL_0": 0,
118
+ "LABEL_1": 1
119
+ },
120
+ "length_penalty": 1.0,
121
+ "max_length": 20,
122
+ "max_position_embeddings": 32768,
123
+ "max_window_layers": 21,
124
+ "min_length": 0,
125
+ "model_type": "qwen2",
126
+ "no_repeat_ngram_size": 0,
127
+ "num_attention_heads": 14,
128
+ "num_beam_groups": 1,
129
+ "num_beams": 1,
130
+ "num_hidden_layers": 24,
131
+ "num_key_value_heads": 2,
132
+ "num_return_sequences": 1,
133
+ "output_attentions": false,
134
+ "output_hidden_states": false,
135
+ "output_scores": false,
136
+ "pad_token_id": null,
137
+ "prefix": null,
138
+ "problem_type": null,
139
+ "pruned_heads": {},
140
+ "remove_invalid_values": false,
141
+ "repetition_penalty": 1.0,
142
+ "return_dict": true,
143
+ "return_dict_in_generate": false,
144
+ "rms_norm_eps": 1e-06,
145
+ "rope_scaling": null,
146
+ "rope_theta": 1000000.0,
147
+ "sep_token_id": null,
148
+ "sliding_window": 32768,
149
+ "suppress_tokens": null,
150
+ "task_specific_params": null,
151
+ "temperature": 1.0,
152
+ "tf_legacy_loss": false,
153
+ "tie_encoder_decoder": false,
154
+ "tie_word_embeddings": true,
155
+ "tokenizer_class": null,
156
+ "top_k": 50,
157
+ "top_p": 1.0,
158
+ "torch_dtype": "bfloat16",
159
+ "torchscript": false,
160
+ "transformers_version": "4.50.0.dev0",
161
+ "typical_p": 1.0,
162
+ "use_bfloat16": false,
163
+ "use_cache": false,
164
+ "use_sliding_window": false,
165
+ "vocab_size": 151674
166
+ },
167
+ "loss_version": "v4",
168
+ "max_dynamic_patch": 12,
169
+ "max_length": 20,
170
+ "min_dynamic_patch": 1,
171
+ "min_length": 0,
172
+ "mlp_checkpoint": true,
173
+ "model_path": "nvidia/Eagle2-1B",
174
+ "model_type": "Eagle2_1BVLA",
175
+ "modeling": "denoising",
176
+ "no_repeat_ngram_size": 0,
177
+ "normalization": "quantile",
178
+ "num_beam_groups": 1,
179
+ "num_beams": 1,
180
+ "num_readouts": 1,
181
+ "num_return_sequences": 1,
182
+ "output_attentions": false,
183
+ "output_hidden_states": false,
184
+ "output_scores": false,
185
+ "pad2square": false,
186
+ "pad_token_id": null,
187
+ "pre_feature_reduction": false,
188
+ "prefix": null,
189
+ "problem_type": null,
190
+ "pruned_heads": {},
191
+ "ps_version": "v2",
192
+ "readout_token_as_eos": true,
193
+ "remove_invalid_values": false,
194
+ "repetition_penalty": 1.0,
195
+ "return_dict": true,
196
+ "return_dict_in_generate": false,
197
+ "return_text": null,
198
+ "select_layer": -1,
199
+ "sep_token_id": null,
200
+ "state_dim": 10,
201
+ "stopping_token": "|",
202
+ "suppress_tokens": null,
203
+ "task_specific_params": null,
204
+ "temperature": 1.0,
205
+ "template": "qwen2-chat",
206
+ "test_denoising_steps": 10,
207
+ "tf_legacy_loss": false,
208
+ "tie_encoder_decoder": false,
209
+ "tie_word_embeddings": true,
210
+ "tokenizer_class": null,
211
+ "top_k": 50,
212
+ "top_p": 1.0,
213
+ "torch_dtype": "bfloat16",
214
+ "torchscript": false,
215
+ "train_denoising_steps": 100,
216
+ "typical_p": 1.0,
217
+ "use_backbone_lora": 0,
218
+ "use_bfloat16": false,
219
+ "use_llm_lora": 0,
220
+ "use_thumbnail": true,
221
+ "vision_config": {
222
+ "_attn_implementation_autoset": true,
223
+ "_name_or_path": "",
224
+ "add_cross_attention": false,
225
+ "architectures": [
226
+ "SiglipVisionModel"
227
+ ],
228
+ "attention_dropout": 0.0,
229
+ "auto_map": {
230
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
231
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
232
+ },
233
+ "bad_words_ids": null,
234
+ "begin_suppress_tokens": null,
235
+ "bos_token_id": null,
236
+ "chunk_size_feed_forward": 0,
237
+ "cross_attention_hidden_size": null,
238
+ "decoder_start_token_id": null,
239
+ "diversity_penalty": 0.0,
240
+ "do_sample": false,
241
+ "drop_path_rate": 0.1,
242
+ "early_stopping": false,
243
+ "encoder_no_repeat_ngram_size": 0,
244
+ "eos_token_id": null,
245
+ "exponential_decay_length_penalty": null,
246
+ "finetuning_task": null,
247
+ "forced_bos_token_id": null,
248
+ "forced_eos_token_id": null,
249
+ "hidden_act": "gelu_pytorch_tanh",
250
+ "hidden_size": 1152,
251
+ "id2label": {
252
+ "0": "LABEL_0",
253
+ "1": "LABEL_1"
254
+ },
255
+ "image_size": 448,
256
+ "intermediate_size": 4304,
257
+ "is_decoder": false,
258
+ "is_encoder_decoder": false,
259
+ "label2id": {
260
+ "LABEL_0": 0,
261
+ "LABEL_1": 1
262
+ },
263
+ "layer_norm_eps": 1e-06,
264
+ "length_penalty": 1.0,
265
+ "max_length": 20,
266
+ "min_length": 0,
267
+ "model_type": "siglip_vision_model",
268
+ "no_repeat_ngram_size": 0,
269
+ "num_attention_heads": 16,
270
+ "num_beam_groups": 1,
271
+ "num_beams": 1,
272
+ "num_channels": 3,
273
+ "num_hidden_layers": 27,
274
+ "num_image_tokens": 1024,
275
+ "num_return_sequences": 1,
276
+ "output_attentions": false,
277
+ "output_hidden_states": false,
278
+ "output_scores": false,
279
+ "pad_token_id": null,
280
+ "patch_size": 14,
281
+ "prefix": null,
282
+ "problem_type": null,
283
+ "projection_dim": 2048,
284
+ "projector_hidden_act": "gelu_fast",
285
+ "pruned_heads": {},
286
+ "remove_invalid_values": false,
287
+ "repetition_penalty": 1.0,
288
+ "return_dict": true,
289
+ "return_dict_in_generate": false,
290
+ "sep_token_id": null,
291
+ "suppress_tokens": null,
292
+ "task_specific_params": null,
293
+ "temperature": 1.0,
294
+ "tf_legacy_loss": false,
295
+ "tie_encoder_decoder": false,
296
+ "tie_word_embeddings": true,
297
+ "tokenizer_class": null,
298
+ "top_k": 50,
299
+ "top_p": 1.0,
300
+ "torch_dtype": "bfloat16",
301
+ "torchscript": false,
302
+ "transformers_version": "4.50.0.dev0",
303
+ "typical_p": 1.0,
304
+ "use_bfloat16": false,
305
+ "vision_use_head": false
306
+ },
307
+ "vocab_size": 151674,
308
+ "vocab_start": null
309
+ },
310
+ "singlevla_config_path": "/scratch2/jellyho/rebuttal/tabletop-v3/twinvla-scratch-1e-4-aloha_dish_drainer/singlevla_config",
311
+ "singlevla_pretrained_path": null,
312
+ "state_dim": 10,
313
+ "torch_dtype": "bfloat16",
314
+ "transformers_version": "4.50.0.dev0"
315
+ }
1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_dish_drainer": {
3
+ "action": {
4
+ "mean": [
5
+ 0.40166154503822327,
6
+ -0.01112140342593193,
7
+ 0.1838103085756302,
8
+ 0.7193062901496887,
9
+ 0.2537333369255066,
10
+ -0.5631371736526489,
11
+ -0.14408059418201447,
12
+ 0.8170215487480164,
13
+ 0.1759030818939209,
14
+ 0.3052484393119812,
15
+ 0.27496495842933655,
16
+ 0.07536163926124573,
17
+ 0.11210401356220245,
18
+ 0.5866137146949768,
19
+ 0.16616441309452057,
20
+ -0.6815541982650757,
21
+ -0.029566079378128052,
22
+ 0.9651421308517456,
23
+ 0.16927561163902283,
24
+ -0.015535339713096619
25
+ ],
26
+ "std": [
27
+ 0.11050384491682053,
28
+ 0.09560801833868027,
29
+ 0.07149424403905869,
30
+ 0.16429583728313446,
31
+ 0.23607666790485382,
32
+ 0.13553044199943542,
33
+ 0.4136315882205963,
34
+ 0.16760702431201935,
35
+ 0.28564009070396423,
36
+ 0.9522888660430908,
37
+ 0.031309906393289566,
38
+ 0.04574710130691528,
39
+ 0.08567056804895401,
40
+ 0.298023521900177,
41
+ 0.15602006018161774,
42
+ 0.22492952644824982,
43
+ 0.10802315920591354,
44
+ 0.04161505028605461,
45
+ 0.15993013978004456,
46
+ 0.9998809099197388
47
+ ],
48
+ "max": [
49
+ 0.6568294763565063,
50
+ 0.20922525227069855,
51
+ 0.329291433095932,
52
+ 0.9988790154457092,
53
+ 0.8221861720085144,
54
+ -0.02126980759203434,
55
+ 0.554952085018158,
56
+ 0.9999961256980896,
57
+ 0.8352594971656799,
58
+ 1.0,
59
+ 0.3725535273551941,
60
+ 0.20133008062839508,
61
+ 0.2683204710483551,
62
+ 0.9969081878662109,
63
+ 0.5947288274765015,
64
+ 0.135818213224411,
65
+ 0.297533243894577,
66
+ 0.9999833106994629,
67
+ 0.6284497380256653,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1679762601852417,
72
+ -0.2037276178598404,
73
+ 0.026118876412510872,
74
+ 0.06734701991081238,
75
+ -0.3303077816963196,
76
+ -0.865761399269104,
77
+ -0.9697803854942322,
78
+ 0.24385260045528412,
79
+ -0.3337814211845398,
80
+ -1.0,
81
+ 0.17690593004226685,
82
+ -0.019342761486768723,
83
+ -0.045900676399469376,
84
+ -0.08388058096170425,
85
+ -0.1825810670852661,
86
+ -0.9999706149101257,
87
+ -0.4282298684120178,
88
+ 0.7756603956222534,
89
+ -0.19046637415885925,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.22041156470775605,
94
+ -0.17958899974822998,
95
+ 0.04473079532384872,
96
+ 0.2284090793132782,
97
+ -0.2088965356349945,
98
+ -0.811203727722168,
99
+ -0.9306126594543457,
100
+ 0.3530711317062378,
101
+ -0.2207678198814392,
102
+ -1.0,
103
+ 0.2055837804079056,
104
+ 0.005079864375293255,
105
+ -0.04285515695810318,
106
+ 0.023393160849809646,
107
+ -0.12909780085086822,
108
+ -0.9969730639457702,
109
+ -0.31871861577033994,
110
+ 0.8128526282310485,
111
+ -0.12555764615535736,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.6256548881530761,
116
+ 0.16506216526031484,
117
+ 0.3053938007354736,
118
+ 0.9577780866622918,
119
+ 0.7322160029411315,
120
+ -0.22335838973522296,
121
+ 0.43161325573921194,
122
+ 0.9983374524116516,
123
+ 0.7683744573593138,
124
+ 1.0,
125
+ 0.344862767457962,
126
+ 0.19341405749320983,
127
+ 0.24164194464683514,
128
+ 0.9684402346611023,
129
+ 0.5674381494522094,
130
+ -0.24195577383041442,
131
+ 0.2379095745086669,
132
+ 0.9997554516792297,
133
+ 0.564831252098083,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.39952757954597473,
162
+ -0.01743784174323082,
163
+ 0.17103438079357147,
164
+ 0.7105360627174377,
165
+ 0.238600954413414,
166
+ -0.5759879350662231,
167
+ -0.11663737148046494,
168
+ 0.815979540348053,
169
+ 0.1842789500951767,
170
+ 0.32329148054122925,
171
+ 0.2642972469329834,
172
+ 0.056828975677490234,
173
+ 0.10836686193943024,
174
+ 0.5435150265693665,
175
+ 0.10053255409002304,
176
+ -0.7011978030204773,
177
+ -0.03383756801486015,
178
+ 0.950905442237854,
179
+ 0.0682743638753891,
180
+ -0.11205191910266876
181
+ ],
182
+ "std": [
183
+ 0.1054687574505806,
184
+ 0.09407484531402588,
185
+ 0.07594858109951019,
186
+ 0.15545178949832916,
187
+ 0.23370550572872162,
188
+ 0.1663108915090561,
189
+ 0.417312353849411,
190
+ 0.1589633673429489,
191
+ 0.295290470123291,
192
+ 0.8386000990867615,
193
+ 0.03193666413426399,
194
+ 0.03702628239989281,
195
+ 0.08499231189489365,
196
+ 0.33746662735939026,
197
+ 0.13817644119262695,
198
+ 0.2642515301704407,
199
+ 0.13742688298225403,
200
+ 0.10328594595193863,
201
+ 0.24581065773963928,
202
+ 0.988041341304779
203
+ ],
204
+ "max": [
205
+ 0.6216338276863098,
206
+ 0.1681845635175705,
207
+ 0.3582729399204254,
208
+ 0.9998778104782104,
209
+ 0.7569742202758789,
210
+ 0.29317960143089294,
211
+ 0.5474420785903931,
212
+ 1.0,
213
+ 0.9644882678985596,
214
+ 1.2399240732192993,
215
+ 0.36810019612312317,
216
+ 0.15229015052318573,
217
+ 0.3755773603916168,
218
+ 0.9999530911445618,
219
+ 0.47173869609832764,
220
+ 0.4396477937698364,
221
+ 0.5856077671051025,
222
+ 1.0,
223
+ 0.9141661524772644,
224
+ 1.0335123538970947
225
+ ],
226
+ "min": [
227
+ 0.17779576778411865,
228
+ -0.2223799079656601,
229
+ 0.009585360996425152,
230
+ 0.27525120973587036,
231
+ -0.3401731848716736,
232
+ -0.8740139603614807,
233
+ -0.922980010509491,
234
+ 0.20966650545597076,
235
+ -0.5117865800857544,
236
+ -1.04777991771698,
237
+ 0.13721425831317902,
238
+ -0.11607959121465683,
239
+ -0.006126723252236843,
240
+ -0.12117788940668106,
241
+ -0.5865428447723389,
242
+ -0.9999897480010986,
243
+ -0.48856121301651,
244
+ -0.09543908387422562,
245
+ -0.9954046607017517,
246
+ -1.1056499481201172
247
+ ],
248
+ "q01": [
249
+ 0.22190109610557557,
250
+ -0.19557971894741058,
251
+ 0.02071425139904022,
252
+ 0.33727509021759033,
253
+ -0.20722176849842072,
254
+ -0.8324707460403442,
255
+ -0.8625615048408508,
256
+ 0.48607451796531675,
257
+ -0.30660848736763,
258
+ -0.9315443730354309,
259
+ 0.19041878879070281,
260
+ -0.04380948930978775,
261
+ -0.0050327684171497826,
262
+ -0.05638677150011063,
263
+ -0.26807846426963805,
264
+ -0.9989835453033448,
265
+ -0.329305921792984,
266
+ 0.4013799297809601,
267
+ -0.8769975972175598,
268
+ -1.088063154220581
269
+ ],
270
+ "q99": [
271
+ 0.6066525983810425,
272
+ 0.13703446269035335,
273
+ 0.3049686551094055,
274
+ 0.995180070400238,
275
+ 0.6961594343185422,
276
+ 0.0980641171336174,
277
+ 0.463529108762741,
278
+ 0.9997917461395264,
279
+ 0.7787499904632564,
280
+ 1.0201601552963255,
281
+ 0.3442830562591551,
282
+ 0.1231292974948883,
283
+ 0.30282683849334713,
284
+ 0.9978944087028503,
285
+ 0.41937308669090234,
286
+ 0.0980641171336174,
287
+ 0.27175513744354135,
288
+ 0.999885528087616,
289
+ 0.5146499085426329,
290
+ 1.0030832004547119
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 7145,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_dish_drainer_new": {
319
+ "action": {
320
+ "mean": [
321
+ 0.40166154503822327,
322
+ -0.01112140342593193,
323
+ 0.1838103085756302,
324
+ 0.7193062901496887,
325
+ 0.2537333369255066,
326
+ -0.5631371736526489,
327
+ -0.14408059418201447,
328
+ 0.8170215487480164,
329
+ 0.1759030818939209,
330
+ 0.3052484393119812,
331
+ 0.27496495842933655,
332
+ 0.07536163926124573,
333
+ 0.11210401356220245,
334
+ 0.5866137146949768,
335
+ 0.16616441309452057,
336
+ -0.6815541982650757,
337
+ -0.029566079378128052,
338
+ 0.9651421308517456,
339
+ 0.16927561163902283,
340
+ -0.015535339713096619
341
+ ],
342
+ "std": [
343
+ 0.11050384491682053,
344
+ 0.09560801833868027,
345
+ 0.07149424403905869,
346
+ 0.16429583728313446,
347
+ 0.23607666790485382,
348
+ 0.13553044199943542,
349
+ 0.4136315882205963,
350
+ 0.16760702431201935,
351
+ 0.28564009070396423,
352
+ 0.9522888660430908,
353
+ 0.031309906393289566,
354
+ 0.04574710130691528,
355
+ 0.08567056804895401,
356
+ 0.298023521900177,
357
+ 0.15602006018161774,
358
+ 0.22492952644824982,
359
+ 0.10802315920591354,
360
+ 0.04161505028605461,
361
+ 0.15993013978004456,
362
+ 0.9998809099197388
363
+ ],
364
+ "max": [
365
+ 0.6568294763565063,
366
+ 0.20922525227069855,
367
+ 0.329291433095932,
368
+ 0.9988790154457092,
369
+ 0.8221861720085144,
370
+ -0.02126980759203434,
371
+ 0.554952085018158,
372
+ 0.9999961256980896,
373
+ 0.8352594971656799,
374
+ 1.0,
375
+ 0.3725535273551941,
376
+ 0.20133008062839508,
377
+ 0.2683204710483551,
378
+ 0.9969081878662109,
379
+ 0.5947288274765015,
380
+ 0.135818213224411,
381
+ 0.297533243894577,
382
+ 0.9999833106994629,
383
+ 0.6284497380256653,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.1679762601852417,
388
+ -0.2037276178598404,
389
+ 0.026118876412510872,
390
+ 0.06734701991081238,
391
+ -0.3303077816963196,
392
+ -0.865761399269104,
393
+ -0.9697803854942322,
394
+ 0.24385260045528412,
395
+ -0.3337814211845398,
396
+ -1.0,
397
+ 0.17690593004226685,
398
+ -0.019342761486768723,
399
+ -0.045900676399469376,
400
+ -0.08388058096170425,
401
+ -0.1825810670852661,
402
+ -0.9999706149101257,
403
+ -0.4282298684120178,
404
+ 0.7756603956222534,
405
+ -0.19046637415885925,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.22041156470775605,
410
+ -0.17958899974822998,
411
+ 0.04473079532384872,
412
+ 0.2284090793132782,
413
+ -0.2088965356349945,
414
+ -0.811203727722168,
415
+ -0.9306126594543457,
416
+ 0.3530711317062378,
417
+ -0.2207678198814392,
418
+ -1.0,
419
+ 0.2055837804079056,
420
+ 0.005079864375293255,
421
+ -0.04285515695810318,
422
+ 0.023393160849809646,
423
+ -0.12909780085086822,
424
+ -0.9969730639457702,
425
+ -0.31871861577033994,
426
+ 0.8128526282310485,
427
+ -0.12555764615535736,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.6256548881530761,
432
+ 0.16506216526031484,
433
+ 0.3053938007354736,
434
+ 0.9577780866622918,
435
+ 0.7322160029411315,
436
+ -0.22335838973522296,
437
+ 0.43161325573921194,
438
+ 0.9983374524116516,
439
+ 0.7683744573593138,
440
+ 1.0,
441
+ 0.344862767457962,
442
+ 0.19341405749320983,
443
+ 0.24164194464683514,
444
+ 0.9684402346611023,
445
+ 0.5674381494522094,
446
+ -0.24195577383041442,
447
+ 0.2379095745086669,
448
+ 0.9997554516792297,
449
+ 0.564831252098083,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.39952757954597473,
478
+ -0.01743784174323082,
479
+ 0.17103438079357147,
480
+ 0.7105360627174377,
481
+ 0.238600954413414,
482
+ -0.5759879350662231,
483
+ -0.11663737148046494,
484
+ 0.815979540348053,
485
+ 0.1842789500951767,
486
+ 0.32329148054122925,
487
+ 0.2642972469329834,
488
+ 0.056828975677490234,
489
+ 0.10836686193943024,
490
+ 0.5435150265693665,
491
+ 0.10053255409002304,
492
+ -0.7011978030204773,
493
+ -0.03383756801486015,
494
+ 0.950905442237854,
495
+ 0.0682743638753891,
496
+ -0.11205191910266876
497
+ ],
498
+ "std": [
499
+ 0.1054687574505806,
500
+ 0.09407484531402588,
501
+ 0.07594858109951019,
502
+ 0.15545178949832916,
503
+ 0.23370550572872162,
504
+ 0.1663108915090561,
505
+ 0.417312353849411,
506
+ 0.1589633673429489,
507
+ 0.295290470123291,
508
+ 0.8386000990867615,
509
+ 0.03193666413426399,
510
+ 0.03702628239989281,
511
+ 0.08499231189489365,
512
+ 0.33746662735939026,
513
+ 0.13817644119262695,
514
+ 0.2642515301704407,
515
+ 0.13742688298225403,
516
+ 0.10328594595193863,
517
+ 0.24581065773963928,
518
+ 0.988041341304779
519
+ ],
520
+ "max": [
521
+ 0.6216338276863098,
522
+ 0.1681845635175705,
523
+ 0.3582729399204254,
524
+ 0.9998778104782104,
525
+ 0.7569742202758789,
526
+ 0.29317960143089294,
527
+ 0.5474420785903931,
528
+ 1.0,
529
+ 0.9644882678985596,
530
+ 1.2399240732192993,
531
+ 0.36810019612312317,
532
+ 0.15229015052318573,
533
+ 0.3755773603916168,
534
+ 0.9999530911445618,
535
+ 0.47173869609832764,
536
+ 0.4396477937698364,
537
+ 0.5856077671051025,
538
+ 1.0,
539
+ 0.9141661524772644,
540
+ 1.0335123538970947
541
+ ],
542
+ "min": [
543
+ 0.17779576778411865,
544
+ -0.2223799079656601,
545
+ 0.009585360996425152,
546
+ 0.27525120973587036,
547
+ -0.3401731848716736,
548
+ -0.8740139603614807,
549
+ -0.922980010509491,
550
+ 0.20966650545597076,
551
+ -0.5117865800857544,
552
+ -1.04777991771698,
553
+ 0.13721425831317902,
554
+ -0.11607959121465683,
555
+ -0.006126723252236843,
556
+ -0.12117788940668106,
557
+ -0.5865428447723389,
558
+ -0.9999897480010986,
559
+ -0.48856121301651,
560
+ -0.09543908387422562,
561
+ -0.9954046607017517,
562
+ -1.1056499481201172
563
+ ],
564
+ "q01": [
565
+ 0.22190109610557557,
566
+ -0.19557971894741058,
567
+ 0.02071425139904022,
568
+ 0.33727509021759033,
569
+ -0.20722176849842072,
570
+ -0.8324707460403442,
571
+ -0.8625615048408508,
572
+ 0.48607451796531675,
573
+ -0.30660848736763,
574
+ -0.9315443730354309,
575
+ 0.19041878879070281,
576
+ -0.04380948930978775,
577
+ -0.0050327684171497826,
578
+ -0.05638677150011063,
579
+ -0.26807846426963805,
580
+ -0.9989835453033448,
581
+ -0.329305921792984,
582
+ 0.4013799297809601,
583
+ -0.8769975972175598,
584
+ -1.088063154220581
585
+ ],
586
+ "q99": [
587
+ 0.6066525983810425,
588
+ 0.13703446269035335,
589
+ 0.3049686551094055,
590
+ 0.995180070400238,
591
+ 0.6961594343185422,
592
+ 0.0980641171336174,
593
+ 0.463529108762741,
594
+ 0.9997917461395264,
595
+ 0.7787499904632564,
596
+ 1.0201601552963255,
597
+ 0.3442830562591551,
598
+ 0.1231292974948883,
599
+ 0.30282683849334713,
600
+ 0.9978944087028503,
601
+ 0.41937308669090234,
602
+ 0.0980641171336174,
603
+ 0.27175513744354135,
604
+ 0.999885528087616,
605
+ 0.5146499085426329,
606
+ 1.0030832004547119
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 7145,
632
+ "num_trajectories": 50
633
+ }
634
+ }
1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcbde742d8d0fe9747cabd988a524d92c8d8512cc8c5f28e2853157bf95c38da
3
+ size 2889536104
1e-4/twinvla-scratch-1e-4-aloha_dish_drainer/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/scratch2/jellyho/rebuttal/tabletop-v3/twinvla-scratch-1e-4-aloha_dish_drainer/singlevla_config",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
1e-4/twinvla-scratch-1e-4-aloha_handover_box/config.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": true,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": null,
28
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "None",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "id2label": {
64
+ "0": "LABEL_0",
65
+ "1": "LABEL_1"
66
+ },
67
+ "image_size": 448,
68
+ "is_decoder": false,
69
+ "is_encoder_decoder": false,
70
+ "keep_aspect_ratio": false,
71
+ "knowledge_insulation": false,
72
+ "label2id": {
73
+ "LABEL_0": 0,
74
+ "LABEL_1": 1
75
+ },
76
+ "length_penalty": 1.0,
77
+ "llm_config": {
78
+ "_attn_implementation_autoset": true,
79
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
80
+ "add_cross_attention": false,
81
+ "architectures": [
82
+ "Qwen2ForCausalLM"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "auto_map": {
86
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
87
+ "AutoModel": "modeling_qwen2.Qwen2Model",
88
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
89
+ },
90
+ "bad_words_ids": null,
91
+ "begin_suppress_tokens": null,
92
+ "bos_token_id": 151643,
93
+ "chunk_size_feed_forward": 0,
94
+ "cross_attention_hidden_size": null,
95
+ "decoder_start_token_id": null,
96
+ "diversity_penalty": 0.0,
97
+ "do_sample": false,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": 151645,
101
+ "exponential_decay_length_penalty": null,
102
+ "finetuning_task": null,
103
+ "forced_bos_token_id": null,
104
+ "forced_eos_token_id": null,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 896,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 4864,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 32768,
122
+ "max_window_layers": 21,
123
+ "min_length": 0,
124
+ "model_type": "qwen2",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 14,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 24,
130
+ "num_key_value_heads": 2,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": null,
145
+ "rope_theta": 1000000.0,
146
+ "sep_token_id": null,
147
+ "sliding_window": 32768,
148
+ "suppress_tokens": null,
149
+ "task_specific_params": null,
150
+ "temperature": 1.0,
151
+ "tf_legacy_loss": false,
152
+ "tie_encoder_decoder": false,
153
+ "tie_word_embeddings": true,
154
+ "tokenizer_class": null,
155
+ "top_k": 50,
156
+ "top_p": 1.0,
157
+ "torch_dtype": "bfloat16",
158
+ "torchscript": false,
159
+ "transformers_version": "4.50.0.dev0",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": false,
163
+ "use_sliding_window": false,
164
+ "vocab_size": 151674
165
+ },
166
+ "loss_version": "v4",
167
+ "max_dynamic_patch": 12,
168
+ "max_length": 20,
169
+ "min_dynamic_patch": 1,
170
+ "min_length": 0,
171
+ "mlp_checkpoint": true,
172
+ "model_path": "nvidia/Eagle2-1B",
173
+ "model_type": "Eagle2_1BVLA",
174
+ "modeling": "denoising",
175
+ "no_repeat_ngram_size": 0,
176
+ "normalization": "quantile",
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_readouts": 1,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad2square": false,
185
+ "pad_token_id": null,
186
+ "pre_feature_reduction": false,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "ps_version": "v2",
191
+ "readout_token_as_eos": true,
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "return_text": null,
197
+ "select_layer": -1,
198
+ "sep_token_id": null,
199
+ "state_dim": 10,
200
+ "stopping_token": "|",
201
+ "suppress_tokens": null,
202
+ "task_specific_params": null,
203
+ "temperature": 1.0,
204
+ "template": "qwen2-chat",
205
+ "test_denoising_steps": 10,
206
+ "tf_legacy_loss": false,
207
+ "tie_encoder_decoder": false,
208
+ "tie_word_embeddings": true,
209
+ "tokenizer_class": null,
210
+ "top_k": 50,
211
+ "top_p": 1.0,
212
+ "torch_dtype": "bfloat16",
213
+ "torchscript": false,
214
+ "train_denoising_steps": 100,
215
+ "typical_p": 1.0,
216
+ "use_backbone_lora": 0,
217
+ "use_bfloat16": false,
218
+ "use_llm_lora": 0,
219
+ "use_thumbnail": true,
220
+ "vision_config": {
221
+ "_attn_implementation_autoset": true,
222
+ "_name_or_path": "",
223
+ "add_cross_attention": false,
224
+ "architectures": [
225
+ "SiglipVisionModel"
226
+ ],
227
+ "attention_dropout": 0.0,
228
+ "auto_map": {
229
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
230
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
231
+ },
232
+ "bad_words_ids": null,
233
+ "begin_suppress_tokens": null,
234
+ "bos_token_id": null,
235
+ "chunk_size_feed_forward": 0,
236
+ "cross_attention_hidden_size": null,
237
+ "decoder_start_token_id": null,
238
+ "diversity_penalty": 0.0,
239
+ "do_sample": false,
240
+ "drop_path_rate": 0.1,
241
+ "early_stopping": false,
242
+ "encoder_no_repeat_ngram_size": 0,
243
+ "eos_token_id": null,
244
+ "exponential_decay_length_penalty": null,
245
+ "finetuning_task": null,
246
+ "forced_bos_token_id": null,
247
+ "forced_eos_token_id": null,
248
+ "hidden_act": "gelu_pytorch_tanh",
249
+ "hidden_size": 1152,
250
+ "id2label": {
251
+ "0": "LABEL_0",
252
+ "1": "LABEL_1"
253
+ },
254
+ "image_size": 448,
255
+ "intermediate_size": 4304,
256
+ "is_decoder": false,
257
+ "is_encoder_decoder": false,
258
+ "label2id": {
259
+ "LABEL_0": 0,
260
+ "LABEL_1": 1
261
+ },
262
+ "layer_norm_eps": 1e-06,
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "min_length": 0,
266
+ "model_type": "siglip_vision_model",
267
+ "no_repeat_ngram_size": 0,
268
+ "num_attention_heads": 16,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_channels": 3,
272
+ "num_hidden_layers": 27,
273
+ "num_image_tokens": 1024,
274
+ "num_return_sequences": 1,
275
+ "output_attentions": false,
276
+ "output_hidden_states": false,
277
+ "output_scores": false,
278
+ "pad_token_id": null,
279
+ "patch_size": 14,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "projection_dim": 2048,
283
+ "projector_hidden_act": "gelu_fast",
284
+ "pruned_heads": {},
285
+ "remove_invalid_values": false,
286
+ "repetition_penalty": 1.0,
287
+ "return_dict": true,
288
+ "return_dict_in_generate": false,
289
+ "sep_token_id": null,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tf_legacy_loss": false,
294
+ "tie_encoder_decoder": false,
295
+ "tie_word_embeddings": true,
296
+ "tokenizer_class": null,
297
+ "top_k": 50,
298
+ "top_p": 1.0,
299
+ "torch_dtype": "bfloat16",
300
+ "torchscript": false,
301
+ "transformers_version": "4.50.0.dev0",
302
+ "typical_p": 1.0,
303
+ "use_bfloat16": false,
304
+ "vision_use_head": false
305
+ },
306
+ "vocab_size": 151674,
307
+ "vocab_start": null
308
+ },
309
+ "singlevla_config_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
310
+ "singlevla_pretrained_path": null,
311
+ "state_dim": 10,
312
+ "torch_dtype": "bfloat16",
313
+ "transformers_version": "4.50.0.dev0"
314
+ }
1e-4/twinvla-scratch-1e-4-aloha_handover_box/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_handover_box_new": {
3
+ "action": {
4
+ "mean": [
5
+ 0.322445809841156,
6
+ -0.0713680312037468,
7
+ 0.16350828111171722,
8
+ 0.615584671497345,
9
+ 0.02582639269530773,
10
+ -0.6291787028312683,
11
+ -0.12873496115207672,
12
+ 0.9684517979621887,
13
+ -0.05324753001332283,
14
+ 0.32048356533050537,
15
+ 0.35535329580307007,
16
+ -0.017269128933548927,
17
+ 0.25354719161987305,
18
+ 0.946760356426239,
19
+ -0.1093481183052063,
20
+ -0.16364224255084991,
21
+ 0.10808137059211731,
22
+ 0.9652293920516968,
23
+ -0.08225571364164352,
24
+ 0.6809535622596741
25
+ ],
26
+ "std": [
27
+ 0.07454725354909897,
28
+ 0.08869025856256485,
29
+ 0.07996608316898346,
30
+ 0.3346059024333954,
31
+ 0.19985823333263397,
32
+ 0.2694716155529022,
33
+ 0.12514568865299225,
34
+ 0.030899852514266968,
35
+ 0.16146361827850342,
36
+ 0.9473041296005249,
37
+ 0.06487792730331421,
38
+ 0.0389498695731163,
39
+ 0.027652494609355927,
40
+ 0.10490526258945465,
41
+ 0.18384318053722382,
42
+ 0.1417805403470993,
43
+ 0.20332522690296173,
44
+ 0.06656655669212341,
45
+ 0.06421920657157898,
46
+ 0.7322932481765747
47
+ ],
48
+ "max": [
49
+ 0.48683926463127136,
50
+ 0.0484432689845562,
51
+ 0.31490612030029297,
52
+ 0.99891197681427,
53
+ 0.4277522563934326,
54
+ 0.06322141736745834,
55
+ 0.4004654884338379,
56
+ 0.9999857544898987,
57
+ 0.3100079298019409,
58
+ 1.0,
59
+ 0.5334027409553528,
60
+ 0.08494444936513901,
61
+ 0.36568865180015564,
62
+ 0.9999882578849792,
63
+ 0.2546274662017822,
64
+ 0.1172015443444252,
65
+ 0.7982608079910278,
66
+ 0.9999992251396179,
67
+ 0.20094169676303864,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1422317922115326,
72
+ -0.2763901352882385,
73
+ -0.0600760243833065,
74
+ -0.14848311245441437,
75
+ -0.6282482743263245,
76
+ -0.9999129176139832,
77
+ -0.42181891202926636,
78
+ 0.7404066324234009,
79
+ -0.6676974296569824,
80
+ -1.0,
81
+ 0.1786160171031952,
82
+ -0.1845615804195404,
83
+ 0.1687021553516388,
84
+ 0.2762398421764374,
85
+ -0.7479667067527771,
86
+ -0.8485982418060303,
87
+ -0.2597721517086029,
88
+ 0.6015138626098633,
89
+ -0.3933228552341461,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.1950138956308365,
94
+ -0.24691226959228516,
95
+ -0.015285035967826844,
96
+ -0.04555398792028427,
97
+ -0.4452396559715271,
98
+ -0.996303243637085,
99
+ -0.3760478734970093,
100
+ 0.8516808867454528,
101
+ -0.46342918753623963,
102
+ -1.0,
103
+ 0.21926841557025908,
104
+ -0.1317625629901886,
105
+ 0.1978745412826538,
106
+ 0.5117229986190795,
107
+ -0.6376786828041077,
108
+ -0.6609986042976379,
109
+ -0.19099083304405212,
110
+ 0.6930621123313904,
111
+ -0.2356126993894577,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.47150796771049497,
116
+ 0.038070930540561675,
117
+ 0.28182336688041676,
118
+ 0.9817836880683899,
119
+ 0.3871919810771942,
120
+ -0.1345064049959186,
121
+ 0.20285944879054985,
122
+ 0.9992118668556214,
123
+ 0.2293877118825912,
124
+ 1.0,
125
+ 0.49810330152511595,
126
+ 0.0599309906363487,
127
+ 0.3309180569648742,
128
+ 0.9995350050926208,
129
+ 0.1829529863595952,
130
+ 0.03216676786541939,
131
+ 0.7132800936698909,
132
+ 0.9997488117218017,
133
+ 0.08941484957933345,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.3200362026691437,
162
+ -0.06315236538648605,
163
+ 0.15397155284881592,
164
+ 0.6064433455467224,
165
+ 0.06654603034257889,
166
+ -0.640325129032135,
167
+ -0.09359700232744217,
168
+ 0.9627139568328857,
169
+ 0.03187317028641701,
170
+ 0.6514412760734558,
171
+ 0.35154762864112854,
172
+ -0.018256496638059616,
173
+ 0.2389756739139557,
174
+ 0.9405066967010498,
175
+ -0.11245886981487274,
176
+ -0.2019510418176651,
177
+ 0.10934194922447205,
178
+ 0.9565088152885437,
179
+ -0.08098198473453522,
180
+ 0.8085420727729797
181
+ ],
182
+ "std": [
183
+ 0.07218372821807861,
184
+ 0.07983937114477158,
185
+ 0.08212247490882874,
186
+ 0.32560110092163086,
187
+ 0.1554078459739685,
188
+ 0.29600194096565247,
189
+ 0.147041916847229,
190
+ 0.06898749619722366,
191
+ 0.1924201399087906,
192
+ 0.47606807947158813,
193
+ 0.06400062888860703,
194
+ 0.03751807287335396,
195
+ 0.032336752861738205,
196
+ 0.10509955137968063,
197
+ 0.1779795140028,
198
+ 0.13889142870903015,
199
+ 0.20048992335796356,
200
+ 0.082735575735569,
201
+ 0.13973525166511536,
202
+ 0.40020087361335754
203
+ ],
204
+ "max": [
205
+ 0.47570154070854187,
206
+ 0.08932404220104218,
207
+ 0.44513142108917236,
208
+ 0.9999915361404419,
209
+ 0.6316148042678833,
210
+ 0.7311769127845764,
211
+ 0.5646719932556152,
212
+ 1.0,
213
+ 0.9345466494560242,
214
+ 1.3299691677093506,
215
+ 0.5250220894813538,
216
+ 0.07912999391555786,
217
+ 0.41775044798851013,
218
+ 0.9999979138374329,
219
+ 0.2288104146718979,
220
+ 0.2556033134460449,
221
+ 0.7930954098701477,
222
+ 1.0,
223
+ 0.8460071086883545,
224
+ 1.1448447704315186
225
+ ],
226
+ "min": [
227
+ 0.15507374703884125,
228
+ -0.24968452751636505,
229
+ -0.005626574158668518,
230
+ -0.12249666452407837,
231
+ -0.3874700665473938,
232
+ -1.0,
233
+ -0.8481224179267883,
234
+ 0.28493279218673706,
235
+ -0.8170893788337708,
236
+ -1.083611011505127,
237
+ 0.18484443426132202,
238
+ -0.1679670214653015,
239
+ 0.1543029397726059,
240
+ 0.2590605616569519,
241
+ -0.7203781604766846,
242
+ -0.8606433272361755,
243
+ -0.2443554699420929,
244
+ 0.2216777801513672,
245
+ -0.9731146693229675,
246
+ -1.0848060846328735
247
+ ],
248
+ "q01": [
249
+ 0.1903739631175995,
250
+ -0.22257488489151,
251
+ -0.0036250025033950804,
252
+ -0.015333320312201977,
253
+ -0.2553225290775299,
254
+ -0.9997995805740356,
255
+ -0.3545967137813568,
256
+ 0.6295642066001892,
257
+ -0.32733017563819883,
258
+ -0.4065189242362976,
259
+ 0.22028838396072387,
260
+ -0.1278022611141205,
261
+ 0.17875114858150482,
262
+ 0.488557243347168,
263
+ -0.6262442255020142,
264
+ -0.6858670902252197,
265
+ -0.17815817892551422,
266
+ 0.6348884439468384,
267
+ -0.5856496715545654,
268
+ -0.4086606001853943
269
+ ],
270
+ "q99": [
271
+ 0.4643457818031311,
272
+ 0.05302721098065367,
273
+ 0.32663319587707507,
274
+ 0.995180070400238,
275
+ 0.426870135068893,
276
+ 0.18705489814281454,
277
+ 0.3631119978427884,
278
+ 0.9999364447593689,
279
+ 0.7475578069686883,
280
+ 1.178509011268615,
281
+ 0.4939642870426177,
282
+ 0.051381030380725806,
283
+ 0.3385275864601135,
284
+ 0.999157931804657,
285
+ 0.16684140086173982,
286
+ 0.05098062053322772,
287
+ 0.7065742087364195,
288
+ 0.9998370099067688,
289
+ 0.5137611627578699,
290
+ 1.0447997903823851
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 11829,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_handover_box": {
319
+ "action": {
320
+ "mean": [
321
+ 0.322445809841156,
322
+ -0.0713680312037468,
323
+ 0.16350828111171722,
324
+ 0.615584671497345,
325
+ 0.02582639269530773,
326
+ -0.6291787028312683,
327
+ -0.12873496115207672,
328
+ 0.9684517979621887,
329
+ -0.05324753001332283,
330
+ 0.32048356533050537,
331
+ 0.35535329580307007,
332
+ -0.017269128933548927,
333
+ 0.25354719161987305,
334
+ 0.946760356426239,
335
+ -0.1093481183052063,
336
+ -0.16364224255084991,
337
+ 0.10808137059211731,
338
+ 0.9652293920516968,
339
+ -0.08225571364164352,
340
+ 0.6809535622596741
341
+ ],
342
+ "std": [
343
+ 0.07454725354909897,
344
+ 0.08869025856256485,
345
+ 0.07996608316898346,
346
+ 0.3346059024333954,
347
+ 0.19985823333263397,
348
+ 0.2694716155529022,
349
+ 0.12514568865299225,
350
+ 0.030899852514266968,
351
+ 0.16146361827850342,
352
+ 0.9473041296005249,
353
+ 0.06487792730331421,
354
+ 0.0389498695731163,
355
+ 0.027652494609355927,
356
+ 0.10490526258945465,
357
+ 0.18384318053722382,
358
+ 0.1417805403470993,
359
+ 0.20332522690296173,
360
+ 0.06656655669212341,
361
+ 0.06421920657157898,
362
+ 0.7322932481765747
363
+ ],
364
+ "max": [
365
+ 0.48683926463127136,
366
+ 0.0484432689845562,
367
+ 0.31490612030029297,
368
+ 0.99891197681427,
369
+ 0.4277522563934326,
370
+ 0.06322141736745834,
371
+ 0.4004654884338379,
372
+ 0.9999857544898987,
373
+ 0.3100079298019409,
374
+ 1.0,
375
+ 0.5334027409553528,
376
+ 0.08494444936513901,
377
+ 0.36568865180015564,
378
+ 0.9999882578849792,
379
+ 0.2546274662017822,
380
+ 0.1172015443444252,
381
+ 0.7982608079910278,
382
+ 0.9999992251396179,
383
+ 0.20094169676303864,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.1422317922115326,
388
+ -0.2763901352882385,
389
+ -0.0600760243833065,
390
+ -0.14848311245441437,
391
+ -0.6282482743263245,
392
+ -0.9999129176139832,
393
+ -0.42181891202926636,
394
+ 0.7404066324234009,
395
+ -0.6676974296569824,
396
+ -1.0,
397
+ 0.1786160171031952,
398
+ -0.1845615804195404,
399
+ 0.1687021553516388,
400
+ 0.2762398421764374,
401
+ -0.7479667067527771,
402
+ -0.8485982418060303,
403
+ -0.2597721517086029,
404
+ 0.6015138626098633,
405
+ -0.3933228552341461,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.1950138956308365,
410
+ -0.24691226959228516,
411
+ -0.015285035967826844,
412
+ -0.04555398792028427,
413
+ -0.4452396559715271,
414
+ -0.996303243637085,
415
+ -0.3760478734970093,
416
+ 0.8516808867454528,
417
+ -0.46342918753623963,
418
+ -1.0,
419
+ 0.21926841557025908,
420
+ -0.1317625629901886,
421
+ 0.1978745412826538,
422
+ 0.5117229986190795,
423
+ -0.6376786828041077,
424
+ -0.6609986042976379,
425
+ -0.19099083304405212,
426
+ 0.6930621123313904,
427
+ -0.2356126993894577,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.47150796771049497,
432
+ 0.038070930540561675,
433
+ 0.28182336688041676,
434
+ 0.9817836880683899,
435
+ 0.3871919810771942,
436
+ -0.1345064049959186,
437
+ 0.20285944879054985,
438
+ 0.9992118668556214,
439
+ 0.2293877118825912,
440
+ 1.0,
441
+ 0.49810330152511595,
442
+ 0.0599309906363487,
443
+ 0.3309180569648742,
444
+ 0.9995350050926208,
445
+ 0.1829529863595952,
446
+ 0.03216676786541939,
447
+ 0.7132800936698909,
448
+ 0.9997488117218017,
449
+ 0.08941484957933345,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.3200362026691437,
478
+ -0.06315236538648605,
479
+ 0.15397155284881592,
480
+ 0.6064433455467224,
481
+ 0.06654603034257889,
482
+ -0.640325129032135,
483
+ -0.09359700232744217,
484
+ 0.9627139568328857,
485
+ 0.03187317028641701,
486
+ 0.6514412760734558,
487
+ 0.35154762864112854,
488
+ -0.018256496638059616,
489
+ 0.2389756739139557,
490
+ 0.9405066967010498,
491
+ -0.11245886981487274,
492
+ -0.2019510418176651,
493
+ 0.10934194922447205,
494
+ 0.9565088152885437,
495
+ -0.08098198473453522,
496
+ 0.8085420727729797
497
+ ],
498
+ "std": [
499
+ 0.07218372821807861,
500
+ 0.07983937114477158,
501
+ 0.08212247490882874,
502
+ 0.32560110092163086,
503
+ 0.1554078459739685,
504
+ 0.29600194096565247,
505
+ 0.147041916847229,
506
+ 0.06898749619722366,
507
+ 0.1924201399087906,
508
+ 0.47606807947158813,
509
+ 0.06400062888860703,
510
+ 0.03751807287335396,
511
+ 0.032336752861738205,
512
+ 0.10509955137968063,
513
+ 0.1779795140028,
514
+ 0.13889142870903015,
515
+ 0.20048992335796356,
516
+ 0.082735575735569,
517
+ 0.13973525166511536,
518
+ 0.40020087361335754
519
+ ],
520
+ "max": [
521
+ 0.47570154070854187,
522
+ 0.08932404220104218,
523
+ 0.44513142108917236,
524
+ 0.9999915361404419,
525
+ 0.6316148042678833,
526
+ 0.7311769127845764,
527
+ 0.5646719932556152,
528
+ 1.0,
529
+ 0.9345466494560242,
530
+ 1.3299691677093506,
531
+ 0.5250220894813538,
532
+ 0.07912999391555786,
533
+ 0.41775044798851013,
534
+ 0.9999979138374329,
535
+ 0.2288104146718979,
536
+ 0.2556033134460449,
537
+ 0.7930954098701477,
538
+ 1.0,
539
+ 0.8460071086883545,
540
+ 1.1448447704315186
541
+ ],
542
+ "min": [
543
+ 0.15507374703884125,
544
+ -0.24968452751636505,
545
+ -0.005626574158668518,
546
+ -0.12249666452407837,
547
+ -0.3874700665473938,
548
+ -1.0,
549
+ -0.8481224179267883,
550
+ 0.28493279218673706,
551
+ -0.8170893788337708,
552
+ -1.083611011505127,
553
+ 0.18484443426132202,
554
+ -0.1679670214653015,
555
+ 0.1543029397726059,
556
+ 0.2590605616569519,
557
+ -0.7203781604766846,
558
+ -0.8606433272361755,
559
+ -0.2443554699420929,
560
+ 0.2216777801513672,
561
+ -0.9731146693229675,
562
+ -1.0848060846328735
563
+ ],
564
+ "q01": [
565
+ 0.1903739631175995,
566
+ -0.22257488489151,
567
+ -0.0036250025033950804,
568
+ -0.015333320312201977,
569
+ -0.2553225290775299,
570
+ -0.9997995805740356,
571
+ -0.3545967137813568,
572
+ 0.6295642066001892,
573
+ -0.32733017563819883,
574
+ -0.4065189242362976,
575
+ 0.22028838396072387,
576
+ -0.1278022611141205,
577
+ 0.17875114858150482,
578
+ 0.488557243347168,
579
+ -0.6262442255020142,
580
+ -0.6858670902252197,
581
+ -0.17815817892551422,
582
+ 0.6348884439468384,
583
+ -0.5856496715545654,
584
+ -0.4086606001853943
585
+ ],
586
+ "q99": [
587
+ 0.4643457818031311,
588
+ 0.05302721098065367,
589
+ 0.32663319587707507,
590
+ 0.995180070400238,
591
+ 0.426870135068893,
592
+ 0.18705489814281454,
593
+ 0.3631119978427884,
594
+ 0.9999364447593689,
595
+ 0.7475578069686883,
596
+ 1.178509011268615,
597
+ 0.4939642870426177,
598
+ 0.051381030380725806,
599
+ 0.3385275864601135,
600
+ 0.999157931804657,
601
+ 0.16684140086173982,
602
+ 0.05098062053322772,
603
+ 0.7065742087364195,
604
+ 0.9998370099067688,
605
+ 0.5137611627578699,
606
+ 1.0447997903823851
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 11829,
632
+ "num_trajectories": 50
633
+ }
634
+ }
1e-4/twinvla-scratch-1e-4-aloha_handover_box/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ef8ab0367cab07a1e4d78f1285753400e6b24622a602d21ecb5a635be5d8704
3
+ size 2889536104
1e-4/twinvla-scratch-1e-4-aloha_handover_box/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
1e-4/twinvla-scratch-1e-4-aloha_lift_box/config.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": true,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": null,
28
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "None",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "id2label": {
64
+ "0": "LABEL_0",
65
+ "1": "LABEL_1"
66
+ },
67
+ "image_size": 448,
68
+ "is_decoder": false,
69
+ "is_encoder_decoder": false,
70
+ "keep_aspect_ratio": false,
71
+ "knowledge_insulation": false,
72
+ "label2id": {
73
+ "LABEL_0": 0,
74
+ "LABEL_1": 1
75
+ },
76
+ "length_penalty": 1.0,
77
+ "llm_config": {
78
+ "_attn_implementation_autoset": true,
79
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
80
+ "add_cross_attention": false,
81
+ "architectures": [
82
+ "Qwen2ForCausalLM"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "auto_map": {
86
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
87
+ "AutoModel": "modeling_qwen2.Qwen2Model",
88
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
89
+ },
90
+ "bad_words_ids": null,
91
+ "begin_suppress_tokens": null,
92
+ "bos_token_id": 151643,
93
+ "chunk_size_feed_forward": 0,
94
+ "cross_attention_hidden_size": null,
95
+ "decoder_start_token_id": null,
96
+ "diversity_penalty": 0.0,
97
+ "do_sample": false,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": 151645,
101
+ "exponential_decay_length_penalty": null,
102
+ "finetuning_task": null,
103
+ "forced_bos_token_id": null,
104
+ "forced_eos_token_id": null,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 896,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 4864,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 32768,
122
+ "max_window_layers": 21,
123
+ "min_length": 0,
124
+ "model_type": "qwen2",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 14,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 24,
130
+ "num_key_value_heads": 2,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": null,
145
+ "rope_theta": 1000000.0,
146
+ "sep_token_id": null,
147
+ "sliding_window": 32768,
148
+ "suppress_tokens": null,
149
+ "task_specific_params": null,
150
+ "temperature": 1.0,
151
+ "tf_legacy_loss": false,
152
+ "tie_encoder_decoder": false,
153
+ "tie_word_embeddings": true,
154
+ "tokenizer_class": null,
155
+ "top_k": 50,
156
+ "top_p": 1.0,
157
+ "torch_dtype": "bfloat16",
158
+ "torchscript": false,
159
+ "transformers_version": "4.50.0.dev0",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": false,
163
+ "use_sliding_window": false,
164
+ "vocab_size": 151674
165
+ },
166
+ "loss_version": "v4",
167
+ "max_dynamic_patch": 12,
168
+ "max_length": 20,
169
+ "min_dynamic_patch": 1,
170
+ "min_length": 0,
171
+ "mlp_checkpoint": true,
172
+ "model_path": "nvidia/Eagle2-1B",
173
+ "model_type": "Eagle2_1BVLA",
174
+ "modeling": "denoising",
175
+ "no_repeat_ngram_size": 0,
176
+ "normalization": "quantile",
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_readouts": 1,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad2square": false,
185
+ "pad_token_id": null,
186
+ "pre_feature_reduction": false,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "ps_version": "v2",
191
+ "readout_token_as_eos": true,
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "return_text": null,
197
+ "select_layer": -1,
198
+ "sep_token_id": null,
199
+ "state_dim": 10,
200
+ "stopping_token": "|",
201
+ "suppress_tokens": null,
202
+ "task_specific_params": null,
203
+ "temperature": 1.0,
204
+ "template": "qwen2-chat",
205
+ "test_denoising_steps": 10,
206
+ "tf_legacy_loss": false,
207
+ "tie_encoder_decoder": false,
208
+ "tie_word_embeddings": true,
209
+ "tokenizer_class": null,
210
+ "top_k": 50,
211
+ "top_p": 1.0,
212
+ "torch_dtype": "bfloat16",
213
+ "torchscript": false,
214
+ "train_denoising_steps": 100,
215
+ "typical_p": 1.0,
216
+ "use_backbone_lora": 0,
217
+ "use_bfloat16": false,
218
+ "use_llm_lora": 0,
219
+ "use_thumbnail": true,
220
+ "vision_config": {
221
+ "_attn_implementation_autoset": true,
222
+ "_name_or_path": "",
223
+ "add_cross_attention": false,
224
+ "architectures": [
225
+ "SiglipVisionModel"
226
+ ],
227
+ "attention_dropout": 0.0,
228
+ "auto_map": {
229
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
230
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
231
+ },
232
+ "bad_words_ids": null,
233
+ "begin_suppress_tokens": null,
234
+ "bos_token_id": null,
235
+ "chunk_size_feed_forward": 0,
236
+ "cross_attention_hidden_size": null,
237
+ "decoder_start_token_id": null,
238
+ "diversity_penalty": 0.0,
239
+ "do_sample": false,
240
+ "drop_path_rate": 0.1,
241
+ "early_stopping": false,
242
+ "encoder_no_repeat_ngram_size": 0,
243
+ "eos_token_id": null,
244
+ "exponential_decay_length_penalty": null,
245
+ "finetuning_task": null,
246
+ "forced_bos_token_id": null,
247
+ "forced_eos_token_id": null,
248
+ "hidden_act": "gelu_pytorch_tanh",
249
+ "hidden_size": 1152,
250
+ "id2label": {
251
+ "0": "LABEL_0",
252
+ "1": "LABEL_1"
253
+ },
254
+ "image_size": 448,
255
+ "intermediate_size": 4304,
256
+ "is_decoder": false,
257
+ "is_encoder_decoder": false,
258
+ "label2id": {
259
+ "LABEL_0": 0,
260
+ "LABEL_1": 1
261
+ },
262
+ "layer_norm_eps": 1e-06,
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "min_length": 0,
266
+ "model_type": "siglip_vision_model",
267
+ "no_repeat_ngram_size": 0,
268
+ "num_attention_heads": 16,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_channels": 3,
272
+ "num_hidden_layers": 27,
273
+ "num_image_tokens": 1024,
274
+ "num_return_sequences": 1,
275
+ "output_attentions": false,
276
+ "output_hidden_states": false,
277
+ "output_scores": false,
278
+ "pad_token_id": null,
279
+ "patch_size": 14,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "projection_dim": 2048,
283
+ "projector_hidden_act": "gelu_fast",
284
+ "pruned_heads": {},
285
+ "remove_invalid_values": false,
286
+ "repetition_penalty": 1.0,
287
+ "return_dict": true,
288
+ "return_dict_in_generate": false,
289
+ "sep_token_id": null,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tf_legacy_loss": false,
294
+ "tie_encoder_decoder": false,
295
+ "tie_word_embeddings": true,
296
+ "tokenizer_class": null,
297
+ "top_k": 50,
298
+ "top_p": 1.0,
299
+ "torch_dtype": "bfloat16",
300
+ "torchscript": false,
301
+ "transformers_version": "4.50.0.dev0",
302
+ "typical_p": 1.0,
303
+ "use_bfloat16": false,
304
+ "vision_use_head": false
305
+ },
306
+ "vocab_size": 151674,
307
+ "vocab_start": null
308
+ },
309
+ "singlevla_config_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
310
+ "singlevla_pretrained_path": null,
311
+ "state_dim": 10,
312
+ "torch_dtype": "bfloat16",
313
+ "transformers_version": "4.50.0.dev0"
314
+ }
1e-4/twinvla-scratch-1e-4-aloha_lift_box/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_lift_box_new": {
3
+ "action": {
4
+ "mean": [
5
+ 0.3633340299129486,
6
+ -0.0188243817538023,
7
+ 0.1790345162153244,
8
+ 0.4083189070224762,
9
+ -0.11689117550849915,
10
+ -0.8073354959487915,
11
+ -0.10573232173919678,
12
+ 0.9415333271026611,
13
+ -0.16247375309467316,
14
+ 0.619253396987915,
15
+ 0.36085841059684753,
16
+ 0.013982076197862625,
17
+ 0.20412708818912506,
18
+ 0.5001599788665771,
19
+ 0.11137505620718002,
20
+ -0.7415233254432678,
21
+ 0.05212549492716789,
22
+ 0.9483596682548523,
23
+ 0.16254937648773193,
24
+ 0.7590736150741577
25
+ ],
26
+ "std": [
27
+ 0.0638059601187706,
28
+ 0.06317954510450363,
29
+ 0.11073724925518036,
30
+ 0.31736424565315247,
31
+ 0.13928908109664917,
32
+ 0.21841377019882202,
33
+ 0.22394296526908875,
34
+ 0.08008279651403427,
35
+ 0.1392110288143158,
36
+ 0.7852216362953186,
37
+ 0.056878820061683655,
38
+ 0.059404969215393066,
39
+ 0.1170634776353836,
40
+ 0.3238432705402374,
41
+ 0.14080214500427246,
42
+ 0.25074857473373413,
43
+ 0.2164432257413864,
44
+ 0.07544830441474915,
45
+ 0.1375824362039566,
46
+ 0.651036262512207
47
+ ],
48
+ "max": [
49
+ 0.5681452751159668,
50
+ 0.2437673658132553,
51
+ 0.45541316270828247,
52
+ 0.9999293088912964,
53
+ 0.523757815361023,
54
+ 0.4592168927192688,
55
+ 0.7756927013397217,
56
+ 0.9999935030937195,
57
+ 0.2805824279785156,
58
+ 1.0,
59
+ 0.5600330233573914,
60
+ 0.3342031240463257,
61
+ 0.4682213366031647,
62
+ 0.9998393058776855,
63
+ 0.7949740886688232,
64
+ 0.1664249449968338,
65
+ 0.9131186604499817,
66
+ 0.9999967813491821,
67
+ 0.7936055064201355,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1517709195613861,
72
+ -0.2900286316871643,
73
+ -0.07412093877792358,
74
+ -0.4022133946418762,
75
+ -0.7361933588981628,
76
+ -0.9999988079071045,
77
+ -0.9935019016265869,
78
+ 0.10709662735462189,
79
+ -0.8023554682731628,
80
+ -1.0,
81
+ 0.15366072952747345,
82
+ -0.23686714470386505,
83
+ 0.0008372184820473194,
84
+ -0.5509981513023376,
85
+ -0.35234102606773376,
86
+ -0.999956488609314,
87
+ -0.5318384766578674,
88
+ 0.3388061225414276,
89
+ -0.27330997586250305,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.21054682105779648,
94
+ -0.1866426882147789,
95
+ 0.008138886513188495,
96
+ -0.19710821226239203,
97
+ -0.5368945515155792,
98
+ -0.9981186389923096,
99
+ -0.6956261324882507,
100
+ 0.6267582887411117,
101
+ -0.5600040704011917,
102
+ -1.0,
103
+ 0.2190245844423771,
104
+ -0.15968348175287247,
105
+ 0.025033411756157874,
106
+ -0.23832830414175987,
107
+ -0.2097599548101425,
108
+ -0.9988620406389237,
109
+ -0.4039672353863716,
110
+ 0.6080100274085999,
111
+ -0.19206354618072508,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.5033414244651794,
116
+ 0.16928535521030416,
117
+ 0.41566276580095285,
118
+ 0.9899059218168258,
119
+ 0.15462822496891018,
120
+ 0.03764873944222882,
121
+ 0.4657947558164549,
122
+ 0.9995575082302094,
123
+ 0.12326683558523567,
124
+ 1.0,
125
+ 0.4801343524456022,
126
+ 0.1795493066310881,
127
+ 0.4235989159345625,
128
+ 0.9913575077056883,
129
+ 0.5356137681007356,
130
+ 0.044951977618036626,
131
+ 0.7084567189216593,
132
+ 0.9992782145738601,
133
+ 0.6150667482614517,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.3302275836467743,
162
+ -0.022152910009026527,
163
+ 0.15129828453063965,
164
+ 0.2713927626609802,
165
+ -0.12320414930582047,
166
+ -0.8610084652900696,
167
+ -0.11408621817827225,
168
+ 0.939403235912323,
169
+ -0.15196871757507324,
170
+ 0.565762460231781,
171
+ 0.32600972056388855,
172
+ 0.012888750061392784,
173
+ 0.17190414667129517,
174
+ 0.3495619595050812,
175
+ 0.11379297822713852,
176
+ -0.8095759749412537,
177
+ 0.03240593522787094,
178
+ 0.9490123391151428,
179
+ 0.14386720955371857,
180
+ 0.7014500498771667
181
+ ],
182
+ "std": [
183
+ 0.0579490028321743,
184
+ 0.07013536244630814,
185
+ 0.09934847801923752,
186
+ 0.311722993850708,
187
+ 0.13217779994010925,
188
+ 0.23493210971355438,
189
+ 0.22829721868038177,
190
+ 0.08239603042602539,
191
+ 0.1500341296195984,
192
+ 0.85152268409729,
193
+ 0.04813271760940552,
194
+ 0.06810667365789413,
195
+ 0.10921778529882431,
196
+ 0.34190264344215393,
197
+ 0.14661571383476257,
198
+ 0.26654884219169617,
199
+ 0.22192326188087463,
200
+ 0.08192053437232971,
201
+ 0.1472003310918808,
202
+ 0.7158037424087524
203
+ ],
204
+ "max": [
205
+ 0.559941291809082,
206
+ 0.26086756587028503,
207
+ 0.4504527747631073,
208
+ 0.9999247789382935,
209
+ 0.4198993146419525,
210
+ 0.3512286841869354,
211
+ 0.7522457242012024,
212
+ 1.0,
213
+ 0.8956095576286316,
214
+ 1.470957636833191,
215
+ 0.5437091588973999,
216
+ 0.32627788186073303,
217
+ 0.4945259988307953,
218
+ 0.9998428821563721,
219
+ 0.7737792730331421,
220
+ 0.4633983373641968,
221
+ 0.9018308520317078,
222
+ 1.0,
223
+ 0.9907073378562927,
224
+ 1.361535668373108
225
+ ],
226
+ "min": [
227
+ 0.16681896150112152,
228
+ -0.20499344170093536,
229
+ -0.0030731656588613987,
230
+ -0.4872298836708069,
231
+ -0.6995252966880798,
232
+ -0.999997615814209,
233
+ -0.988165020942688,
234
+ 0.14152538776397705,
235
+ -0.8483264446258545,
236
+ -1.2196638584136963,
237
+ 0.14598572254180908,
238
+ -0.2277291864156723,
239
+ 0.004666368011385202,
240
+ -0.5699886679649353,
241
+ -0.40678924322128296,
242
+ -0.9999999403953552,
243
+ -0.6972882151603699,
244
+ 0.13462646305561066,
245
+ -0.643044650554657,
246
+ -1.164451003074646
247
+ ],
248
+ "q01": [
249
+ 0.2053149801492691,
250
+ -0.17586381256580352,
251
+ 0.015469378884881736,
252
+ -0.2516648331284523,
253
+ -0.5193796420097351,
254
+ -0.9995058274269104,
255
+ -0.7092818850278855,
256
+ 0.608681161403656,
257
+ -0.578884813785553,
258
+ -1.1618710005283355,
259
+ 0.21638940930366515,
260
+ -0.1691040216386318,
261
+ 0.011891756923869252,
262
+ -0.29012590169906616,
263
+ -0.20126488715410232,
264
+ -0.9995589327812194,
265
+ -0.49963704913854595,
266
+ 0.533765652179718,
267
+ -0.18726778730750085,
268
+ -1.082753186225891
269
+ ],
270
+ "q99": [
271
+ 0.5071819436550137,
272
+ 0.165744510143995,
273
+ 0.40272374808788297,
274
+ 0.995180070400238,
275
+ 0.16266889929771197,
276
+ 0.09040380395948588,
277
+ 0.5001266032457347,
278
+ 0.9997656464576721,
279
+ 0.10759550034999843,
280
+ 1.4176189756393425,
281
+ 0.47452601760625834,
282
+ 0.1839943121373646,
283
+ 0.40895662158727647,
284
+ 0.995180070400238,
285
+ 0.5622373461723318,
286
+ 0.07441098906099738,
287
+ 0.7114433652162524,
288
+ 0.999856880903244,
289
+ 0.5974926966428754,
290
+ 1.321595377922058
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 11572,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_lift_box": {
319
+ "action": {
320
+ "mean": [
321
+ 0.3633340299129486,
322
+ -0.0188243817538023,
323
+ 0.1790345162153244,
324
+ 0.4083189070224762,
325
+ -0.11689117550849915,
326
+ -0.8073354959487915,
327
+ -0.10573232173919678,
328
+ 0.9415333271026611,
329
+ -0.16247375309467316,
330
+ 0.619253396987915,
331
+ 0.36085841059684753,
332
+ 0.013982076197862625,
333
+ 0.20412708818912506,
334
+ 0.5001599788665771,
335
+ 0.11137505620718002,
336
+ -0.7415233254432678,
337
+ 0.05212549492716789,
338
+ 0.9483596682548523,
339
+ 0.16254937648773193,
340
+ 0.7590736150741577
341
+ ],
342
+ "std": [
343
+ 0.0638059601187706,
344
+ 0.06317954510450363,
345
+ 0.11073724925518036,
346
+ 0.31736424565315247,
347
+ 0.13928908109664917,
348
+ 0.21841377019882202,
349
+ 0.22394296526908875,
350
+ 0.08008279651403427,
351
+ 0.1392110288143158,
352
+ 0.7852216362953186,
353
+ 0.056878820061683655,
354
+ 0.059404969215393066,
355
+ 0.1170634776353836,
356
+ 0.3238432705402374,
357
+ 0.14080214500427246,
358
+ 0.25074857473373413,
359
+ 0.2164432257413864,
360
+ 0.07544830441474915,
361
+ 0.1375824362039566,
362
+ 0.651036262512207
363
+ ],
364
+ "max": [
365
+ 0.5681452751159668,
366
+ 0.2437673658132553,
367
+ 0.45541316270828247,
368
+ 0.9999293088912964,
369
+ 0.523757815361023,
370
+ 0.4592168927192688,
371
+ 0.7756927013397217,
372
+ 0.9999935030937195,
373
+ 0.2805824279785156,
374
+ 1.0,
375
+ 0.5600330233573914,
376
+ 0.3342031240463257,
377
+ 0.4682213366031647,
378
+ 0.9998393058776855,
379
+ 0.7949740886688232,
380
+ 0.1664249449968338,
381
+ 0.9131186604499817,
382
+ 0.9999967813491821,
383
+ 0.7936055064201355,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.1517709195613861,
388
+ -0.2900286316871643,
389
+ -0.07412093877792358,
390
+ -0.4022133946418762,
391
+ -0.7361933588981628,
392
+ -0.9999988079071045,
393
+ -0.9935019016265869,
394
+ 0.10709662735462189,
395
+ -0.8023554682731628,
396
+ -1.0,
397
+ 0.15366072952747345,
398
+ -0.23686714470386505,
399
+ 0.0008372184820473194,
400
+ -0.5509981513023376,
401
+ -0.35234102606773376,
402
+ -0.999956488609314,
403
+ -0.5318384766578674,
404
+ 0.3388061225414276,
405
+ -0.27330997586250305,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.21054682105779648,
410
+ -0.1866426882147789,
411
+ 0.008138886513188495,
412
+ -0.19710821226239203,
413
+ -0.5368945515155792,
414
+ -0.9981186389923096,
415
+ -0.6956261324882507,
416
+ 0.6267582887411117,
417
+ -0.5600040704011917,
418
+ -1.0,
419
+ 0.2190245844423771,
420
+ -0.15968348175287247,
421
+ 0.025033411756157874,
422
+ -0.23832830414175987,
423
+ -0.2097599548101425,
424
+ -0.9988620406389237,
425
+ -0.4039672353863716,
426
+ 0.6080100274085999,
427
+ -0.19206354618072508,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.5033414244651794,
432
+ 0.16928535521030416,
433
+ 0.41566276580095285,
434
+ 0.9899059218168258,
435
+ 0.15462822496891018,
436
+ 0.03764873944222882,
437
+ 0.4657947558164549,
438
+ 0.9995575082302094,
439
+ 0.12326683558523567,
440
+ 1.0,
441
+ 0.4801343524456022,
442
+ 0.1795493066310881,
443
+ 0.4235989159345625,
444
+ 0.9913575077056883,
445
+ 0.5356137681007356,
446
+ 0.044951977618036626,
447
+ 0.7084567189216593,
448
+ 0.9992782145738601,
449
+ 0.6150667482614517,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.3302275836467743,
478
+ -0.022152910009026527,
479
+ 0.15129828453063965,
480
+ 0.2713927626609802,
481
+ -0.12320414930582047,
482
+ -0.8610084652900696,
483
+ -0.11408621817827225,
484
+ 0.939403235912323,
485
+ -0.15196871757507324,
486
+ 0.565762460231781,
487
+ 0.32600972056388855,
488
+ 0.012888750061392784,
489
+ 0.17190414667129517,
490
+ 0.3495619595050812,
491
+ 0.11379297822713852,
492
+ -0.8095759749412537,
493
+ 0.03240593522787094,
494
+ 0.9490123391151428,
495
+ 0.14386720955371857,
496
+ 0.7014500498771667
497
+ ],
498
+ "std": [
499
+ 0.0579490028321743,
500
+ 0.07013536244630814,
501
+ 0.09934847801923752,
502
+ 0.311722993850708,
503
+ 0.13217779994010925,
504
+ 0.23493210971355438,
505
+ 0.22829721868038177,
506
+ 0.08239603042602539,
507
+ 0.1500341296195984,
508
+ 0.85152268409729,
509
+ 0.04813271760940552,
510
+ 0.06810667365789413,
511
+ 0.10921778529882431,
512
+ 0.34190264344215393,
513
+ 0.14661571383476257,
514
+ 0.26654884219169617,
515
+ 0.22192326188087463,
516
+ 0.08192053437232971,
517
+ 0.1472003310918808,
518
+ 0.7158037424087524
519
+ ],
520
+ "max": [
521
+ 0.559941291809082,
522
+ 0.26086756587028503,
523
+ 0.4504527747631073,
524
+ 0.9999247789382935,
525
+ 0.4198993146419525,
526
+ 0.3512286841869354,
527
+ 0.7522457242012024,
528
+ 1.0,
529
+ 0.8956095576286316,
530
+ 1.470957636833191,
531
+ 0.5437091588973999,
532
+ 0.32627788186073303,
533
+ 0.4945259988307953,
534
+ 0.9998428821563721,
535
+ 0.7737792730331421,
536
+ 0.4633983373641968,
537
+ 0.9018308520317078,
538
+ 1.0,
539
+ 0.9907073378562927,
540
+ 1.361535668373108
541
+ ],
542
+ "min": [
543
+ 0.16681896150112152,
544
+ -0.20499344170093536,
545
+ -0.0030731656588613987,
546
+ -0.4872298836708069,
547
+ -0.6995252966880798,
548
+ -0.999997615814209,
549
+ -0.988165020942688,
550
+ 0.14152538776397705,
551
+ -0.8483264446258545,
552
+ -1.2196638584136963,
553
+ 0.14598572254180908,
554
+ -0.2277291864156723,
555
+ 0.004666368011385202,
556
+ -0.5699886679649353,
557
+ -0.40678924322128296,
558
+ -0.9999999403953552,
559
+ -0.6972882151603699,
560
+ 0.13462646305561066,
561
+ -0.643044650554657,
562
+ -1.164451003074646
563
+ ],
564
+ "q01": [
565
+ 0.2053149801492691,
566
+ -0.17586381256580352,
567
+ 0.015469378884881736,
568
+ -0.2516648331284523,
569
+ -0.5193796420097351,
570
+ -0.9995058274269104,
571
+ -0.7092818850278855,
572
+ 0.608681161403656,
573
+ -0.578884813785553,
574
+ -1.1618710005283355,
575
+ 0.21638940930366515,
576
+ -0.1691040216386318,
577
+ 0.011891756923869252,
578
+ -0.29012590169906616,
579
+ -0.20126488715410232,
580
+ -0.9995589327812194,
581
+ -0.49963704913854595,
582
+ 0.533765652179718,
583
+ -0.18726778730750085,
584
+ -1.082753186225891
585
+ ],
586
+ "q99": [
587
+ 0.5071819436550137,
588
+ 0.165744510143995,
589
+ 0.40272374808788297,
590
+ 0.995180070400238,
591
+ 0.16266889929771197,
592
+ 0.09040380395948588,
593
+ 0.5001266032457347,
594
+ 0.9997656464576721,
595
+ 0.10759550034999843,
596
+ 1.4176189756393425,
597
+ 0.47452601760625834,
598
+ 0.1839943121373646,
599
+ 0.40895662158727647,
600
+ 0.995180070400238,
601
+ 0.5622373461723318,
602
+ 0.07441098906099738,
603
+ 0.7114433652162524,
604
+ 0.999856880903244,
605
+ 0.5974926966428754,
606
+ 1.321595377922058
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 11572,
632
+ "num_trajectories": 50
633
+ }
634
+ }
1e-4/twinvla-scratch-1e-4-aloha_lift_box/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd03a177af8c0205abb9cb9d4e871bf3cab1238e469e9b7315941232a26e68a6
3
+ size 2889536104
1e-4/twinvla-scratch-1e-4-aloha_lift_box/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
1e-4/twinvla-scratch-1e-4-aloha_shoes_table/config.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": true,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": null,
28
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "None",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "id2label": {
64
+ "0": "LABEL_0",
65
+ "1": "LABEL_1"
66
+ },
67
+ "image_size": 448,
68
+ "is_decoder": false,
69
+ "is_encoder_decoder": false,
70
+ "keep_aspect_ratio": false,
71
+ "knowledge_insulation": false,
72
+ "label2id": {
73
+ "LABEL_0": 0,
74
+ "LABEL_1": 1
75
+ },
76
+ "length_penalty": 1.0,
77
+ "llm_config": {
78
+ "_attn_implementation_autoset": true,
79
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
80
+ "add_cross_attention": false,
81
+ "architectures": [
82
+ "Qwen2ForCausalLM"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "auto_map": {
86
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
87
+ "AutoModel": "modeling_qwen2.Qwen2Model",
88
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
89
+ },
90
+ "bad_words_ids": null,
91
+ "begin_suppress_tokens": null,
92
+ "bos_token_id": 151643,
93
+ "chunk_size_feed_forward": 0,
94
+ "cross_attention_hidden_size": null,
95
+ "decoder_start_token_id": null,
96
+ "diversity_penalty": 0.0,
97
+ "do_sample": false,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": 151645,
101
+ "exponential_decay_length_penalty": null,
102
+ "finetuning_task": null,
103
+ "forced_bos_token_id": null,
104
+ "forced_eos_token_id": null,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 896,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 4864,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 32768,
122
+ "max_window_layers": 21,
123
+ "min_length": 0,
124
+ "model_type": "qwen2",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 14,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 24,
130
+ "num_key_value_heads": 2,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": null,
145
+ "rope_theta": 1000000.0,
146
+ "sep_token_id": null,
147
+ "sliding_window": 32768,
148
+ "suppress_tokens": null,
149
+ "task_specific_params": null,
150
+ "temperature": 1.0,
151
+ "tf_legacy_loss": false,
152
+ "tie_encoder_decoder": false,
153
+ "tie_word_embeddings": true,
154
+ "tokenizer_class": null,
155
+ "top_k": 50,
156
+ "top_p": 1.0,
157
+ "torch_dtype": "bfloat16",
158
+ "torchscript": false,
159
+ "transformers_version": "4.50.0.dev0",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": false,
163
+ "use_sliding_window": false,
164
+ "vocab_size": 151674
165
+ },
166
+ "loss_version": "v4",
167
+ "max_dynamic_patch": 12,
168
+ "max_length": 20,
169
+ "min_dynamic_patch": 1,
170
+ "min_length": 0,
171
+ "mlp_checkpoint": true,
172
+ "model_path": "nvidia/Eagle2-1B",
173
+ "model_type": "Eagle2_1BVLA",
174
+ "modeling": "denoising",
175
+ "no_repeat_ngram_size": 0,
176
+ "normalization": "quantile",
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_readouts": 1,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad2square": false,
185
+ "pad_token_id": null,
186
+ "pre_feature_reduction": false,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "ps_version": "v2",
191
+ "readout_token_as_eos": true,
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "return_text": null,
197
+ "select_layer": -1,
198
+ "sep_token_id": null,
199
+ "state_dim": 10,
200
+ "stopping_token": "|",
201
+ "suppress_tokens": null,
202
+ "task_specific_params": null,
203
+ "temperature": 1.0,
204
+ "template": "qwen2-chat",
205
+ "test_denoising_steps": 10,
206
+ "tf_legacy_loss": false,
207
+ "tie_encoder_decoder": false,
208
+ "tie_word_embeddings": true,
209
+ "tokenizer_class": null,
210
+ "top_k": 50,
211
+ "top_p": 1.0,
212
+ "torch_dtype": "bfloat16",
213
+ "torchscript": false,
214
+ "train_denoising_steps": 100,
215
+ "typical_p": 1.0,
216
+ "use_backbone_lora": 0,
217
+ "use_bfloat16": false,
218
+ "use_llm_lora": 0,
219
+ "use_thumbnail": true,
220
+ "vision_config": {
221
+ "_attn_implementation_autoset": true,
222
+ "_name_or_path": "",
223
+ "add_cross_attention": false,
224
+ "architectures": [
225
+ "SiglipVisionModel"
226
+ ],
227
+ "attention_dropout": 0.0,
228
+ "auto_map": {
229
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
230
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
231
+ },
232
+ "bad_words_ids": null,
233
+ "begin_suppress_tokens": null,
234
+ "bos_token_id": null,
235
+ "chunk_size_feed_forward": 0,
236
+ "cross_attention_hidden_size": null,
237
+ "decoder_start_token_id": null,
238
+ "diversity_penalty": 0.0,
239
+ "do_sample": false,
240
+ "drop_path_rate": 0.1,
241
+ "early_stopping": false,
242
+ "encoder_no_repeat_ngram_size": 0,
243
+ "eos_token_id": null,
244
+ "exponential_decay_length_penalty": null,
245
+ "finetuning_task": null,
246
+ "forced_bos_token_id": null,
247
+ "forced_eos_token_id": null,
248
+ "hidden_act": "gelu_pytorch_tanh",
249
+ "hidden_size": 1152,
250
+ "id2label": {
251
+ "0": "LABEL_0",
252
+ "1": "LABEL_1"
253
+ },
254
+ "image_size": 448,
255
+ "intermediate_size": 4304,
256
+ "is_decoder": false,
257
+ "is_encoder_decoder": false,
258
+ "label2id": {
259
+ "LABEL_0": 0,
260
+ "LABEL_1": 1
261
+ },
262
+ "layer_norm_eps": 1e-06,
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "min_length": 0,
266
+ "model_type": "siglip_vision_model",
267
+ "no_repeat_ngram_size": 0,
268
+ "num_attention_heads": 16,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_channels": 3,
272
+ "num_hidden_layers": 27,
273
+ "num_image_tokens": 1024,
274
+ "num_return_sequences": 1,
275
+ "output_attentions": false,
276
+ "output_hidden_states": false,
277
+ "output_scores": false,
278
+ "pad_token_id": null,
279
+ "patch_size": 14,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "projection_dim": 2048,
283
+ "projector_hidden_act": "gelu_fast",
284
+ "pruned_heads": {},
285
+ "remove_invalid_values": false,
286
+ "repetition_penalty": 1.0,
287
+ "return_dict": true,
288
+ "return_dict_in_generate": false,
289
+ "sep_token_id": null,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tf_legacy_loss": false,
294
+ "tie_encoder_decoder": false,
295
+ "tie_word_embeddings": true,
296
+ "tokenizer_class": null,
297
+ "top_k": 50,
298
+ "top_p": 1.0,
299
+ "torch_dtype": "bfloat16",
300
+ "torchscript": false,
301
+ "transformers_version": "4.50.0.dev0",
302
+ "typical_p": 1.0,
303
+ "use_bfloat16": false,
304
+ "vision_use_head": false
305
+ },
306
+ "vocab_size": 151674,
307
+ "vocab_start": null
308
+ },
309
+ "singlevla_config_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
310
+ "singlevla_pretrained_path": null,
311
+ "state_dim": 10,
312
+ "torch_dtype": "bfloat16",
313
+ "transformers_version": "4.50.0.dev0"
314
+ }
1e-4/twinvla-scratch-1e-4-aloha_shoes_table/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_shoes_table_new": {
3
+ "action": {
4
+ "mean": [
5
+ 0.3260679841041565,
6
+ -0.03153973072767258,
7
+ 0.17551672458648682,
8
+ 0.341669499874115,
9
+ 0.1495978981256485,
10
+ -0.7719317078590393,
11
+ -0.7661705613136292,
12
+ 0.3032568395137787,
13
+ -0.1495625078678131,
14
+ 0.11935558915138245,
15
+ 0.34630629420280457,
16
+ 0.037363290786743164,
17
+ 0.18034809827804565,
18
+ 0.3275623619556427,
19
+ 0.12991519272327423,
20
+ -0.831997811794281,
21
+ -0.7951440811157227,
22
+ 0.25535571575164795,
23
+ -0.18214371800422668,
24
+ 0.19820308685302734
25
+ ],
26
+ "std": [
27
+ 0.0646045058965683,
28
+ 0.09588947147130966,
29
+ 0.10945845395326614,
30
+ 0.3783091902732849,
31
+ 0.1452837437391281,
32
+ 0.3174603581428528,
33
+ 0.3114127218723297,
34
+ 0.417682021856308,
35
+ 0.16497297585010529,
36
+ 0.9928225874900818,
37
+ 0.07045891135931015,
38
+ 0.10143465548753738,
39
+ 0.11245165765285492,
40
+ 0.3225674331188202,
41
+ 0.15707552433013916,
42
+ 0.2342674732208252,
43
+ 0.2608211040496826,
44
+ 0.40032699704170227,
45
+ 0.2026672214269638,
46
+ 0.9801287651062012
47
+ ],
48
+ "max": [
49
+ 0.48015326261520386,
50
+ 0.179313525557518,
51
+ 0.3523038923740387,
52
+ 0.9999800324440002,
53
+ 0.7157489657402039,
54
+ 0.11180483549833298,
55
+ 0.1242646798491478,
56
+ 0.9998366832733154,
57
+ 0.285250186920166,
58
+ 1.0,
59
+ 0.49179938435554504,
60
+ 0.3557826578617096,
61
+ 0.42447179555892944,
62
+ 0.9987993836402893,
63
+ 0.5477575659751892,
64
+ 0.05208699405193329,
65
+ 0.034653306007385254,
66
+ 0.9937106966972351,
67
+ 0.3852289915084839,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.12527647614479065,
72
+ -0.31394532322883606,
73
+ -0.04988693445920944,
74
+ -0.2561202347278595,
75
+ -0.30035507678985596,
76
+ -0.999981164932251,
77
+ -0.9999915957450867,
78
+ -0.644327700138092,
79
+ -0.7897446751594543,
80
+ -1.0,
81
+ 0.17444398999214172,
82
+ -0.23238857090473175,
83
+ -0.0659869983792305,
84
+ -0.35028380155563354,
85
+ -0.3673132658004761,
86
+ -0.9999988079071045,
87
+ -0.9999988675117493,
88
+ -0.7761710286140442,
89
+ -0.9717934131622314,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.1924597442150116,
94
+ -0.23709256052970887,
95
+ -0.031008305028080944,
96
+ -0.15678457915782928,
97
+ -0.1863800033926964,
98
+ -0.9994285225868225,
99
+ -0.9997011423110962,
100
+ -0.5719999492168426,
101
+ -0.6091587543487549,
102
+ -1.0,
103
+ 0.20810787677764891,
104
+ -0.16556282192468644,
105
+ -0.017654908634722234,
106
+ -0.15096036493778228,
107
+ -0.22608168572187423,
108
+ -0.998928040266037,
109
+ -0.9990629017353058,
110
+ -0.5276546537876129,
111
+ -0.7234344184398651,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.457787150144577,
116
+ 0.1451936572790145,
117
+ 0.34190638065338136,
118
+ 0.9928598761558532,
119
+ 0.5035569965839384,
120
+ -0.05978704243898392,
121
+ -0.01781813446432354,
122
+ 0.9893046200275422,
123
+ 0.17466527372598611,
124
+ 1.0,
125
+ 0.4768679320812225,
126
+ 0.2598331540822982,
127
+ 0.39134971201419827,
128
+ 0.9723170220851898,
129
+ 0.47033962905406923,
130
+ -0.1864572197198868,
131
+ -0.060312222316861244,
132
+ 0.9807472229003906,
133
+ 0.21941211968660337,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.32339081168174744,
162
+ -0.03213495388627052,
163
+ 0.16396664083003998,
164
+ 0.3156684637069702,
165
+ 0.1496039181947708,
166
+ -0.7826846837997437,
167
+ -0.7692199945449829,
168
+ 0.30184611678123474,
169
+ -0.12293443828821182,
170
+ 0.6964511275291443,
171
+ 0.34235236048698425,
172
+ 0.03810478746891022,
173
+ 0.16778817772865295,
174
+ 0.29598966240882874,
175
+ 0.12479892373085022,
176
+ -0.8384969234466553,
177
+ -0.7965013384819031,
178
+ 0.2529187500476837,
179
+ -0.1519845873117447,
180
+ 0.7593688368797302
181
+ ],
182
+ "std": [
183
+ 0.062407124787569046,
184
+ 0.09441280364990234,
185
+ 0.10514319688081741,
186
+ 0.3824003040790558,
187
+ 0.14718197286128998,
188
+ 0.3122296929359436,
189
+ 0.3191576600074768,
190
+ 0.41190221905708313,
191
+ 0.1747966706752777,
192
+ 0.3845253884792328,
193
+ 0.06875813007354736,
194
+ 0.09827680140733719,
195
+ 0.10889745503664017,
196
+ 0.33462250232696533,
197
+ 0.15245455503463745,
198
+ 0.24192583560943604,
199
+ 0.2739580273628235,
200
+ 0.40077677369117737,
201
+ 0.20697632431983948,
202
+ 0.3062511384487152
203
+ ],
204
+ "max": [
205
+ 0.4779528081417084,
206
+ 0.17402252554893494,
207
+ 0.412266343832016,
208
+ 0.9999450445175171,
209
+ 0.6999034285545349,
210
+ 0.4170636534690857,
211
+ 0.4215781092643738,
212
+ 1.0,
213
+ 0.932714581489563,
214
+ 1.3076640367507935,
215
+ 0.48697036504745483,
216
+ 0.34565815329551697,
217
+ 0.415988564491272,
218
+ 0.9998390078544617,
219
+ 0.5593472123146057,
220
+ 0.20724913477897644,
221
+ 0.26142606139183044,
222
+ 1.0,
223
+ 0.5777683854103088,
224
+ 1.3103067874908447
225
+ ],
226
+ "min": [
227
+ 0.12448469549417496,
228
+ -0.30521926283836365,
229
+ -0.004976626019924879,
230
+ -0.21920020878314972,
231
+ -0.5096501708030701,
232
+ -1.0,
233
+ -0.9999982118606567,
234
+ -0.6262368559837341,
235
+ -0.7456304430961609,
236
+ -1.1091713905334473,
237
+ 0.1249726265668869,
238
+ -0.22351478040218353,
239
+ -0.006724653299897909,
240
+ -0.36625856161117554,
241
+ -0.4249938726425171,
242
+ -0.9999956488609314,
243
+ -0.999992847442627,
244
+ -0.77183997631073,
245
+ -0.9583328366279602,
246
+ -1.04777991771698
247
+ ],
248
+ "q01": [
249
+ 0.19817760735750198,
250
+ -0.2323527842760086,
251
+ -0.004393648169934749,
252
+ -0.14680973589420318,
253
+ -0.1899831309914589,
254
+ -0.9998269140720367,
255
+ -0.9998353064060211,
256
+ -0.49930458664894106,
257
+ -0.5963611721992492,
258
+ -1.0807034492492675,
259
+ 0.20821888744831085,
260
+ -0.13953636586666107,
261
+ -0.0033576888265088203,
262
+ -0.1788107320666313,
263
+ -0.22050866037607195,
264
+ -0.999310964345932,
265
+ -0.9993988335132599,
266
+ -0.5106797099113465,
267
+ -0.7302295982837677,
268
+ 0.05842937603592872
269
+ ],
270
+ "q99": [
271
+ 0.44812404513359066,
272
+ 0.14210240542888639,
273
+ 0.337252739071846,
274
+ 0.9943239092826842,
275
+ 0.5118523061275481,
276
+ 0.031205366365610953,
277
+ 0.04714705012738701,
278
+ 0.992770653963089,
279
+ 0.18282963484525644,
280
+ 1.1270769238471985,
281
+ 0.47021201252937317,
282
+ 0.2550090014934537,
283
+ 0.3824465185403823,
284
+ 0.995180070400238,
285
+ 0.46117363572120657,
286
+ -0.047424964234233064,
287
+ -3.7679112665500725e-06,
288
+ 0.9869830250740051,
289
+ 0.23266565054655072,
290
+ 1.1919615149497986
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 12911,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_shoes_table": {
319
+ "action": {
320
+ "mean": [
321
+ 0.3260679841041565,
322
+ -0.03153973072767258,
323
+ 0.17551672458648682,
324
+ 0.341669499874115,
325
+ 0.1495978981256485,
326
+ -0.7719317078590393,
327
+ -0.7661705613136292,
328
+ 0.3032568395137787,
329
+ -0.1495625078678131,
330
+ 0.11935558915138245,
331
+ 0.34630629420280457,
332
+ 0.037363290786743164,
333
+ 0.18034809827804565,
334
+ 0.3275623619556427,
335
+ 0.12991519272327423,
336
+ -0.831997811794281,
337
+ -0.7951440811157227,
338
+ 0.25535571575164795,
339
+ -0.18214371800422668,
340
+ 0.19820308685302734
341
+ ],
342
+ "std": [
343
+ 0.0646045058965683,
344
+ 0.09588947147130966,
345
+ 0.10945845395326614,
346
+ 0.3783091902732849,
347
+ 0.1452837437391281,
348
+ 0.3174603581428528,
349
+ 0.3114127218723297,
350
+ 0.417682021856308,
351
+ 0.16497297585010529,
352
+ 0.9928225874900818,
353
+ 0.07045891135931015,
354
+ 0.10143465548753738,
355
+ 0.11245165765285492,
356
+ 0.3225674331188202,
357
+ 0.15707552433013916,
358
+ 0.2342674732208252,
359
+ 0.2608211040496826,
360
+ 0.40032699704170227,
361
+ 0.2026672214269638,
362
+ 0.9801287651062012
363
+ ],
364
+ "max": [
365
+ 0.48015326261520386,
366
+ 0.179313525557518,
367
+ 0.3523038923740387,
368
+ 0.9999800324440002,
369
+ 0.7157489657402039,
370
+ 0.11180483549833298,
371
+ 0.1242646798491478,
372
+ 0.9998366832733154,
373
+ 0.285250186920166,
374
+ 1.0,
375
+ 0.49179938435554504,
376
+ 0.3557826578617096,
377
+ 0.42447179555892944,
378
+ 0.9987993836402893,
379
+ 0.5477575659751892,
380
+ 0.05208699405193329,
381
+ 0.034653306007385254,
382
+ 0.9937106966972351,
383
+ 0.3852289915084839,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.12527647614479065,
388
+ -0.31394532322883606,
389
+ -0.04988693445920944,
390
+ -0.2561202347278595,
391
+ -0.30035507678985596,
392
+ -0.999981164932251,
393
+ -0.9999915957450867,
394
+ -0.644327700138092,
395
+ -0.7897446751594543,
396
+ -1.0,
397
+ 0.17444398999214172,
398
+ -0.23238857090473175,
399
+ -0.0659869983792305,
400
+ -0.35028380155563354,
401
+ -0.3673132658004761,
402
+ -0.9999988079071045,
403
+ -0.9999988675117493,
404
+ -0.7761710286140442,
405
+ -0.9717934131622314,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.1924597442150116,
410
+ -0.23709256052970887,
411
+ -0.031008305028080944,
412
+ -0.15678457915782928,
413
+ -0.1863800033926964,
414
+ -0.9994285225868225,
415
+ -0.9997011423110962,
416
+ -0.5719999492168426,
417
+ -0.6091587543487549,
418
+ -1.0,
419
+ 0.20810787677764891,
420
+ -0.16556282192468644,
421
+ -0.017654908634722234,
422
+ -0.15096036493778228,
423
+ -0.22608168572187423,
424
+ -0.998928040266037,
425
+ -0.9990629017353058,
426
+ -0.5276546537876129,
427
+ -0.7234344184398651,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.457787150144577,
432
+ 0.1451936572790145,
433
+ 0.34190638065338136,
434
+ 0.9928598761558532,
435
+ 0.5035569965839384,
436
+ -0.05978704243898392,
437
+ -0.01781813446432354,
438
+ 0.9893046200275422,
439
+ 0.17466527372598611,
440
+ 1.0,
441
+ 0.4768679320812225,
442
+ 0.2598331540822982,
443
+ 0.39134971201419827,
444
+ 0.9723170220851898,
445
+ 0.47033962905406923,
446
+ -0.1864572197198868,
447
+ -0.060312222316861244,
448
+ 0.9807472229003906,
449
+ 0.21941211968660337,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.32339081168174744,
478
+ -0.03213495388627052,
479
+ 0.16396664083003998,
480
+ 0.3156684637069702,
481
+ 0.1496039181947708,
482
+ -0.7826846837997437,
483
+ -0.7692199945449829,
484
+ 0.30184611678123474,
485
+ -0.12293443828821182,
486
+ 0.6964511275291443,
487
+ 0.34235236048698425,
488
+ 0.03810478746891022,
489
+ 0.16778817772865295,
490
+ 0.29598966240882874,
491
+ 0.12479892373085022,
492
+ -0.8384969234466553,
493
+ -0.7965013384819031,
494
+ 0.2529187500476837,
495
+ -0.1519845873117447,
496
+ 0.7593688368797302
497
+ ],
498
+ "std": [
499
+ 0.062407124787569046,
500
+ 0.09441280364990234,
501
+ 0.10514319688081741,
502
+ 0.3824003040790558,
503
+ 0.14718197286128998,
504
+ 0.3122296929359436,
505
+ 0.3191576600074768,
506
+ 0.41190221905708313,
507
+ 0.1747966706752777,
508
+ 0.3845253884792328,
509
+ 0.06875813007354736,
510
+ 0.09827680140733719,
511
+ 0.10889745503664017,
512
+ 0.33462250232696533,
513
+ 0.15245455503463745,
514
+ 0.24192583560943604,
515
+ 0.2739580273628235,
516
+ 0.40077677369117737,
517
+ 0.20697632431983948,
518
+ 0.3062511384487152
519
+ ],
520
+ "max": [
521
+ 0.4779528081417084,
522
+ 0.17402252554893494,
523
+ 0.412266343832016,
524
+ 0.9999450445175171,
525
+ 0.6999034285545349,
526
+ 0.4170636534690857,
527
+ 0.4215781092643738,
528
+ 1.0,
529
+ 0.932714581489563,
530
+ 1.3076640367507935,
531
+ 0.48697036504745483,
532
+ 0.34565815329551697,
533
+ 0.415988564491272,
534
+ 0.9998390078544617,
535
+ 0.5593472123146057,
536
+ 0.20724913477897644,
537
+ 0.26142606139183044,
538
+ 1.0,
539
+ 0.5777683854103088,
540
+ 1.3103067874908447
541
+ ],
542
+ "min": [
543
+ 0.12448469549417496,
544
+ -0.30521926283836365,
545
+ -0.004976626019924879,
546
+ -0.21920020878314972,
547
+ -0.5096501708030701,
548
+ -1.0,
549
+ -0.9999982118606567,
550
+ -0.6262368559837341,
551
+ -0.7456304430961609,
552
+ -1.1091713905334473,
553
+ 0.1249726265668869,
554
+ -0.22351478040218353,
555
+ -0.006724653299897909,
556
+ -0.36625856161117554,
557
+ -0.4249938726425171,
558
+ -0.9999956488609314,
559
+ -0.999992847442627,
560
+ -0.77183997631073,
561
+ -0.9583328366279602,
562
+ -1.04777991771698
563
+ ],
564
+ "q01": [
565
+ 0.19817760735750198,
566
+ -0.2323527842760086,
567
+ -0.004393648169934749,
568
+ -0.14680973589420318,
569
+ -0.1899831309914589,
570
+ -0.9998269140720367,
571
+ -0.9998353064060211,
572
+ -0.49930458664894106,
573
+ -0.5963611721992492,
574
+ -1.0807034492492675,
575
+ 0.20821888744831085,
576
+ -0.13953636586666107,
577
+ -0.0033576888265088203,
578
+ -0.1788107320666313,
579
+ -0.22050866037607195,
580
+ -0.999310964345932,
581
+ -0.9993988335132599,
582
+ -0.5106797099113465,
583
+ -0.7302295982837677,
584
+ 0.05842937603592872
585
+ ],
586
+ "q99": [
587
+ 0.44812404513359066,
588
+ 0.14210240542888639,
589
+ 0.337252739071846,
590
+ 0.9943239092826842,
591
+ 0.5118523061275481,
592
+ 0.031205366365610953,
593
+ 0.04714705012738701,
594
+ 0.992770653963089,
595
+ 0.18282963484525644,
596
+ 1.1270769238471985,
597
+ 0.47021201252937317,
598
+ 0.2550090014934537,
599
+ 0.3824465185403823,
600
+ 0.995180070400238,
601
+ 0.46117363572120657,
602
+ -0.047424964234233064,
603
+ -3.7679112665500725e-06,
604
+ 0.9869830250740051,
605
+ 0.23266565054655072,
606
+ 1.1919615149497986
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 12911,
632
+ "num_trajectories": 50
633
+ }
634
+ }
1e-4/twinvla-scratch-1e-4-aloha_shoes_table/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03fd044ea6df68237b7ac32e1e28050faa5813813df1357668cba2b59a5e8146
3
+ size 2889536104
1e-4/twinvla-scratch-1e-4-aloha_shoes_table/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
2e-5/twinvla-aloha_shoes_table/config.json ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": false,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": "428b4d21376ff21d70b8b8830db6f6ab3907bfd8",
28
+ "_name_or_path": "jellyho/TwinVLA",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "false",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "dataset_statistics_path": null,
45
+ "decoder_start_token_id": null,
46
+ "denoiser": "FM",
47
+ "diffusion_batch": 32,
48
+ "dit_size": "DiT-B",
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": false,
51
+ "downsample_ratio": 0.5,
52
+ "dynamic_image_size": true,
53
+ "early_stopping": false,
54
+ "efficient_loss": true,
55
+ "enable_cfg": true,
56
+ "encoder_no_repeat_ngram_size": 0,
57
+ "eos_token_id": null,
58
+ "exponential_decay_length_penalty": null,
59
+ "finetuning_task": null,
60
+ "force_image_size": 448,
61
+ "forced_bos_token_id": null,
62
+ "forced_eos_token_id": null,
63
+ "global_normalization": true,
64
+ "hz_interpolate": 20,
65
+ "id2label": {
66
+ "0": "LABEL_0",
67
+ "1": "LABEL_1"
68
+ },
69
+ "image_size": 224,
70
+ "interpolate_gripper": false,
71
+ "is_decoder": false,
72
+ "is_encoder_decoder": false,
73
+ "keep_aspect_ratio": false,
74
+ "knowledge_insulation": false,
75
+ "label2id": {
76
+ "LABEL_0": 0,
77
+ "LABEL_1": 1
78
+ },
79
+ "length_penalty": 1.0,
80
+ "llm_config": {
81
+ "_attn_implementation_autoset": true,
82
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
83
+ "add_cross_attention": false,
84
+ "architectures": [
85
+ "Qwen2ForCausalLM"
86
+ ],
87
+ "attention_dropout": 0.0,
88
+ "auto_map": {
89
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
90
+ "AutoModel": "modeling_qwen2.Qwen2Model",
91
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
92
+ },
93
+ "bad_words_ids": null,
94
+ "begin_suppress_tokens": null,
95
+ "bos_token_id": 151643,
96
+ "chunk_size_feed_forward": 0,
97
+ "cross_attention_hidden_size": null,
98
+ "decoder_start_token_id": null,
99
+ "diversity_penalty": 0.0,
100
+ "do_sample": false,
101
+ "early_stopping": false,
102
+ "encoder_no_repeat_ngram_size": 0,
103
+ "eos_token_id": 151645,
104
+ "exponential_decay_length_penalty": null,
105
+ "finetuning_task": null,
106
+ "forced_bos_token_id": null,
107
+ "forced_eos_token_id": null,
108
+ "hidden_act": "silu",
109
+ "hidden_size": 896,
110
+ "id2label": {
111
+ "0": "LABEL_0",
112
+ "1": "LABEL_1"
113
+ },
114
+ "initializer_range": 0.02,
115
+ "intermediate_size": 4864,
116
+ "is_decoder": false,
117
+ "is_encoder_decoder": false,
118
+ "label2id": {
119
+ "LABEL_0": 0,
120
+ "LABEL_1": 1
121
+ },
122
+ "length_penalty": 1.0,
123
+ "max_length": 20,
124
+ "max_position_embeddings": 32768,
125
+ "max_window_layers": 21,
126
+ "min_length": 0,
127
+ "model_type": "qwen2",
128
+ "no_repeat_ngram_size": 0,
129
+ "num_attention_heads": 14,
130
+ "num_beam_groups": 1,
131
+ "num_beams": 1,
132
+ "num_hidden_layers": 24,
133
+ "num_key_value_heads": 2,
134
+ "num_return_sequences": 1,
135
+ "output_attentions": false,
136
+ "output_hidden_states": false,
137
+ "output_scores": false,
138
+ "pad_token_id": null,
139
+ "prefix": null,
140
+ "problem_type": null,
141
+ "pruned_heads": {},
142
+ "remove_invalid_values": false,
143
+ "repetition_penalty": 1.0,
144
+ "return_dict": true,
145
+ "return_dict_in_generate": false,
146
+ "rms_norm_eps": 1e-06,
147
+ "rope_scaling": null,
148
+ "rope_theta": 1000000.0,
149
+ "sep_token_id": null,
150
+ "sliding_window": 32768,
151
+ "suppress_tokens": null,
152
+ "task_specific_params": null,
153
+ "temperature": 1.0,
154
+ "tf_legacy_loss": false,
155
+ "tie_encoder_decoder": false,
156
+ "tie_word_embeddings": true,
157
+ "tokenizer_class": null,
158
+ "top_k": 50,
159
+ "top_p": 1.0,
160
+ "torch_dtype": "bfloat16",
161
+ "torchscript": false,
162
+ "transformers_version": "4.50.0.dev0",
163
+ "typical_p": 1.0,
164
+ "use_bfloat16": false,
165
+ "use_cache": false,
166
+ "use_sliding_window": false,
167
+ "vocab_size": 151674
168
+ },
169
+ "loss_version": "v4",
170
+ "max_dynamic_patch": 12,
171
+ "max_length": 20,
172
+ "min_dynamic_patch": 1,
173
+ "min_length": 0,
174
+ "mlp_checkpoint": true,
175
+ "model_path": "nvidia/Eagle2-1B",
176
+ "model_type": "Eagle2_1BVLA",
177
+ "modeling": "denoising",
178
+ "no_repeat_ngram_size": 0,
179
+ "normalization": "quantile",
180
+ "num_beam_groups": 1,
181
+ "num_beams": 1,
182
+ "num_readouts": 1,
183
+ "num_return_sequences": 1,
184
+ "output_attentions": false,
185
+ "output_hidden_states": false,
186
+ "output_scores": false,
187
+ "pad2square": false,
188
+ "pad_token_id": null,
189
+ "pre_feature_reduction": false,
190
+ "prefix": null,
191
+ "problem_type": null,
192
+ "pruned_heads": {},
193
+ "ps_version": "v2",
194
+ "readout_token_as_eos": false,
195
+ "remove_invalid_values": false,
196
+ "repetition_penalty": 1.0,
197
+ "return_dict": true,
198
+ "return_dict_in_generate": false,
199
+ "return_text": null,
200
+ "select_layer": -1,
201
+ "sep_token_id": null,
202
+ "state_dim": 10,
203
+ "stopping_token": "|",
204
+ "suppress_tokens": null,
205
+ "task_specific_params": null,
206
+ "temperature": 1.0,
207
+ "template": "qwen2-chat",
208
+ "test_denoising_steps": 10,
209
+ "tf_legacy_loss": false,
210
+ "tie_encoder_decoder": false,
211
+ "tie_word_embeddings": true,
212
+ "tokenizer_class": null,
213
+ "top_k": 50,
214
+ "top_p": 1.0,
215
+ "torch_dtype": "bfloat16",
216
+ "torchscript": false,
217
+ "train_denoising_steps": 100,
218
+ "typical_p": 1.0,
219
+ "use_backbone_lora": 0,
220
+ "use_bfloat16": false,
221
+ "use_llm_lora": 0,
222
+ "use_thumbnail": true,
223
+ "vision_config": {
224
+ "_attn_implementation_autoset": true,
225
+ "_name_or_path": "",
226
+ "add_cross_attention": false,
227
+ "architectures": [
228
+ "SiglipVisionModel"
229
+ ],
230
+ "attention_dropout": 0.0,
231
+ "auto_map": {
232
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
233
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
234
+ },
235
+ "bad_words_ids": null,
236
+ "begin_suppress_tokens": null,
237
+ "bos_token_id": null,
238
+ "chunk_size_feed_forward": 0,
239
+ "cross_attention_hidden_size": null,
240
+ "decoder_start_token_id": null,
241
+ "diversity_penalty": 0.0,
242
+ "do_sample": false,
243
+ "drop_path_rate": 0.1,
244
+ "early_stopping": false,
245
+ "encoder_no_repeat_ngram_size": 0,
246
+ "eos_token_id": null,
247
+ "exponential_decay_length_penalty": null,
248
+ "finetuning_task": null,
249
+ "forced_bos_token_id": null,
250
+ "forced_eos_token_id": null,
251
+ "hidden_act": "gelu_pytorch_tanh",
252
+ "hidden_size": 1152,
253
+ "id2label": {
254
+ "0": "LABEL_0",
255
+ "1": "LABEL_1"
256
+ },
257
+ "image_size": 448,
258
+ "intermediate_size": 4304,
259
+ "is_decoder": false,
260
+ "is_encoder_decoder": false,
261
+ "label2id": {
262
+ "LABEL_0": 0,
263
+ "LABEL_1": 1
264
+ },
265
+ "layer_norm_eps": 1e-06,
266
+ "length_penalty": 1.0,
267
+ "max_length": 20,
268
+ "min_length": 0,
269
+ "model_type": "siglip_vision_model",
270
+ "no_repeat_ngram_size": 0,
271
+ "num_attention_heads": 16,
272
+ "num_beam_groups": 1,
273
+ "num_beams": 1,
274
+ "num_channels": 3,
275
+ "num_hidden_layers": 27,
276
+ "num_image_tokens": 1024,
277
+ "num_return_sequences": 1,
278
+ "output_attentions": false,
279
+ "output_hidden_states": false,
280
+ "output_scores": false,
281
+ "pad_token_id": null,
282
+ "patch_size": 14,
283
+ "prefix": null,
284
+ "problem_type": null,
285
+ "projection_dim": 2048,
286
+ "projector_hidden_act": "gelu_fast",
287
+ "pruned_heads": {},
288
+ "remove_invalid_values": false,
289
+ "repetition_penalty": 1.0,
290
+ "return_dict": true,
291
+ "return_dict_in_generate": false,
292
+ "sep_token_id": null,
293
+ "suppress_tokens": null,
294
+ "task_specific_params": null,
295
+ "temperature": 1.0,
296
+ "tf_legacy_loss": false,
297
+ "tie_encoder_decoder": false,
298
+ "tie_word_embeddings": true,
299
+ "tokenizer_class": null,
300
+ "top_k": 50,
301
+ "top_p": 1.0,
302
+ "torch_dtype": "bfloat16",
303
+ "torchscript": false,
304
+ "transformers_version": "4.50.0.dev0",
305
+ "typical_p": 1.0,
306
+ "use_bfloat16": false,
307
+ "vision_use_head": false
308
+ },
309
+ "vocab_size": 151674,
310
+ "vocab_start": null
311
+ },
312
+ "singlevla_config_path": "jellyho/TwinVLA",
313
+ "singlevla_pretrained_path": null,
314
+ "state_dim": 10,
315
+ "torch_dtype": "bfloat16",
316
+ "transformers_version": "4.50.0.dev0"
317
+ }
2e-5/twinvla-aloha_shoes_table/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_shoes_table_new": {
3
+ "action": {
4
+ "mean": [
5
+ 0.32606810331344604,
6
+ -0.03153972327709198,
7
+ 0.17551687359809875,
8
+ 0.3416697382926941,
9
+ 0.14959800243377686,
10
+ -0.7719306349754333,
11
+ -0.7661699652671814,
12
+ 0.30325645208358765,
13
+ -0.1495625525712967,
14
+ 0.11935558915138245,
15
+ 0.3463062345981598,
16
+ 0.03736328333616257,
17
+ 0.1803482174873352,
18
+ 0.3275619447231293,
19
+ 0.1299152374267578,
20
+ -0.8319970369338989,
21
+ -0.7951449155807495,
22
+ 0.255355566740036,
23
+ -0.18214373290538788,
24
+ 0.19820308685302734
25
+ ],
26
+ "std": [
27
+ 0.06460446119308472,
28
+ 0.09588943421840668,
29
+ 0.10945848375558853,
30
+ 0.3783090114593506,
31
+ 0.1452838033437729,
32
+ 0.3174605965614319,
33
+ 0.3114127814769745,
34
+ 0.41768184304237366,
35
+ 0.16497282683849335,
36
+ 0.9928211569786072,
37
+ 0.07045891135931015,
38
+ 0.1014346107840538,
39
+ 0.11245167255401611,
40
+ 0.32256755232810974,
41
+ 0.15707549452781677,
42
+ 0.2342674434185028,
43
+ 0.26082107424736023,
44
+ 0.4003267288208008,
45
+ 0.20266719162464142,
46
+ 0.980128288269043
47
+ ],
48
+ "max": [
49
+ 0.48015326261520386,
50
+ 0.179313525557518,
51
+ 0.3523038923740387,
52
+ 0.9999800324440002,
53
+ 0.7157489657402039,
54
+ 0.11180483549833298,
55
+ 0.1242646798491478,
56
+ 0.9998366832733154,
57
+ 0.285250186920166,
58
+ 1.0,
59
+ 0.49179938435554504,
60
+ 0.3557826578617096,
61
+ 0.42447179555892944,
62
+ 0.9987993836402893,
63
+ 0.5477575659751892,
64
+ 0.05208699405193329,
65
+ 0.034653306007385254,
66
+ 0.9937106966972351,
67
+ 0.3852289915084839,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.12527647614479065,
72
+ -0.31394532322883606,
73
+ -0.04988693445920944,
74
+ -0.2561202347278595,
75
+ -0.30035507678985596,
76
+ -0.999981164932251,
77
+ -0.9999915957450867,
78
+ -0.644327700138092,
79
+ -0.7897446751594543,
80
+ -1.0,
81
+ 0.17444398999214172,
82
+ -0.23238857090473175,
83
+ -0.0659869983792305,
84
+ -0.35028380155563354,
85
+ -0.3673132658004761,
86
+ -0.9999988079071045,
87
+ -0.9999988675117493,
88
+ -0.7761710286140442,
89
+ -0.9717934131622314,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.1924597442150116,
94
+ -0.23709256052970887,
95
+ -0.031008305028080944,
96
+ -0.15678457915782928,
97
+ -0.1863800033926964,
98
+ -0.9994285225868225,
99
+ -0.9997011423110962,
100
+ -0.5719999492168426,
101
+ -0.6091587543487549,
102
+ -1.0,
103
+ 0.20810787677764891,
104
+ -0.16556282192468644,
105
+ -0.017654908634722234,
106
+ -0.15096036493778228,
107
+ -0.22608168572187423,
108
+ -0.998928040266037,
109
+ -0.9990629017353058,
110
+ -0.5276546537876129,
111
+ -0.7234344184398651,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.457787150144577,
116
+ 0.1451936572790145,
117
+ 0.34190638065338136,
118
+ 0.9928598761558532,
119
+ 0.5035569965839384,
120
+ -0.05978704243898392,
121
+ -0.01781813446432354,
122
+ 0.9893046200275422,
123
+ 0.17466527372598611,
124
+ 1.0,
125
+ 0.4768679320812225,
126
+ 0.2598331540822982,
127
+ 0.39134971201419827,
128
+ 0.9723170220851898,
129
+ 0.47033962905406923,
130
+ -0.1864572197198868,
131
+ -0.060312222316861244,
132
+ 0.9807472229003906,
133
+ 0.21941211968660337,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.3233906924724579,
162
+ -0.032134901732206345,
163
+ 0.16396647691726685,
164
+ 0.3156682550907135,
165
+ 0.1496039479970932,
166
+ -0.7826839685440063,
167
+ -0.769219696521759,
168
+ 0.3018459677696228,
169
+ -0.12293437123298645,
170
+ 0.6964512467384338,
171
+ 0.34235188364982605,
172
+ 0.03810477629303932,
173
+ 0.16778846085071564,
174
+ 0.2959897220134735,
175
+ 0.12479893863201141,
176
+ -0.8384960293769836,
177
+ -0.7965015172958374,
178
+ 0.2529186010360718,
179
+ -0.1519845426082611,
180
+ 0.7593687772750854
181
+ ],
182
+ "std": [
183
+ 0.062407124787569046,
184
+ 0.09441278874874115,
185
+ 0.10514318943023682,
186
+ 0.38240039348602295,
187
+ 0.14718197286128998,
188
+ 0.31222963333129883,
189
+ 0.31915774941444397,
190
+ 0.41190215945243835,
191
+ 0.1747966706752777,
192
+ 0.3845251202583313,
193
+ 0.06875818967819214,
194
+ 0.09827672690153122,
195
+ 0.10889741778373718,
196
+ 0.3346223533153534,
197
+ 0.15245452523231506,
198
+ 0.241925910115242,
199
+ 0.2739580273628235,
200
+ 0.4007769823074341,
201
+ 0.20697632431983948,
202
+ 0.30625119805336
203
+ ],
204
+ "max": [
205
+ 0.4779528081417084,
206
+ 0.17402252554893494,
207
+ 0.412266343832016,
208
+ 0.9999450445175171,
209
+ 0.6999034285545349,
210
+ 0.4170636534690857,
211
+ 0.4215781092643738,
212
+ 1.0,
213
+ 0.932714581489563,
214
+ 1.3076640367507935,
215
+ 0.48697036504745483,
216
+ 0.34565815329551697,
217
+ 0.415988564491272,
218
+ 0.9998390078544617,
219
+ 0.5593472123146057,
220
+ 0.20724913477897644,
221
+ 0.26142606139183044,
222
+ 1.0,
223
+ 0.5777683854103088,
224
+ 1.3103067874908447
225
+ ],
226
+ "min": [
227
+ 0.12448469549417496,
228
+ -0.30521926283836365,
229
+ -0.004976626019924879,
230
+ -0.21920020878314972,
231
+ -0.5096501708030701,
232
+ -1.0,
233
+ -0.9999982118606567,
234
+ -0.6262368559837341,
235
+ -0.7456304430961609,
236
+ -1.1091713905334473,
237
+ 0.1249726265668869,
238
+ -0.22351478040218353,
239
+ -0.006724653299897909,
240
+ -0.36625856161117554,
241
+ -0.4249938726425171,
242
+ -0.9999956488609314,
243
+ -0.999992847442627,
244
+ -0.77183997631073,
245
+ -0.9583328366279602,
246
+ -1.04777991771698
247
+ ],
248
+ "q01": [
249
+ 0.19817760735750198,
250
+ -0.2323527842760086,
251
+ -0.004393648169934749,
252
+ -0.14680973589420318,
253
+ -0.1899831309914589,
254
+ -0.9998269140720367,
255
+ -0.9998353064060211,
256
+ -0.49930458664894106,
257
+ -0.5963611721992492,
258
+ -1.0807034492492675,
259
+ 0.20821888744831085,
260
+ -0.13953636586666107,
261
+ -0.0033576888265088203,
262
+ -0.1788107320666313,
263
+ -0.22050866037607195,
264
+ -0.999310964345932,
265
+ -0.9993988335132599,
266
+ -0.5106797099113465,
267
+ -0.7302295982837677,
268
+ 0.05842937603592872
269
+ ],
270
+ "q99": [
271
+ 0.44812404513359066,
272
+ 0.14210240542888639,
273
+ 0.337252739071846,
274
+ 0.9943239092826842,
275
+ 0.5118523061275481,
276
+ 0.031205366365610953,
277
+ 0.04714705012738701,
278
+ 0.992770653963089,
279
+ 0.18282963484525644,
280
+ 1.1270769238471985,
281
+ 0.47021201252937317,
282
+ 0.2550090014934537,
283
+ 0.3824465185403823,
284
+ 0.995180070400238,
285
+ 0.46117363572120657,
286
+ -0.047424964234233064,
287
+ -3.7679112665500725e-06,
288
+ 0.9869830250740051,
289
+ 0.23266565054655072,
290
+ 1.1919615149497986
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 12911,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_shoes_table": {
319
+ "action": {
320
+ "mean": [
321
+ 0.32606810331344604,
322
+ -0.03153972327709198,
323
+ 0.17551687359809875,
324
+ 0.3416697382926941,
325
+ 0.14959800243377686,
326
+ -0.7719306349754333,
327
+ -0.7661699652671814,
328
+ 0.30325645208358765,
329
+ -0.1495625525712967,
330
+ 0.11935558915138245,
331
+ 0.3463062345981598,
332
+ 0.03736328333616257,
333
+ 0.1803482174873352,
334
+ 0.3275619447231293,
335
+ 0.1299152374267578,
336
+ -0.8319970369338989,
337
+ -0.7951449155807495,
338
+ 0.255355566740036,
339
+ -0.18214373290538788,
340
+ 0.19820308685302734
341
+ ],
342
+ "std": [
343
+ 0.06460446119308472,
344
+ 0.09588943421840668,
345
+ 0.10945848375558853,
346
+ 0.3783090114593506,
347
+ 0.1452838033437729,
348
+ 0.3174605965614319,
349
+ 0.3114127814769745,
350
+ 0.41768184304237366,
351
+ 0.16497282683849335,
352
+ 0.9928211569786072,
353
+ 0.07045891135931015,
354
+ 0.1014346107840538,
355
+ 0.11245167255401611,
356
+ 0.32256755232810974,
357
+ 0.15707549452781677,
358
+ 0.2342674434185028,
359
+ 0.26082107424736023,
360
+ 0.4003267288208008,
361
+ 0.20266719162464142,
362
+ 0.980128288269043
363
+ ],
364
+ "max": [
365
+ 0.48015326261520386,
366
+ 0.179313525557518,
367
+ 0.3523038923740387,
368
+ 0.9999800324440002,
369
+ 0.7157489657402039,
370
+ 0.11180483549833298,
371
+ 0.1242646798491478,
372
+ 0.9998366832733154,
373
+ 0.285250186920166,
374
+ 1.0,
375
+ 0.49179938435554504,
376
+ 0.3557826578617096,
377
+ 0.42447179555892944,
378
+ 0.9987993836402893,
379
+ 0.5477575659751892,
380
+ 0.05208699405193329,
381
+ 0.034653306007385254,
382
+ 0.9937106966972351,
383
+ 0.3852289915084839,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.12527647614479065,
388
+ -0.31394532322883606,
389
+ -0.04988693445920944,
390
+ -0.2561202347278595,
391
+ -0.30035507678985596,
392
+ -0.999981164932251,
393
+ -0.9999915957450867,
394
+ -0.644327700138092,
395
+ -0.7897446751594543,
396
+ -1.0,
397
+ 0.17444398999214172,
398
+ -0.23238857090473175,
399
+ -0.0659869983792305,
400
+ -0.35028380155563354,
401
+ -0.3673132658004761,
402
+ -0.9999988079071045,
403
+ -0.9999988675117493,
404
+ -0.7761710286140442,
405
+ -0.9717934131622314,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.1924597442150116,
410
+ -0.23709256052970887,
411
+ -0.031008305028080944,
412
+ -0.15678457915782928,
413
+ -0.1863800033926964,
414
+ -0.9994285225868225,
415
+ -0.9997011423110962,
416
+ -0.5719999492168426,
417
+ -0.6091587543487549,
418
+ -1.0,
419
+ 0.20810787677764891,
420
+ -0.16556282192468644,
421
+ -0.017654908634722234,
422
+ -0.15096036493778228,
423
+ -0.22608168572187423,
424
+ -0.998928040266037,
425
+ -0.9990629017353058,
426
+ -0.5276546537876129,
427
+ -0.7234344184398651,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.457787150144577,
432
+ 0.1451936572790145,
433
+ 0.34190638065338136,
434
+ 0.9928598761558532,
435
+ 0.5035569965839384,
436
+ -0.05978704243898392,
437
+ -0.01781813446432354,
438
+ 0.9893046200275422,
439
+ 0.17466527372598611,
440
+ 1.0,
441
+ 0.4768679320812225,
442
+ 0.2598331540822982,
443
+ 0.39134971201419827,
444
+ 0.9723170220851898,
445
+ 0.47033962905406923,
446
+ -0.1864572197198868,
447
+ -0.060312222316861244,
448
+ 0.9807472229003906,
449
+ 0.21941211968660337,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.3233906924724579,
478
+ -0.032134901732206345,
479
+ 0.16396647691726685,
480
+ 0.3156682550907135,
481
+ 0.1496039479970932,
482
+ -0.7826839685440063,
483
+ -0.769219696521759,
484
+ 0.3018459677696228,
485
+ -0.12293437123298645,
486
+ 0.6964512467384338,
487
+ 0.34235188364982605,
488
+ 0.03810477629303932,
489
+ 0.16778846085071564,
490
+ 0.2959897220134735,
491
+ 0.12479893863201141,
492
+ -0.8384960293769836,
493
+ -0.7965015172958374,
494
+ 0.2529186010360718,
495
+ -0.1519845426082611,
496
+ 0.7593687772750854
497
+ ],
498
+ "std": [
499
+ 0.062407124787569046,
500
+ 0.09441278874874115,
501
+ 0.10514318943023682,
502
+ 0.38240039348602295,
503
+ 0.14718197286128998,
504
+ 0.31222963333129883,
505
+ 0.31915774941444397,
506
+ 0.41190215945243835,
507
+ 0.1747966706752777,
508
+ 0.3845251202583313,
509
+ 0.06875818967819214,
510
+ 0.09827672690153122,
511
+ 0.10889741778373718,
512
+ 0.3346223533153534,
513
+ 0.15245452523231506,
514
+ 0.241925910115242,
515
+ 0.2739580273628235,
516
+ 0.4007769823074341,
517
+ 0.20697632431983948,
518
+ 0.30625119805336
519
+ ],
520
+ "max": [
521
+ 0.4779528081417084,
522
+ 0.17402252554893494,
523
+ 0.412266343832016,
524
+ 0.9999450445175171,
525
+ 0.6999034285545349,
526
+ 0.4170636534690857,
527
+ 0.4215781092643738,
528
+ 1.0,
529
+ 0.932714581489563,
530
+ 1.3076640367507935,
531
+ 0.48697036504745483,
532
+ 0.34565815329551697,
533
+ 0.415988564491272,
534
+ 0.9998390078544617,
535
+ 0.5593472123146057,
536
+ 0.20724913477897644,
537
+ 0.26142606139183044,
538
+ 1.0,
539
+ 0.5777683854103088,
540
+ 1.3103067874908447
541
+ ],
542
+ "min": [
543
+ 0.12448469549417496,
544
+ -0.30521926283836365,
545
+ -0.004976626019924879,
546
+ -0.21920020878314972,
547
+ -0.5096501708030701,
548
+ -1.0,
549
+ -0.9999982118606567,
550
+ -0.6262368559837341,
551
+ -0.7456304430961609,
552
+ -1.1091713905334473,
553
+ 0.1249726265668869,
554
+ -0.22351478040218353,
555
+ -0.006724653299897909,
556
+ -0.36625856161117554,
557
+ -0.4249938726425171,
558
+ -0.9999956488609314,
559
+ -0.999992847442627,
560
+ -0.77183997631073,
561
+ -0.9583328366279602,
562
+ -1.04777991771698
563
+ ],
564
+ "q01": [
565
+ 0.19817760735750198,
566
+ -0.2323527842760086,
567
+ -0.004393648169934749,
568
+ -0.14680973589420318,
569
+ -0.1899831309914589,
570
+ -0.9998269140720367,
571
+ -0.9998353064060211,
572
+ -0.49930458664894106,
573
+ -0.5963611721992492,
574
+ -1.0807034492492675,
575
+ 0.20821888744831085,
576
+ -0.13953636586666107,
577
+ -0.0033576888265088203,
578
+ -0.1788107320666313,
579
+ -0.22050866037607195,
580
+ -0.999310964345932,
581
+ -0.9993988335132599,
582
+ -0.5106797099113465,
583
+ -0.7302295982837677,
584
+ 0.05842937603592872
585
+ ],
586
+ "q99": [
587
+ 0.44812404513359066,
588
+ 0.14210240542888639,
589
+ 0.337252739071846,
590
+ 0.9943239092826842,
591
+ 0.5118523061275481,
592
+ 0.031205366365610953,
593
+ 0.04714705012738701,
594
+ 0.992770653963089,
595
+ 0.18282963484525644,
596
+ 1.1270769238471985,
597
+ 0.47021201252937317,
598
+ 0.2550090014934537,
599
+ 0.3824465185403823,
600
+ 0.995180070400238,
601
+ 0.46117363572120657,
602
+ -0.047424964234233064,
603
+ -3.7679112665500725e-06,
604
+ 0.9869830250740051,
605
+ 0.23266565054655072,
606
+ 1.1919615149497986
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 12911,
632
+ "num_trajectories": 50
633
+ }
634
+ }
2e-5/twinvla-aloha_shoes_table/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a57f9a891e27b6f51bc90a4252de9c8ac6dbdd21c49456fd501405bfdd1589fa
3
+ size 2889539864
2e-5/twinvla-aloha_shoes_table/singlevla_config/config.json ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "428b4d21376ff21d70b8b8830db6f6ab3907bfd8",
3
+ "_name_or_path": "jellyho/TwinVLA",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "false",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "dataset_statistics_path": null,
14
+ "denoiser": "FM",
15
+ "diffusion_batch": 32,
16
+ "dit_size": "DiT-B",
17
+ "downsample_ratio": 0.5,
18
+ "dynamic_image_size": true,
19
+ "efficient_loss": true,
20
+ "enable_cfg": true,
21
+ "force_image_size": 448,
22
+ "global_normalization": true,
23
+ "hz_interpolate": 20,
24
+ "image_size": 224,
25
+ "interpolate_gripper": false,
26
+ "keep_aspect_ratio": false,
27
+ "knowledge_insulation": false,
28
+ "llm_config": {
29
+ "_attn_implementation_autoset": true,
30
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
31
+ "add_cross_attention": false,
32
+ "architectures": [
33
+ "Qwen2ForCausalLM"
34
+ ],
35
+ "attention_dropout": 0.0,
36
+ "auto_map": {
37
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
38
+ "AutoModel": "modeling_qwen2.Qwen2Model",
39
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
40
+ },
41
+ "bad_words_ids": null,
42
+ "begin_suppress_tokens": null,
43
+ "bos_token_id": 151643,
44
+ "chunk_size_feed_forward": 0,
45
+ "cross_attention_hidden_size": null,
46
+ "decoder_start_token_id": null,
47
+ "diversity_penalty": 0.0,
48
+ "do_sample": false,
49
+ "early_stopping": false,
50
+ "encoder_no_repeat_ngram_size": 0,
51
+ "eos_token_id": 151645,
52
+ "exponential_decay_length_penalty": null,
53
+ "finetuning_task": null,
54
+ "forced_bos_token_id": null,
55
+ "forced_eos_token_id": null,
56
+ "hidden_act": "silu",
57
+ "hidden_size": 896,
58
+ "id2label": {
59
+ "0": "LABEL_0",
60
+ "1": "LABEL_1"
61
+ },
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 4864,
64
+ "is_decoder": false,
65
+ "is_encoder_decoder": false,
66
+ "label2id": {
67
+ "LABEL_0": 0,
68
+ "LABEL_1": 1
69
+ },
70
+ "length_penalty": 1.0,
71
+ "max_length": 20,
72
+ "max_position_embeddings": 32768,
73
+ "max_window_layers": 21,
74
+ "min_length": 0,
75
+ "model_type": "qwen2",
76
+ "no_repeat_ngram_size": 0,
77
+ "num_attention_heads": 14,
78
+ "num_beam_groups": 1,
79
+ "num_beams": 1,
80
+ "num_hidden_layers": 24,
81
+ "num_key_value_heads": 2,
82
+ "num_return_sequences": 1,
83
+ "output_attentions": false,
84
+ "output_hidden_states": false,
85
+ "output_scores": false,
86
+ "pad_token_id": null,
87
+ "prefix": null,
88
+ "problem_type": null,
89
+ "pruned_heads": {},
90
+ "remove_invalid_values": false,
91
+ "repetition_penalty": 1.0,
92
+ "return_dict": true,
93
+ "return_dict_in_generate": false,
94
+ "rms_norm_eps": 1e-06,
95
+ "rope_scaling": null,
96
+ "rope_theta": 1000000.0,
97
+ "sep_token_id": null,
98
+ "sliding_window": 32768,
99
+ "suppress_tokens": null,
100
+ "task_specific_params": null,
101
+ "temperature": 1.0,
102
+ "tf_legacy_loss": false,
103
+ "tie_encoder_decoder": false,
104
+ "tie_word_embeddings": true,
105
+ "tokenizer_class": null,
106
+ "top_k": 50,
107
+ "top_p": 1.0,
108
+ "torch_dtype": "bfloat16",
109
+ "torchscript": false,
110
+ "transformers_version": "4.50.0.dev0",
111
+ "typical_p": 1.0,
112
+ "use_bfloat16": false,
113
+ "use_cache": false,
114
+ "use_sliding_window": false,
115
+ "vocab_size": 151674
116
+ },
117
+ "loss_version": "v4",
118
+ "max_dynamic_patch": 12,
119
+ "min_dynamic_patch": 1,
120
+ "mlp_checkpoint": true,
121
+ "model_path": "nvidia/Eagle2-1B",
122
+ "model_type": "Eagle2_1BVLA",
123
+ "modeling": "denoising",
124
+ "normalization": "quantile",
125
+ "num_readouts": 1,
126
+ "pad2square": false,
127
+ "pre_feature_reduction": false,
128
+ "ps_version": "v2",
129
+ "readout_token_as_eos": false,
130
+ "return_text": null,
131
+ "select_layer": -1,
132
+ "state_dim": 10,
133
+ "stopping_token": "|",
134
+ "template": "qwen2-chat",
135
+ "test_denoising_steps": 10,
136
+ "torch_dtype": "bfloat16",
137
+ "train_denoising_steps": 100,
138
+ "transformers_version": null,
139
+ "use_backbone_lora": 0,
140
+ "use_llm_lora": 0,
141
+ "use_thumbnail": true,
142
+ "vision_config": {
143
+ "_attn_implementation_autoset": true,
144
+ "_name_or_path": "",
145
+ "add_cross_attention": false,
146
+ "architectures": [
147
+ "SiglipVisionModel"
148
+ ],
149
+ "attention_dropout": 0.0,
150
+ "auto_map": {
151
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
152
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
153
+ },
154
+ "bad_words_ids": null,
155
+ "begin_suppress_tokens": null,
156
+ "bos_token_id": null,
157
+ "chunk_size_feed_forward": 0,
158
+ "cross_attention_hidden_size": null,
159
+ "decoder_start_token_id": null,
160
+ "diversity_penalty": 0.0,
161
+ "do_sample": false,
162
+ "drop_path_rate": 0.1,
163
+ "early_stopping": false,
164
+ "encoder_no_repeat_ngram_size": 0,
165
+ "eos_token_id": null,
166
+ "exponential_decay_length_penalty": null,
167
+ "finetuning_task": null,
168
+ "forced_bos_token_id": null,
169
+ "forced_eos_token_id": null,
170
+ "hidden_act": "gelu_pytorch_tanh",
171
+ "hidden_size": 1152,
172
+ "id2label": {
173
+ "0": "LABEL_0",
174
+ "1": "LABEL_1"
175
+ },
176
+ "image_size": 448,
177
+ "intermediate_size": 4304,
178
+ "is_decoder": false,
179
+ "is_encoder_decoder": false,
180
+ "label2id": {
181
+ "LABEL_0": 0,
182
+ "LABEL_1": 1
183
+ },
184
+ "layer_norm_eps": 1e-06,
185
+ "length_penalty": 1.0,
186
+ "max_length": 20,
187
+ "min_length": 0,
188
+ "model_type": "siglip_vision_model",
189
+ "no_repeat_ngram_size": 0,
190
+ "num_attention_heads": 16,
191
+ "num_beam_groups": 1,
192
+ "num_beams": 1,
193
+ "num_channels": 3,
194
+ "num_hidden_layers": 27,
195
+ "num_image_tokens": 1024,
196
+ "num_return_sequences": 1,
197
+ "output_attentions": false,
198
+ "output_hidden_states": false,
199
+ "output_scores": false,
200
+ "pad_token_id": null,
201
+ "patch_size": 14,
202
+ "prefix": null,
203
+ "problem_type": null,
204
+ "projection_dim": 2048,
205
+ "projector_hidden_act": "gelu_fast",
206
+ "pruned_heads": {},
207
+ "remove_invalid_values": false,
208
+ "repetition_penalty": 1.0,
209
+ "return_dict": true,
210
+ "return_dict_in_generate": false,
211
+ "sep_token_id": null,
212
+ "suppress_tokens": null,
213
+ "task_specific_params": null,
214
+ "temperature": 1.0,
215
+ "tf_legacy_loss": false,
216
+ "tie_encoder_decoder": false,
217
+ "tie_word_embeddings": true,
218
+ "tokenizer_class": null,
219
+ "top_k": 50,
220
+ "top_p": 1.0,
221
+ "torch_dtype": "bfloat16",
222
+ "torchscript": false,
223
+ "transformers_version": "4.50.0.dev0",
224
+ "typical_p": 1.0,
225
+ "use_bfloat16": false,
226
+ "vision_use_head": false
227
+ },
228
+ "vocab_size": 151674,
229
+ "vocab_start": null
230
+ }
2e-5/twinvla-scratch-aloha_dish_drainer/config.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": true,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": null,
28
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "None",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "id2label": {
64
+ "0": "LABEL_0",
65
+ "1": "LABEL_1"
66
+ },
67
+ "image_size": 448,
68
+ "is_decoder": false,
69
+ "is_encoder_decoder": false,
70
+ "keep_aspect_ratio": false,
71
+ "knowledge_insulation": false,
72
+ "label2id": {
73
+ "LABEL_0": 0,
74
+ "LABEL_1": 1
75
+ },
76
+ "length_penalty": 1.0,
77
+ "llm_config": {
78
+ "_attn_implementation_autoset": true,
79
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
80
+ "add_cross_attention": false,
81
+ "architectures": [
82
+ "Qwen2ForCausalLM"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "auto_map": {
86
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
87
+ "AutoModel": "modeling_qwen2.Qwen2Model",
88
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
89
+ },
90
+ "bad_words_ids": null,
91
+ "begin_suppress_tokens": null,
92
+ "bos_token_id": 151643,
93
+ "chunk_size_feed_forward": 0,
94
+ "cross_attention_hidden_size": null,
95
+ "decoder_start_token_id": null,
96
+ "diversity_penalty": 0.0,
97
+ "do_sample": false,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": 151645,
101
+ "exponential_decay_length_penalty": null,
102
+ "finetuning_task": null,
103
+ "forced_bos_token_id": null,
104
+ "forced_eos_token_id": null,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 896,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 4864,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 32768,
122
+ "max_window_layers": 21,
123
+ "min_length": 0,
124
+ "model_type": "qwen2",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 14,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 24,
130
+ "num_key_value_heads": 2,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": null,
145
+ "rope_theta": 1000000.0,
146
+ "sep_token_id": null,
147
+ "sliding_window": 32768,
148
+ "suppress_tokens": null,
149
+ "task_specific_params": null,
150
+ "temperature": 1.0,
151
+ "tf_legacy_loss": false,
152
+ "tie_encoder_decoder": false,
153
+ "tie_word_embeddings": true,
154
+ "tokenizer_class": null,
155
+ "top_k": 50,
156
+ "top_p": 1.0,
157
+ "torch_dtype": "bfloat16",
158
+ "torchscript": false,
159
+ "transformers_version": "4.50.0.dev0",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": false,
163
+ "use_sliding_window": false,
164
+ "vocab_size": 151674
165
+ },
166
+ "loss_version": "v4",
167
+ "max_dynamic_patch": 12,
168
+ "max_length": 20,
169
+ "min_dynamic_patch": 1,
170
+ "min_length": 0,
171
+ "mlp_checkpoint": true,
172
+ "model_path": "nvidia/Eagle2-1B",
173
+ "model_type": "Eagle2_1BVLA",
174
+ "modeling": "denoising",
175
+ "no_repeat_ngram_size": 0,
176
+ "normalization": "quantile",
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_readouts": 1,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad2square": false,
185
+ "pad_token_id": null,
186
+ "pre_feature_reduction": false,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "ps_version": "v2",
191
+ "readout_token_as_eos": true,
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "return_text": null,
197
+ "select_layer": -1,
198
+ "sep_token_id": null,
199
+ "state_dim": 10,
200
+ "stopping_token": "|",
201
+ "suppress_tokens": null,
202
+ "task_specific_params": null,
203
+ "temperature": 1.0,
204
+ "template": "qwen2-chat",
205
+ "test_denoising_steps": 10,
206
+ "tf_legacy_loss": false,
207
+ "tie_encoder_decoder": false,
208
+ "tie_word_embeddings": true,
209
+ "tokenizer_class": null,
210
+ "top_k": 50,
211
+ "top_p": 1.0,
212
+ "torch_dtype": "bfloat16",
213
+ "torchscript": false,
214
+ "train_denoising_steps": 100,
215
+ "typical_p": 1.0,
216
+ "use_backbone_lora": 0,
217
+ "use_bfloat16": false,
218
+ "use_llm_lora": 0,
219
+ "use_thumbnail": true,
220
+ "vision_config": {
221
+ "_attn_implementation_autoset": true,
222
+ "_name_or_path": "",
223
+ "add_cross_attention": false,
224
+ "architectures": [
225
+ "SiglipVisionModel"
226
+ ],
227
+ "attention_dropout": 0.0,
228
+ "auto_map": {
229
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
230
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
231
+ },
232
+ "bad_words_ids": null,
233
+ "begin_suppress_tokens": null,
234
+ "bos_token_id": null,
235
+ "chunk_size_feed_forward": 0,
236
+ "cross_attention_hidden_size": null,
237
+ "decoder_start_token_id": null,
238
+ "diversity_penalty": 0.0,
239
+ "do_sample": false,
240
+ "drop_path_rate": 0.1,
241
+ "early_stopping": false,
242
+ "encoder_no_repeat_ngram_size": 0,
243
+ "eos_token_id": null,
244
+ "exponential_decay_length_penalty": null,
245
+ "finetuning_task": null,
246
+ "forced_bos_token_id": null,
247
+ "forced_eos_token_id": null,
248
+ "hidden_act": "gelu_pytorch_tanh",
249
+ "hidden_size": 1152,
250
+ "id2label": {
251
+ "0": "LABEL_0",
252
+ "1": "LABEL_1"
253
+ },
254
+ "image_size": 448,
255
+ "intermediate_size": 4304,
256
+ "is_decoder": false,
257
+ "is_encoder_decoder": false,
258
+ "label2id": {
259
+ "LABEL_0": 0,
260
+ "LABEL_1": 1
261
+ },
262
+ "layer_norm_eps": 1e-06,
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "min_length": 0,
266
+ "model_type": "siglip_vision_model",
267
+ "no_repeat_ngram_size": 0,
268
+ "num_attention_heads": 16,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_channels": 3,
272
+ "num_hidden_layers": 27,
273
+ "num_image_tokens": 1024,
274
+ "num_return_sequences": 1,
275
+ "output_attentions": false,
276
+ "output_hidden_states": false,
277
+ "output_scores": false,
278
+ "pad_token_id": null,
279
+ "patch_size": 14,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "projection_dim": 2048,
283
+ "projector_hidden_act": "gelu_fast",
284
+ "pruned_heads": {},
285
+ "remove_invalid_values": false,
286
+ "repetition_penalty": 1.0,
287
+ "return_dict": true,
288
+ "return_dict_in_generate": false,
289
+ "sep_token_id": null,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tf_legacy_loss": false,
294
+ "tie_encoder_decoder": false,
295
+ "tie_word_embeddings": true,
296
+ "tokenizer_class": null,
297
+ "top_k": 50,
298
+ "top_p": 1.0,
299
+ "torch_dtype": "bfloat16",
300
+ "torchscript": false,
301
+ "transformers_version": "4.50.0.dev0",
302
+ "typical_p": 1.0,
303
+ "use_bfloat16": false,
304
+ "vision_use_head": false
305
+ },
306
+ "vocab_size": 151674,
307
+ "vocab_start": null
308
+ },
309
+ "singlevla_config_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
310
+ "singlevla_pretrained_path": null,
311
+ "state_dim": 10,
312
+ "torch_dtype": "bfloat16",
313
+ "transformers_version": "4.50.0.dev0"
314
+ }
2e-5/twinvla-scratch-aloha_dish_drainer/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_dish_drainer_new": {
3
+ "action": {
4
+ "mean": [
5
+ 0.40166160464286804,
6
+ -0.011121422052383423,
7
+ 0.18381044268608093,
8
+ 0.7193055748939514,
9
+ 0.25373342633247375,
10
+ -0.5631368160247803,
11
+ -0.1440804898738861,
12
+ 0.8170215487480164,
13
+ 0.1759030520915985,
14
+ 0.3052484393119812,
15
+ 0.27496522665023804,
16
+ 0.07536163926124573,
17
+ 0.11210401356220245,
18
+ 0.5866131782531738,
19
+ 0.166164368391037,
20
+ -0.6815540790557861,
21
+ -0.029566073790192604,
22
+ 0.9651414752006531,
23
+ 0.16927561163902283,
24
+ -0.015535339713096619
25
+ ],
26
+ "std": [
27
+ 0.11050388216972351,
28
+ 0.09560802578926086,
29
+ 0.07149423658847809,
30
+ 0.16429585218429565,
31
+ 0.23607663810253143,
32
+ 0.13553042709827423,
33
+ 0.4136315882205963,
34
+ 0.16760703921318054,
35
+ 0.28564009070396423,
36
+ 0.9522888660430908,
37
+ 0.031309936195611954,
38
+ 0.04574710130691528,
39
+ 0.0856705829501152,
40
+ 0.29802361130714417,
41
+ 0.15602004528045654,
42
+ 0.22492949664592743,
43
+ 0.10802315920591354,
44
+ 0.041615039110183716,
45
+ 0.15993009507656097,
46
+ 0.9998806715011597
47
+ ],
48
+ "max": [
49
+ 0.6568294763565063,
50
+ 0.20922525227069855,
51
+ 0.329291433095932,
52
+ 0.9988790154457092,
53
+ 0.8221861720085144,
54
+ -0.02126980759203434,
55
+ 0.554952085018158,
56
+ 0.9999961256980896,
57
+ 0.8352594971656799,
58
+ 1.0,
59
+ 0.3725535273551941,
60
+ 0.20133008062839508,
61
+ 0.2683204710483551,
62
+ 0.9969081878662109,
63
+ 0.5947288274765015,
64
+ 0.135818213224411,
65
+ 0.297533243894577,
66
+ 0.9999833106994629,
67
+ 0.6284497380256653,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1679762601852417,
72
+ -0.2037276178598404,
73
+ 0.026118876412510872,
74
+ 0.06734701991081238,
75
+ -0.3303077816963196,
76
+ -0.865761399269104,
77
+ -0.9697803854942322,
78
+ 0.24385260045528412,
79
+ -0.3337814211845398,
80
+ -1.0,
81
+ 0.17690593004226685,
82
+ -0.019342761486768723,
83
+ -0.045900676399469376,
84
+ -0.08388058096170425,
85
+ -0.1825810670852661,
86
+ -0.9999706149101257,
87
+ -0.4282298684120178,
88
+ 0.7756603956222534,
89
+ -0.19046637415885925,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.22041156470775605,
94
+ -0.17958899974822998,
95
+ 0.04473079532384872,
96
+ 0.2284090793132782,
97
+ -0.2088965356349945,
98
+ -0.811203727722168,
99
+ -0.9306126594543457,
100
+ 0.3530711317062378,
101
+ -0.2207678198814392,
102
+ -1.0,
103
+ 0.2055837804079056,
104
+ 0.005079864375293255,
105
+ -0.04285515695810318,
106
+ 0.023393160849809646,
107
+ -0.12909780085086822,
108
+ -0.9969730639457702,
109
+ -0.31871861577033994,
110
+ 0.8128526282310485,
111
+ -0.12555764615535736,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.6256548881530761,
116
+ 0.16506216526031484,
117
+ 0.3053938007354736,
118
+ 0.9577780866622918,
119
+ 0.7322160029411315,
120
+ -0.22335838973522296,
121
+ 0.43161325573921194,
122
+ 0.9983374524116516,
123
+ 0.7683744573593138,
124
+ 1.0,
125
+ 0.344862767457962,
126
+ 0.19341405749320983,
127
+ 0.24164194464683514,
128
+ 0.9684402346611023,
129
+ 0.5674381494522094,
130
+ -0.24195577383041442,
131
+ 0.2379095745086669,
132
+ 0.9997554516792297,
133
+ 0.564831252098083,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.3995276093482971,
162
+ -0.01743783988058567,
163
+ 0.17103439569473267,
164
+ 0.7105359435081482,
165
+ 0.238600954413414,
166
+ -0.5759879946708679,
167
+ -0.11663730442523956,
168
+ 0.8159795999526978,
169
+ 0.18427881598472595,
170
+ 0.32329148054122925,
171
+ 0.26429733633995056,
172
+ 0.056828975677490234,
173
+ 0.10836688429117203,
174
+ 0.5435153841972351,
175
+ 0.10053253918886185,
176
+ -0.7011978030204773,
177
+ -0.03383757919073105,
178
+ 0.9509052038192749,
179
+ 0.0682743564248085,
180
+ -0.11205186694860458
181
+ ],
182
+ "std": [
183
+ 0.10546877980232239,
184
+ 0.09407489001750946,
185
+ 0.07594858855009079,
186
+ 0.15545178949832916,
187
+ 0.23370550572872162,
188
+ 0.1663108617067337,
189
+ 0.417312353849411,
190
+ 0.1589633673429489,
191
+ 0.29529041051864624,
192
+ 0.8386000394821167,
193
+ 0.03193666785955429,
194
+ 0.03702627867460251,
195
+ 0.08499232679605484,
196
+ 0.33746665716171265,
197
+ 0.13817641139030457,
198
+ 0.2642515003681183,
199
+ 0.13742685317993164,
200
+ 0.10328594595193863,
201
+ 0.24581077694892883,
202
+ 0.9880411624908447
203
+ ],
204
+ "max": [
205
+ 0.6216338276863098,
206
+ 0.1681845635175705,
207
+ 0.3582729399204254,
208
+ 0.9998778104782104,
209
+ 0.7569742202758789,
210
+ 0.29317960143089294,
211
+ 0.5474420785903931,
212
+ 1.0,
213
+ 0.9644882678985596,
214
+ 1.2399240732192993,
215
+ 0.36810019612312317,
216
+ 0.15229015052318573,
217
+ 0.3755773603916168,
218
+ 0.9999530911445618,
219
+ 0.47173869609832764,
220
+ 0.4396477937698364,
221
+ 0.5856077671051025,
222
+ 1.0,
223
+ 0.9141661524772644,
224
+ 1.0335123538970947
225
+ ],
226
+ "min": [
227
+ 0.17779576778411865,
228
+ -0.2223799079656601,
229
+ 0.009585360996425152,
230
+ 0.27525120973587036,
231
+ -0.3401731848716736,
232
+ -0.8740139603614807,
233
+ -0.922980010509491,
234
+ 0.20966650545597076,
235
+ -0.5117865800857544,
236
+ -1.04777991771698,
237
+ 0.13721425831317902,
238
+ -0.11607959121465683,
239
+ -0.006126723252236843,
240
+ -0.12117788940668106,
241
+ -0.5865428447723389,
242
+ -0.9999897480010986,
243
+ -0.48856121301651,
244
+ -0.09543908387422562,
245
+ -0.9954046607017517,
246
+ -1.1056499481201172
247
+ ],
248
+ "q01": [
249
+ 0.22190109610557557,
250
+ -0.19557971894741058,
251
+ 0.02071425139904022,
252
+ 0.33727509021759033,
253
+ -0.20722176849842072,
254
+ -0.8324707460403442,
255
+ -0.8625615048408508,
256
+ 0.48607451796531675,
257
+ -0.30660848736763,
258
+ -0.9315443730354309,
259
+ 0.19041878879070281,
260
+ -0.04380948930978775,
261
+ -0.0050327684171497826,
262
+ -0.05638677150011063,
263
+ -0.26807846426963805,
264
+ -0.9989835453033448,
265
+ -0.329305921792984,
266
+ 0.4013799297809601,
267
+ -0.8769975972175598,
268
+ -1.088063154220581
269
+ ],
270
+ "q99": [
271
+ 0.6066525983810425,
272
+ 0.13703446269035335,
273
+ 0.3049686551094055,
274
+ 0.995180070400238,
275
+ 0.6961594343185422,
276
+ 0.0980641171336174,
277
+ 0.463529108762741,
278
+ 0.9997917461395264,
279
+ 0.7787499904632564,
280
+ 1.0201601552963255,
281
+ 0.3442830562591551,
282
+ 0.1231292974948883,
283
+ 0.30282683849334713,
284
+ 0.9978944087028503,
285
+ 0.41937308669090234,
286
+ 0.0980641171336174,
287
+ 0.27175513744354135,
288
+ 0.999885528087616,
289
+ 0.5146499085426329,
290
+ 1.0030832004547119
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 7145,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_dish_drainer": {
319
+ "action": {
320
+ "mean": [
321
+ 0.40166160464286804,
322
+ -0.011121422052383423,
323
+ 0.18381044268608093,
324
+ 0.7193055748939514,
325
+ 0.25373342633247375,
326
+ -0.5631368160247803,
327
+ -0.1440804898738861,
328
+ 0.8170215487480164,
329
+ 0.1759030520915985,
330
+ 0.3052484393119812,
331
+ 0.27496522665023804,
332
+ 0.07536163926124573,
333
+ 0.11210401356220245,
334
+ 0.5866131782531738,
335
+ 0.166164368391037,
336
+ -0.6815540790557861,
337
+ -0.029566073790192604,
338
+ 0.9651414752006531,
339
+ 0.16927561163902283,
340
+ -0.015535339713096619
341
+ ],
342
+ "std": [
343
+ 0.11050388216972351,
344
+ 0.09560802578926086,
345
+ 0.07149423658847809,
346
+ 0.16429585218429565,
347
+ 0.23607663810253143,
348
+ 0.13553042709827423,
349
+ 0.4136315882205963,
350
+ 0.16760703921318054,
351
+ 0.28564009070396423,
352
+ 0.9522888660430908,
353
+ 0.031309936195611954,
354
+ 0.04574710130691528,
355
+ 0.0856705829501152,
356
+ 0.29802361130714417,
357
+ 0.15602004528045654,
358
+ 0.22492949664592743,
359
+ 0.10802315920591354,
360
+ 0.041615039110183716,
361
+ 0.15993009507656097,
362
+ 0.9998806715011597
363
+ ],
364
+ "max": [
365
+ 0.6568294763565063,
366
+ 0.20922525227069855,
367
+ 0.329291433095932,
368
+ 0.9988790154457092,
369
+ 0.8221861720085144,
370
+ -0.02126980759203434,
371
+ 0.554952085018158,
372
+ 0.9999961256980896,
373
+ 0.8352594971656799,
374
+ 1.0,
375
+ 0.3725535273551941,
376
+ 0.20133008062839508,
377
+ 0.2683204710483551,
378
+ 0.9969081878662109,
379
+ 0.5947288274765015,
380
+ 0.135818213224411,
381
+ 0.297533243894577,
382
+ 0.9999833106994629,
383
+ 0.6284497380256653,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.1679762601852417,
388
+ -0.2037276178598404,
389
+ 0.026118876412510872,
390
+ 0.06734701991081238,
391
+ -0.3303077816963196,
392
+ -0.865761399269104,
393
+ -0.9697803854942322,
394
+ 0.24385260045528412,
395
+ -0.3337814211845398,
396
+ -1.0,
397
+ 0.17690593004226685,
398
+ -0.019342761486768723,
399
+ -0.045900676399469376,
400
+ -0.08388058096170425,
401
+ -0.1825810670852661,
402
+ -0.9999706149101257,
403
+ -0.4282298684120178,
404
+ 0.7756603956222534,
405
+ -0.19046637415885925,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.22041156470775605,
410
+ -0.17958899974822998,
411
+ 0.04473079532384872,
412
+ 0.2284090793132782,
413
+ -0.2088965356349945,
414
+ -0.811203727722168,
415
+ -0.9306126594543457,
416
+ 0.3530711317062378,
417
+ -0.2207678198814392,
418
+ -1.0,
419
+ 0.2055837804079056,
420
+ 0.005079864375293255,
421
+ -0.04285515695810318,
422
+ 0.023393160849809646,
423
+ -0.12909780085086822,
424
+ -0.9969730639457702,
425
+ -0.31871861577033994,
426
+ 0.8128526282310485,
427
+ -0.12555764615535736,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.6256548881530761,
432
+ 0.16506216526031484,
433
+ 0.3053938007354736,
434
+ 0.9577780866622918,
435
+ 0.7322160029411315,
436
+ -0.22335838973522296,
437
+ 0.43161325573921194,
438
+ 0.9983374524116516,
439
+ 0.7683744573593138,
440
+ 1.0,
441
+ 0.344862767457962,
442
+ 0.19341405749320983,
443
+ 0.24164194464683514,
444
+ 0.9684402346611023,
445
+ 0.5674381494522094,
446
+ -0.24195577383041442,
447
+ 0.2379095745086669,
448
+ 0.9997554516792297,
449
+ 0.564831252098083,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.3995276093482971,
478
+ -0.01743783988058567,
479
+ 0.17103439569473267,
480
+ 0.7105359435081482,
481
+ 0.238600954413414,
482
+ -0.5759879946708679,
483
+ -0.11663730442523956,
484
+ 0.8159795999526978,
485
+ 0.18427881598472595,
486
+ 0.32329148054122925,
487
+ 0.26429733633995056,
488
+ 0.056828975677490234,
489
+ 0.10836688429117203,
490
+ 0.5435153841972351,
491
+ 0.10053253918886185,
492
+ -0.7011978030204773,
493
+ -0.03383757919073105,
494
+ 0.9509052038192749,
495
+ 0.0682743564248085,
496
+ -0.11205186694860458
497
+ ],
498
+ "std": [
499
+ 0.10546877980232239,
500
+ 0.09407489001750946,
501
+ 0.07594858855009079,
502
+ 0.15545178949832916,
503
+ 0.23370550572872162,
504
+ 0.1663108617067337,
505
+ 0.417312353849411,
506
+ 0.1589633673429489,
507
+ 0.29529041051864624,
508
+ 0.8386000394821167,
509
+ 0.03193666785955429,
510
+ 0.03702627867460251,
511
+ 0.08499232679605484,
512
+ 0.33746665716171265,
513
+ 0.13817641139030457,
514
+ 0.2642515003681183,
515
+ 0.13742685317993164,
516
+ 0.10328594595193863,
517
+ 0.24581077694892883,
518
+ 0.9880411624908447
519
+ ],
520
+ "max": [
521
+ 0.6216338276863098,
522
+ 0.1681845635175705,
523
+ 0.3582729399204254,
524
+ 0.9998778104782104,
525
+ 0.7569742202758789,
526
+ 0.29317960143089294,
527
+ 0.5474420785903931,
528
+ 1.0,
529
+ 0.9644882678985596,
530
+ 1.2399240732192993,
531
+ 0.36810019612312317,
532
+ 0.15229015052318573,
533
+ 0.3755773603916168,
534
+ 0.9999530911445618,
535
+ 0.47173869609832764,
536
+ 0.4396477937698364,
537
+ 0.5856077671051025,
538
+ 1.0,
539
+ 0.9141661524772644,
540
+ 1.0335123538970947
541
+ ],
542
+ "min": [
543
+ 0.17779576778411865,
544
+ -0.2223799079656601,
545
+ 0.009585360996425152,
546
+ 0.27525120973587036,
547
+ -0.3401731848716736,
548
+ -0.8740139603614807,
549
+ -0.922980010509491,
550
+ 0.20966650545597076,
551
+ -0.5117865800857544,
552
+ -1.04777991771698,
553
+ 0.13721425831317902,
554
+ -0.11607959121465683,
555
+ -0.006126723252236843,
556
+ -0.12117788940668106,
557
+ -0.5865428447723389,
558
+ -0.9999897480010986,
559
+ -0.48856121301651,
560
+ -0.09543908387422562,
561
+ -0.9954046607017517,
562
+ -1.1056499481201172
563
+ ],
564
+ "q01": [
565
+ 0.22190109610557557,
566
+ -0.19557971894741058,
567
+ 0.02071425139904022,
568
+ 0.33727509021759033,
569
+ -0.20722176849842072,
570
+ -0.8324707460403442,
571
+ -0.8625615048408508,
572
+ 0.48607451796531675,
573
+ -0.30660848736763,
574
+ -0.9315443730354309,
575
+ 0.19041878879070281,
576
+ -0.04380948930978775,
577
+ -0.0050327684171497826,
578
+ -0.05638677150011063,
579
+ -0.26807846426963805,
580
+ -0.9989835453033448,
581
+ -0.329305921792984,
582
+ 0.4013799297809601,
583
+ -0.8769975972175598,
584
+ -1.088063154220581
585
+ ],
586
+ "q99": [
587
+ 0.6066525983810425,
588
+ 0.13703446269035335,
589
+ 0.3049686551094055,
590
+ 0.995180070400238,
591
+ 0.6961594343185422,
592
+ 0.0980641171336174,
593
+ 0.463529108762741,
594
+ 0.9997917461395264,
595
+ 0.7787499904632564,
596
+ 1.0201601552963255,
597
+ 0.3442830562591551,
598
+ 0.1231292974948883,
599
+ 0.30282683849334713,
600
+ 0.9978944087028503,
601
+ 0.41937308669090234,
602
+ 0.0980641171336174,
603
+ 0.27175513744354135,
604
+ 0.999885528087616,
605
+ 0.5146499085426329,
606
+ 1.0030832004547119
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 7145,
632
+ "num_trajectories": 50
633
+ }
634
+ }
2e-5/twinvla-scratch-aloha_dish_drainer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6ce8e400165455003d78690e7c4c24eb46ad2b6febf7d1b4396cba4383e28c7
3
+ size 2889536104
2e-5/twinvla-scratch-aloha_dish_drainer/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/scratch2/jellyho/rebuttal/singlevla-work/Eagle2_1B-Scratch-DiT-B",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
2e-5/twinvla-scratch-aloha_handover_box/config.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": true,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": null,
28
+ "_name_or_path": "/data5/jellyho/twinvla-checkpoints/Eagle2_1B-Scratch-DiT-B",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "None",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "id2label": {
64
+ "0": "LABEL_0",
65
+ "1": "LABEL_1"
66
+ },
67
+ "image_size": 448,
68
+ "is_decoder": false,
69
+ "is_encoder_decoder": false,
70
+ "keep_aspect_ratio": false,
71
+ "knowledge_insulation": false,
72
+ "label2id": {
73
+ "LABEL_0": 0,
74
+ "LABEL_1": 1
75
+ },
76
+ "length_penalty": 1.0,
77
+ "llm_config": {
78
+ "_attn_implementation_autoset": true,
79
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
80
+ "add_cross_attention": false,
81
+ "architectures": [
82
+ "Qwen2ForCausalLM"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "auto_map": {
86
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
87
+ "AutoModel": "modeling_qwen2.Qwen2Model",
88
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
89
+ },
90
+ "bad_words_ids": null,
91
+ "begin_suppress_tokens": null,
92
+ "bos_token_id": 151643,
93
+ "chunk_size_feed_forward": 0,
94
+ "cross_attention_hidden_size": null,
95
+ "decoder_start_token_id": null,
96
+ "diversity_penalty": 0.0,
97
+ "do_sample": false,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": 151645,
101
+ "exponential_decay_length_penalty": null,
102
+ "finetuning_task": null,
103
+ "forced_bos_token_id": null,
104
+ "forced_eos_token_id": null,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 896,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 4864,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 32768,
122
+ "max_window_layers": 21,
123
+ "min_length": 0,
124
+ "model_type": "qwen2",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 14,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 24,
130
+ "num_key_value_heads": 2,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": null,
145
+ "rope_theta": 1000000.0,
146
+ "sep_token_id": null,
147
+ "sliding_window": 32768,
148
+ "suppress_tokens": null,
149
+ "task_specific_params": null,
150
+ "temperature": 1.0,
151
+ "tf_legacy_loss": false,
152
+ "tie_encoder_decoder": false,
153
+ "tie_word_embeddings": true,
154
+ "tokenizer_class": null,
155
+ "top_k": 50,
156
+ "top_p": 1.0,
157
+ "torch_dtype": "bfloat16",
158
+ "torchscript": false,
159
+ "transformers_version": "4.50.0.dev0",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": false,
163
+ "use_sliding_window": false,
164
+ "vocab_size": 151674
165
+ },
166
+ "loss_version": "v4",
167
+ "max_dynamic_patch": 12,
168
+ "max_length": 20,
169
+ "min_dynamic_patch": 1,
170
+ "min_length": 0,
171
+ "mlp_checkpoint": true,
172
+ "model_path": "nvidia/Eagle2-1B",
173
+ "model_type": "Eagle2_1BVLA",
174
+ "modeling": "denoising",
175
+ "no_repeat_ngram_size": 0,
176
+ "normalization": "quantile",
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_readouts": 1,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad2square": false,
185
+ "pad_token_id": null,
186
+ "pre_feature_reduction": false,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "ps_version": "v2",
191
+ "readout_token_as_eos": true,
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "return_text": null,
197
+ "select_layer": -1,
198
+ "sep_token_id": null,
199
+ "state_dim": 10,
200
+ "stopping_token": "|",
201
+ "suppress_tokens": null,
202
+ "task_specific_params": null,
203
+ "temperature": 1.0,
204
+ "template": "qwen2-chat",
205
+ "test_denoising_steps": 10,
206
+ "tf_legacy_loss": false,
207
+ "tie_encoder_decoder": false,
208
+ "tie_word_embeddings": true,
209
+ "tokenizer_class": null,
210
+ "top_k": 50,
211
+ "top_p": 1.0,
212
+ "torch_dtype": "bfloat16",
213
+ "torchscript": false,
214
+ "train_denoising_steps": 100,
215
+ "typical_p": 1.0,
216
+ "use_backbone_lora": 0,
217
+ "use_bfloat16": false,
218
+ "use_llm_lora": 0,
219
+ "use_thumbnail": true,
220
+ "vision_config": {
221
+ "_attn_implementation_autoset": true,
222
+ "_name_or_path": "",
223
+ "add_cross_attention": false,
224
+ "architectures": [
225
+ "SiglipVisionModel"
226
+ ],
227
+ "attention_dropout": 0.0,
228
+ "auto_map": {
229
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
230
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
231
+ },
232
+ "bad_words_ids": null,
233
+ "begin_suppress_tokens": null,
234
+ "bos_token_id": null,
235
+ "chunk_size_feed_forward": 0,
236
+ "cross_attention_hidden_size": null,
237
+ "decoder_start_token_id": null,
238
+ "diversity_penalty": 0.0,
239
+ "do_sample": false,
240
+ "drop_path_rate": 0.1,
241
+ "early_stopping": false,
242
+ "encoder_no_repeat_ngram_size": 0,
243
+ "eos_token_id": null,
244
+ "exponential_decay_length_penalty": null,
245
+ "finetuning_task": null,
246
+ "forced_bos_token_id": null,
247
+ "forced_eos_token_id": null,
248
+ "hidden_act": "gelu_pytorch_tanh",
249
+ "hidden_size": 1152,
250
+ "id2label": {
251
+ "0": "LABEL_0",
252
+ "1": "LABEL_1"
253
+ },
254
+ "image_size": 448,
255
+ "intermediate_size": 4304,
256
+ "is_decoder": false,
257
+ "is_encoder_decoder": false,
258
+ "label2id": {
259
+ "LABEL_0": 0,
260
+ "LABEL_1": 1
261
+ },
262
+ "layer_norm_eps": 1e-06,
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "min_length": 0,
266
+ "model_type": "siglip_vision_model",
267
+ "no_repeat_ngram_size": 0,
268
+ "num_attention_heads": 16,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_channels": 3,
272
+ "num_hidden_layers": 27,
273
+ "num_image_tokens": 1024,
274
+ "num_return_sequences": 1,
275
+ "output_attentions": false,
276
+ "output_hidden_states": false,
277
+ "output_scores": false,
278
+ "pad_token_id": null,
279
+ "patch_size": 14,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "projection_dim": 2048,
283
+ "projector_hidden_act": "gelu_fast",
284
+ "pruned_heads": {},
285
+ "remove_invalid_values": false,
286
+ "repetition_penalty": 1.0,
287
+ "return_dict": true,
288
+ "return_dict_in_generate": false,
289
+ "sep_token_id": null,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tf_legacy_loss": false,
294
+ "tie_encoder_decoder": false,
295
+ "tie_word_embeddings": true,
296
+ "tokenizer_class": null,
297
+ "top_k": 50,
298
+ "top_p": 1.0,
299
+ "torch_dtype": "bfloat16",
300
+ "torchscript": false,
301
+ "transformers_version": "4.50.0.dev0",
302
+ "typical_p": 1.0,
303
+ "use_bfloat16": false,
304
+ "vision_use_head": false
305
+ },
306
+ "vocab_size": 151674,
307
+ "vocab_start": null
308
+ },
309
+ "singlevla_config_path": "/data5/jellyho/twinvla-checkpoints/Eagle2_1B-Scratch-DiT-B",
310
+ "singlevla_pretrained_path": null,
311
+ "state_dim": 10,
312
+ "torch_dtype": "bfloat16",
313
+ "transformers_version": "4.50.0.dev0"
314
+ }
2e-5/twinvla-scratch-aloha_handover_box/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_handover_box": {
3
+ "action": {
4
+ "mean": [
5
+ 0.3224456310272217,
6
+ -0.07136797159910202,
7
+ 0.16350853443145752,
8
+ 0.6155848503112793,
9
+ 0.02582639828324318,
10
+ -0.6291783452033997,
11
+ -0.12873497605323792,
12
+ 0.9684513211250305,
13
+ -0.05324755236506462,
14
+ 0.32048356533050537,
15
+ 0.3553532660007477,
16
+ -0.01726912148296833,
17
+ 0.2535472810268402,
18
+ 0.9467610120773315,
19
+ -0.10934814065694809,
20
+ -0.16364224255084991,
21
+ 0.1080813780426979,
22
+ 0.9652291536331177,
23
+ -0.08225563168525696,
24
+ 0.6809535622596741
25
+ ],
26
+ "std": [
27
+ 0.07454729825258255,
28
+ 0.08869028091430664,
29
+ 0.07996603846549988,
30
+ 0.33460596203804016,
31
+ 0.19985826313495636,
32
+ 0.26947179436683655,
33
+ 0.12514576315879822,
34
+ 0.030899839475750923,
35
+ 0.16146357357501984,
36
+ 0.9473047256469727,
37
+ 0.064877949655056,
38
+ 0.038949884474277496,
39
+ 0.027652490884065628,
40
+ 0.10490523278713226,
41
+ 0.1838432103395462,
42
+ 0.14178058505058289,
43
+ 0.2033252865076065,
44
+ 0.06656654924154282,
45
+ 0.06421922147274017,
46
+ 0.7322930693626404
47
+ ],
48
+ "max": [
49
+ 0.48683926463127136,
50
+ 0.0484432689845562,
51
+ 0.31490612030029297,
52
+ 0.99891197681427,
53
+ 0.4277522563934326,
54
+ 0.06322141736745834,
55
+ 0.4004654884338379,
56
+ 0.9999857544898987,
57
+ 0.3100079298019409,
58
+ 1.0,
59
+ 0.5334027409553528,
60
+ 0.08494444936513901,
61
+ 0.36568865180015564,
62
+ 0.9999882578849792,
63
+ 0.2546274662017822,
64
+ 0.1172015443444252,
65
+ 0.7982608079910278,
66
+ 0.9999992251396179,
67
+ 0.20094169676303864,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1422317922115326,
72
+ -0.2763901352882385,
73
+ -0.0600760243833065,
74
+ -0.14848311245441437,
75
+ -0.6282482743263245,
76
+ -0.9999129176139832,
77
+ -0.42181891202926636,
78
+ 0.7404066324234009,
79
+ -0.6676974296569824,
80
+ -1.0,
81
+ 0.1786160171031952,
82
+ -0.1845615804195404,
83
+ 0.1687021553516388,
84
+ 0.2762398421764374,
85
+ -0.7479667067527771,
86
+ -0.8485982418060303,
87
+ -0.2597721517086029,
88
+ 0.6015138626098633,
89
+ -0.3933228552341461,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.1950138956308365,
94
+ -0.24691226959228516,
95
+ -0.015285035967826844,
96
+ -0.04555398792028427,
97
+ -0.4452396559715271,
98
+ -0.996303243637085,
99
+ -0.3760478734970093,
100
+ 0.8516808867454528,
101
+ -0.46342918753623963,
102
+ -1.0,
103
+ 0.21926841557025908,
104
+ -0.1317625629901886,
105
+ 0.1978745412826538,
106
+ 0.5117229986190795,
107
+ -0.6376786828041077,
108
+ -0.6609986042976379,
109
+ -0.19099083304405212,
110
+ 0.6930621123313904,
111
+ -0.2356126993894577,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.47150796771049497,
116
+ 0.038070930540561675,
117
+ 0.28182336688041676,
118
+ 0.9817836880683899,
119
+ 0.3871919810771942,
120
+ -0.1345064049959186,
121
+ 0.20285944879054985,
122
+ 0.9992118668556214,
123
+ 0.2293877118825912,
124
+ 1.0,
125
+ 0.49810330152511595,
126
+ 0.0599309906363487,
127
+ 0.3309180569648742,
128
+ 0.9995350050926208,
129
+ 0.1829529863595952,
130
+ 0.03216676786541939,
131
+ 0.7132800936698909,
132
+ 0.9997488117218017,
133
+ 0.08941484957933345,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.3200361728668213,
162
+ -0.06315236538648605,
163
+ 0.1539715975522995,
164
+ 0.6064432263374329,
165
+ 0.06654595583677292,
166
+ -0.6403248906135559,
167
+ -0.09359703212976456,
168
+ 0.962713897228241,
169
+ 0.0318731926381588,
170
+ 0.6514409780502319,
171
+ 0.35154759883880615,
172
+ -0.018256481736898422,
173
+ 0.23897576332092285,
174
+ 0.9405075311660767,
175
+ -0.11245886981487274,
176
+ -0.2019508332014084,
177
+ 0.10934195667505264,
178
+ 0.9565085172653198,
179
+ -0.0809820145368576,
180
+ 0.808542013168335
181
+ ],
182
+ "std": [
183
+ 0.07218372076749802,
184
+ 0.07983935624361038,
185
+ 0.08212248235940933,
186
+ 0.32560113072395325,
187
+ 0.15540780127048492,
188
+ 0.29600197076797485,
189
+ 0.1470419317483902,
190
+ 0.06898750364780426,
191
+ 0.19242025911808014,
192
+ 0.4760681986808777,
193
+ 0.06400060653686523,
194
+ 0.03751807287335396,
195
+ 0.0323367603123188,
196
+ 0.10509958118200302,
197
+ 0.17797957360744476,
198
+ 0.13889151811599731,
199
+ 0.20048993825912476,
200
+ 0.08273555338382721,
201
+ 0.13973523676395416,
202
+ 0.40020158886909485
203
+ ],
204
+ "max": [
205
+ 0.47570154070854187,
206
+ 0.08932404220104218,
207
+ 0.44513142108917236,
208
+ 0.9999915361404419,
209
+ 0.6316148042678833,
210
+ 0.7311769127845764,
211
+ 0.5646719932556152,
212
+ 1.0,
213
+ 0.9345466494560242,
214
+ 1.3299691677093506,
215
+ 0.5250220894813538,
216
+ 0.07912999391555786,
217
+ 0.41775044798851013,
218
+ 0.9999979138374329,
219
+ 0.2288104146718979,
220
+ 0.2556033134460449,
221
+ 0.7930954098701477,
222
+ 1.0,
223
+ 0.8460071086883545,
224
+ 1.1448447704315186
225
+ ],
226
+ "min": [
227
+ 0.15507374703884125,
228
+ -0.24968452751636505,
229
+ -0.005626574158668518,
230
+ -0.12249666452407837,
231
+ -0.3874700665473938,
232
+ -1.0,
233
+ -0.8481224179267883,
234
+ 0.28493279218673706,
235
+ -0.8170893788337708,
236
+ -1.083611011505127,
237
+ 0.18484443426132202,
238
+ -0.1679670214653015,
239
+ 0.1543029397726059,
240
+ 0.2590605616569519,
241
+ -0.7203781604766846,
242
+ -0.8606433272361755,
243
+ -0.2443554699420929,
244
+ 0.2216777801513672,
245
+ -0.9731146693229675,
246
+ -1.0848060846328735
247
+ ],
248
+ "q01": [
249
+ 0.1903739631175995,
250
+ -0.22257488489151,
251
+ -0.0036250025033950804,
252
+ -0.015333320312201977,
253
+ -0.2553225290775299,
254
+ -0.9997995805740356,
255
+ -0.3545967137813568,
256
+ 0.6295642066001892,
257
+ -0.32733017563819883,
258
+ -0.4065189242362976,
259
+ 0.22028838396072387,
260
+ -0.1278022611141205,
261
+ 0.17875114858150482,
262
+ 0.488557243347168,
263
+ -0.6262442255020142,
264
+ -0.6858670902252197,
265
+ -0.17815817892551422,
266
+ 0.6348884439468384,
267
+ -0.5856496715545654,
268
+ -0.4086606001853943
269
+ ],
270
+ "q99": [
271
+ 0.4643457818031311,
272
+ 0.05302721098065367,
273
+ 0.32663319587707507,
274
+ 0.995180070400238,
275
+ 0.426870135068893,
276
+ 0.18705489814281454,
277
+ 0.3631119978427884,
278
+ 0.9999364447593689,
279
+ 0.7475578069686883,
280
+ 1.178509011268615,
281
+ 0.4939642870426177,
282
+ 0.051381030380725806,
283
+ 0.3385275864601135,
284
+ 0.999157931804657,
285
+ 0.16684140086173982,
286
+ 0.05098062053322772,
287
+ 0.7065742087364195,
288
+ 0.9998370099067688,
289
+ 0.5137611627578699,
290
+ 1.0447997903823851
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 11829,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_handover_box_new": {
319
+ "action": {
320
+ "mean": [
321
+ 0.3224456310272217,
322
+ -0.07136797159910202,
323
+ 0.16350853443145752,
324
+ 0.6155848503112793,
325
+ 0.02582639828324318,
326
+ -0.6291783452033997,
327
+ -0.12873497605323792,
328
+ 0.9684513211250305,
329
+ -0.05324755236506462,
330
+ 0.32048356533050537,
331
+ 0.3553532660007477,
332
+ -0.01726912148296833,
333
+ 0.2535472810268402,
334
+ 0.9467610120773315,
335
+ -0.10934814065694809,
336
+ -0.16364224255084991,
337
+ 0.1080813780426979,
338
+ 0.9652291536331177,
339
+ -0.08225563168525696,
340
+ 0.6809535622596741
341
+ ],
342
+ "std": [
343
+ 0.07454729825258255,
344
+ 0.08869028091430664,
345
+ 0.07996603846549988,
346
+ 0.33460596203804016,
347
+ 0.19985826313495636,
348
+ 0.26947179436683655,
349
+ 0.12514576315879822,
350
+ 0.030899839475750923,
351
+ 0.16146357357501984,
352
+ 0.9473047256469727,
353
+ 0.064877949655056,
354
+ 0.038949884474277496,
355
+ 0.027652490884065628,
356
+ 0.10490523278713226,
357
+ 0.1838432103395462,
358
+ 0.14178058505058289,
359
+ 0.2033252865076065,
360
+ 0.06656654924154282,
361
+ 0.06421922147274017,
362
+ 0.7322930693626404
363
+ ],
364
+ "max": [
365
+ 0.48683926463127136,
366
+ 0.0484432689845562,
367
+ 0.31490612030029297,
368
+ 0.99891197681427,
369
+ 0.4277522563934326,
370
+ 0.06322141736745834,
371
+ 0.4004654884338379,
372
+ 0.9999857544898987,
373
+ 0.3100079298019409,
374
+ 1.0,
375
+ 0.5334027409553528,
376
+ 0.08494444936513901,
377
+ 0.36568865180015564,
378
+ 0.9999882578849792,
379
+ 0.2546274662017822,
380
+ 0.1172015443444252,
381
+ 0.7982608079910278,
382
+ 0.9999992251396179,
383
+ 0.20094169676303864,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.1422317922115326,
388
+ -0.2763901352882385,
389
+ -0.0600760243833065,
390
+ -0.14848311245441437,
391
+ -0.6282482743263245,
392
+ -0.9999129176139832,
393
+ -0.42181891202926636,
394
+ 0.7404066324234009,
395
+ -0.6676974296569824,
396
+ -1.0,
397
+ 0.1786160171031952,
398
+ -0.1845615804195404,
399
+ 0.1687021553516388,
400
+ 0.2762398421764374,
401
+ -0.7479667067527771,
402
+ -0.8485982418060303,
403
+ -0.2597721517086029,
404
+ 0.6015138626098633,
405
+ -0.3933228552341461,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.1950138956308365,
410
+ -0.24691226959228516,
411
+ -0.015285035967826844,
412
+ -0.04555398792028427,
413
+ -0.4452396559715271,
414
+ -0.996303243637085,
415
+ -0.3760478734970093,
416
+ 0.8516808867454528,
417
+ -0.46342918753623963,
418
+ -1.0,
419
+ 0.21926841557025908,
420
+ -0.1317625629901886,
421
+ 0.1978745412826538,
422
+ 0.5117229986190795,
423
+ -0.6376786828041077,
424
+ -0.6609986042976379,
425
+ -0.19099083304405212,
426
+ 0.6930621123313904,
427
+ -0.2356126993894577,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.47150796771049497,
432
+ 0.038070930540561675,
433
+ 0.28182336688041676,
434
+ 0.9817836880683899,
435
+ 0.3871919810771942,
436
+ -0.1345064049959186,
437
+ 0.20285944879054985,
438
+ 0.9992118668556214,
439
+ 0.2293877118825912,
440
+ 1.0,
441
+ 0.49810330152511595,
442
+ 0.0599309906363487,
443
+ 0.3309180569648742,
444
+ 0.9995350050926208,
445
+ 0.1829529863595952,
446
+ 0.03216676786541939,
447
+ 0.7132800936698909,
448
+ 0.9997488117218017,
449
+ 0.08941484957933345,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.3200361728668213,
478
+ -0.06315236538648605,
479
+ 0.1539715975522995,
480
+ 0.6064432263374329,
481
+ 0.06654595583677292,
482
+ -0.6403248906135559,
483
+ -0.09359703212976456,
484
+ 0.962713897228241,
485
+ 0.0318731926381588,
486
+ 0.6514409780502319,
487
+ 0.35154759883880615,
488
+ -0.018256481736898422,
489
+ 0.23897576332092285,
490
+ 0.9405075311660767,
491
+ -0.11245886981487274,
492
+ -0.2019508332014084,
493
+ 0.10934195667505264,
494
+ 0.9565085172653198,
495
+ -0.0809820145368576,
496
+ 0.808542013168335
497
+ ],
498
+ "std": [
499
+ 0.07218372076749802,
500
+ 0.07983935624361038,
501
+ 0.08212248235940933,
502
+ 0.32560113072395325,
503
+ 0.15540780127048492,
504
+ 0.29600197076797485,
505
+ 0.1470419317483902,
506
+ 0.06898750364780426,
507
+ 0.19242025911808014,
508
+ 0.4760681986808777,
509
+ 0.06400060653686523,
510
+ 0.03751807287335396,
511
+ 0.0323367603123188,
512
+ 0.10509958118200302,
513
+ 0.17797957360744476,
514
+ 0.13889151811599731,
515
+ 0.20048993825912476,
516
+ 0.08273555338382721,
517
+ 0.13973523676395416,
518
+ 0.40020158886909485
519
+ ],
520
+ "max": [
521
+ 0.47570154070854187,
522
+ 0.08932404220104218,
523
+ 0.44513142108917236,
524
+ 0.9999915361404419,
525
+ 0.6316148042678833,
526
+ 0.7311769127845764,
527
+ 0.5646719932556152,
528
+ 1.0,
529
+ 0.9345466494560242,
530
+ 1.3299691677093506,
531
+ 0.5250220894813538,
532
+ 0.07912999391555786,
533
+ 0.41775044798851013,
534
+ 0.9999979138374329,
535
+ 0.2288104146718979,
536
+ 0.2556033134460449,
537
+ 0.7930954098701477,
538
+ 1.0,
539
+ 0.8460071086883545,
540
+ 1.1448447704315186
541
+ ],
542
+ "min": [
543
+ 0.15507374703884125,
544
+ -0.24968452751636505,
545
+ -0.005626574158668518,
546
+ -0.12249666452407837,
547
+ -0.3874700665473938,
548
+ -1.0,
549
+ -0.8481224179267883,
550
+ 0.28493279218673706,
551
+ -0.8170893788337708,
552
+ -1.083611011505127,
553
+ 0.18484443426132202,
554
+ -0.1679670214653015,
555
+ 0.1543029397726059,
556
+ 0.2590605616569519,
557
+ -0.7203781604766846,
558
+ -0.8606433272361755,
559
+ -0.2443554699420929,
560
+ 0.2216777801513672,
561
+ -0.9731146693229675,
562
+ -1.0848060846328735
563
+ ],
564
+ "q01": [
565
+ 0.1903739631175995,
566
+ -0.22257488489151,
567
+ -0.0036250025033950804,
568
+ -0.015333320312201977,
569
+ -0.2553225290775299,
570
+ -0.9997995805740356,
571
+ -0.3545967137813568,
572
+ 0.6295642066001892,
573
+ -0.32733017563819883,
574
+ -0.4065189242362976,
575
+ 0.22028838396072387,
576
+ -0.1278022611141205,
577
+ 0.17875114858150482,
578
+ 0.488557243347168,
579
+ -0.6262442255020142,
580
+ -0.6858670902252197,
581
+ -0.17815817892551422,
582
+ 0.6348884439468384,
583
+ -0.5856496715545654,
584
+ -0.4086606001853943
585
+ ],
586
+ "q99": [
587
+ 0.4643457818031311,
588
+ 0.05302721098065367,
589
+ 0.32663319587707507,
590
+ 0.995180070400238,
591
+ 0.426870135068893,
592
+ 0.18705489814281454,
593
+ 0.3631119978427884,
594
+ 0.9999364447593689,
595
+ 0.7475578069686883,
596
+ 1.178509011268615,
597
+ 0.4939642870426177,
598
+ 0.051381030380725806,
599
+ 0.3385275864601135,
600
+ 0.999157931804657,
601
+ 0.16684140086173982,
602
+ 0.05098062053322772,
603
+ 0.7065742087364195,
604
+ 0.9998370099067688,
605
+ 0.5137611627578699,
606
+ 1.0447997903823851
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 11829,
632
+ "num_trajectories": 50
633
+ }
634
+ }
2e-5/twinvla-scratch-aloha_handover_box/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac71b26bd439527bce49753635984d136ea90a4514ce78738b77fe82a090911
3
+ size 2889536104
2e-5/twinvla-scratch-aloha_handover_box/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/data5/jellyho/twinvla-checkpoints/Eagle2_1B-Scratch-DiT-B",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
2e-5/twinvla-scratch-aloha_handover_box/training_states.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3272c2fbc20d5db43220008c4f3ee2ac707e606bf35d2cd829e5c5feda24abbc
3
+ size 4126124658
2e-5/twinvla-scratch-aloha_lift_box/config.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "global_normalization": true,
12
+ "hz_interpolate": null,
13
+ "interpolate_gripper": false,
14
+ "knowledge_insulation": false,
15
+ "model_path": null,
16
+ "model_type": "Eagle2_1BTwinVLA",
17
+ "modeling": "denoising",
18
+ "normalization": "quantile",
19
+ "num_readouts": 1,
20
+ "readout_token_as_eos": true,
21
+ "share_decoder": true,
22
+ "share_embed_tokens": true,
23
+ "share_vision": true,
24
+ "singlevla_config": {
25
+ "_attn_implementation_autoset": false,
26
+ "_attn_implementation_internal": null,
27
+ "_commit_hash": null,
28
+ "_name_or_path": "/data5/jellyho/twinvla-checkpoints/Eagle2_1B-Scratch-DiT-B",
29
+ "action_dim": 10,
30
+ "action_head": "DiT",
31
+ "action_head_hidden_dim": 1024,
32
+ "action_len": 20,
33
+ "add_cross_attention": false,
34
+ "aggregation": "None",
35
+ "architectures": [
36
+ "Eagle2_1BVLA"
37
+ ],
38
+ "auto_map": {},
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": null,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "id2label": {
64
+ "0": "LABEL_0",
65
+ "1": "LABEL_1"
66
+ },
67
+ "image_size": 448,
68
+ "is_decoder": false,
69
+ "is_encoder_decoder": false,
70
+ "keep_aspect_ratio": false,
71
+ "knowledge_insulation": false,
72
+ "label2id": {
73
+ "LABEL_0": 0,
74
+ "LABEL_1": 1
75
+ },
76
+ "length_penalty": 1.0,
77
+ "llm_config": {
78
+ "_attn_implementation_autoset": true,
79
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
80
+ "add_cross_attention": false,
81
+ "architectures": [
82
+ "Qwen2ForCausalLM"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "auto_map": {
86
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
87
+ "AutoModel": "modeling_qwen2.Qwen2Model",
88
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
89
+ },
90
+ "bad_words_ids": null,
91
+ "begin_suppress_tokens": null,
92
+ "bos_token_id": 151643,
93
+ "chunk_size_feed_forward": 0,
94
+ "cross_attention_hidden_size": null,
95
+ "decoder_start_token_id": null,
96
+ "diversity_penalty": 0.0,
97
+ "do_sample": false,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": 151645,
101
+ "exponential_decay_length_penalty": null,
102
+ "finetuning_task": null,
103
+ "forced_bos_token_id": null,
104
+ "forced_eos_token_id": null,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 896,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 4864,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 32768,
122
+ "max_window_layers": 21,
123
+ "min_length": 0,
124
+ "model_type": "qwen2",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 14,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 24,
130
+ "num_key_value_heads": 2,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": null,
145
+ "rope_theta": 1000000.0,
146
+ "sep_token_id": null,
147
+ "sliding_window": 32768,
148
+ "suppress_tokens": null,
149
+ "task_specific_params": null,
150
+ "temperature": 1.0,
151
+ "tf_legacy_loss": false,
152
+ "tie_encoder_decoder": false,
153
+ "tie_word_embeddings": true,
154
+ "tokenizer_class": null,
155
+ "top_k": 50,
156
+ "top_p": 1.0,
157
+ "torch_dtype": "bfloat16",
158
+ "torchscript": false,
159
+ "transformers_version": "4.50.0.dev0",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": false,
163
+ "use_sliding_window": false,
164
+ "vocab_size": 151674
165
+ },
166
+ "loss_version": "v4",
167
+ "max_dynamic_patch": 12,
168
+ "max_length": 20,
169
+ "min_dynamic_patch": 1,
170
+ "min_length": 0,
171
+ "mlp_checkpoint": true,
172
+ "model_path": "nvidia/Eagle2-1B",
173
+ "model_type": "Eagle2_1BVLA",
174
+ "modeling": "denoising",
175
+ "no_repeat_ngram_size": 0,
176
+ "normalization": "quantile",
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_readouts": 1,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad2square": false,
185
+ "pad_token_id": null,
186
+ "pre_feature_reduction": false,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "ps_version": "v2",
191
+ "readout_token_as_eos": true,
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "return_text": null,
197
+ "select_layer": -1,
198
+ "sep_token_id": null,
199
+ "state_dim": 10,
200
+ "stopping_token": "|",
201
+ "suppress_tokens": null,
202
+ "task_specific_params": null,
203
+ "temperature": 1.0,
204
+ "template": "qwen2-chat",
205
+ "test_denoising_steps": 10,
206
+ "tf_legacy_loss": false,
207
+ "tie_encoder_decoder": false,
208
+ "tie_word_embeddings": true,
209
+ "tokenizer_class": null,
210
+ "top_k": 50,
211
+ "top_p": 1.0,
212
+ "torch_dtype": "bfloat16",
213
+ "torchscript": false,
214
+ "train_denoising_steps": 100,
215
+ "typical_p": 1.0,
216
+ "use_backbone_lora": 0,
217
+ "use_bfloat16": false,
218
+ "use_llm_lora": 0,
219
+ "use_thumbnail": true,
220
+ "vision_config": {
221
+ "_attn_implementation_autoset": true,
222
+ "_name_or_path": "",
223
+ "add_cross_attention": false,
224
+ "architectures": [
225
+ "SiglipVisionModel"
226
+ ],
227
+ "attention_dropout": 0.0,
228
+ "auto_map": {
229
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
230
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
231
+ },
232
+ "bad_words_ids": null,
233
+ "begin_suppress_tokens": null,
234
+ "bos_token_id": null,
235
+ "chunk_size_feed_forward": 0,
236
+ "cross_attention_hidden_size": null,
237
+ "decoder_start_token_id": null,
238
+ "diversity_penalty": 0.0,
239
+ "do_sample": false,
240
+ "drop_path_rate": 0.1,
241
+ "early_stopping": false,
242
+ "encoder_no_repeat_ngram_size": 0,
243
+ "eos_token_id": null,
244
+ "exponential_decay_length_penalty": null,
245
+ "finetuning_task": null,
246
+ "forced_bos_token_id": null,
247
+ "forced_eos_token_id": null,
248
+ "hidden_act": "gelu_pytorch_tanh",
249
+ "hidden_size": 1152,
250
+ "id2label": {
251
+ "0": "LABEL_0",
252
+ "1": "LABEL_1"
253
+ },
254
+ "image_size": 448,
255
+ "intermediate_size": 4304,
256
+ "is_decoder": false,
257
+ "is_encoder_decoder": false,
258
+ "label2id": {
259
+ "LABEL_0": 0,
260
+ "LABEL_1": 1
261
+ },
262
+ "layer_norm_eps": 1e-06,
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "min_length": 0,
266
+ "model_type": "siglip_vision_model",
267
+ "no_repeat_ngram_size": 0,
268
+ "num_attention_heads": 16,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_channels": 3,
272
+ "num_hidden_layers": 27,
273
+ "num_image_tokens": 1024,
274
+ "num_return_sequences": 1,
275
+ "output_attentions": false,
276
+ "output_hidden_states": false,
277
+ "output_scores": false,
278
+ "pad_token_id": null,
279
+ "patch_size": 14,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "projection_dim": 2048,
283
+ "projector_hidden_act": "gelu_fast",
284
+ "pruned_heads": {},
285
+ "remove_invalid_values": false,
286
+ "repetition_penalty": 1.0,
287
+ "return_dict": true,
288
+ "return_dict_in_generate": false,
289
+ "sep_token_id": null,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tf_legacy_loss": false,
294
+ "tie_encoder_decoder": false,
295
+ "tie_word_embeddings": true,
296
+ "tokenizer_class": null,
297
+ "top_k": 50,
298
+ "top_p": 1.0,
299
+ "torch_dtype": "bfloat16",
300
+ "torchscript": false,
301
+ "transformers_version": "4.50.0.dev0",
302
+ "typical_p": 1.0,
303
+ "use_bfloat16": false,
304
+ "vision_use_head": false
305
+ },
306
+ "vocab_size": 151674,
307
+ "vocab_start": null
308
+ },
309
+ "singlevla_config_path": "/data5/jellyho/twinvla-checkpoints/Eagle2_1B-Scratch-DiT-B",
310
+ "singlevla_pretrained_path": null,
311
+ "state_dim": 10,
312
+ "torch_dtype": "bfloat16",
313
+ "transformers_version": "4.50.0.dev0"
314
+ }
2e-5/twinvla-scratch-aloha_lift_box/dataset_statistics.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_lift_box": {
3
+ "action": {
4
+ "mean": [
5
+ 0.36333414912223816,
6
+ -0.018824385479092598,
7
+ 0.17903447151184082,
8
+ 0.40831902623176575,
9
+ -0.11689134687185287,
10
+ -0.8073355555534363,
11
+ -0.10573221743106842,
12
+ 0.9415335059165955,
13
+ -0.1624741405248642,
14
+ 0.619253396987915,
15
+ 0.360858291387558,
16
+ 0.013982057571411133,
17
+ 0.20412704348564148,
18
+ 0.5001598596572876,
19
+ 0.1113751009106636,
20
+ -0.7415224313735962,
21
+ 0.052125416696071625,
22
+ 0.9483603239059448,
23
+ 0.16254939138889313,
24
+ 0.7590736150741577
25
+ ],
26
+ "std": [
27
+ 0.0638059601187706,
28
+ 0.06317952275276184,
29
+ 0.11073730885982513,
30
+ 0.31736457347869873,
31
+ 0.13928908109664917,
32
+ 0.2184142768383026,
33
+ 0.22394300997257233,
34
+ 0.08008279651403427,
35
+ 0.13921108841896057,
36
+ 0.785220742225647,
37
+ 0.05687877535820007,
38
+ 0.059404969215393066,
39
+ 0.1170634776353836,
40
+ 0.3238433599472046,
41
+ 0.14080215990543365,
42
+ 0.2507486343383789,
43
+ 0.21644321084022522,
44
+ 0.07544828206300735,
45
+ 0.1375824511051178,
46
+ 0.6510355472564697
47
+ ],
48
+ "max": [
49
+ 0.5681452751159668,
50
+ 0.2437673658132553,
51
+ 0.45541316270828247,
52
+ 0.9999293088912964,
53
+ 0.523757815361023,
54
+ 0.4592168927192688,
55
+ 0.7756927013397217,
56
+ 0.9999935030937195,
57
+ 0.2805824279785156,
58
+ 1.0,
59
+ 0.5600330233573914,
60
+ 0.3342031240463257,
61
+ 0.4682213366031647,
62
+ 0.9998393058776855,
63
+ 0.7949740886688232,
64
+ 0.1664249449968338,
65
+ 0.9131186604499817,
66
+ 0.9999967813491821,
67
+ 0.7936055064201355,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1517709195613861,
72
+ -0.2900286316871643,
73
+ -0.07412093877792358,
74
+ -0.4022133946418762,
75
+ -0.7361933588981628,
76
+ -0.9999988079071045,
77
+ -0.9935019016265869,
78
+ 0.10709662735462189,
79
+ -0.8023554682731628,
80
+ -1.0,
81
+ 0.15366072952747345,
82
+ -0.23686714470386505,
83
+ 0.0008372184820473194,
84
+ -0.5509981513023376,
85
+ -0.35234102606773376,
86
+ -0.999956488609314,
87
+ -0.5318384766578674,
88
+ 0.3388061225414276,
89
+ -0.27330997586250305,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.21054682105779648,
94
+ -0.1866426882147789,
95
+ 0.008138886513188495,
96
+ -0.19710821226239203,
97
+ -0.5368945515155792,
98
+ -0.9981186389923096,
99
+ -0.6956261324882507,
100
+ 0.6267582887411117,
101
+ -0.5600040704011917,
102
+ -1.0,
103
+ 0.2190245844423771,
104
+ -0.15968348175287247,
105
+ 0.025033411756157874,
106
+ -0.23832830414175987,
107
+ -0.2097599548101425,
108
+ -0.9988620406389237,
109
+ -0.4039672353863716,
110
+ 0.6080100274085999,
111
+ -0.19206354618072508,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.5033414244651794,
116
+ 0.16928535521030416,
117
+ 0.41566276580095285,
118
+ 0.9899059218168258,
119
+ 0.15462822496891018,
120
+ 0.03764873944222882,
121
+ 0.4657947558164549,
122
+ 0.9995575082302094,
123
+ 0.12326683558523567,
124
+ 1.0,
125
+ 0.4801343524456022,
126
+ 0.1795493066310881,
127
+ 0.4235989159345625,
128
+ 0.9913575077056883,
129
+ 0.5356137681007356,
130
+ 0.044951977618036626,
131
+ 0.7084567189216593,
132
+ 0.9992782145738601,
133
+ 0.6150667482614517,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.33022773265838623,
162
+ -0.022152910009026527,
163
+ 0.15129819512367249,
164
+ 0.2713927626609802,
165
+ -0.12320420891046524,
166
+ -0.8610075116157532,
167
+ -0.11408627033233643,
168
+ 0.939403772354126,
169
+ -0.15196889638900757,
170
+ 0.565762460231781,
171
+ 0.3260097801685333,
172
+ 0.01288874913007021,
173
+ 0.17190413177013397,
174
+ 0.3495618999004364,
175
+ 0.11379288882017136,
176
+ -0.8095750212669373,
177
+ 0.032405924052000046,
178
+ 0.9490125179290771,
179
+ 0.14386717975139618,
180
+ 0.7014499306678772
181
+ ],
182
+ "std": [
183
+ 0.0579490028321743,
184
+ 0.07013536244630814,
185
+ 0.09934848546981812,
186
+ 0.3117230236530304,
187
+ 0.13217779994010925,
188
+ 0.23493210971355438,
189
+ 0.2282971888780594,
190
+ 0.0823960080742836,
191
+ 0.1500341296195984,
192
+ 0.8515227437019348,
193
+ 0.04813272878527641,
194
+ 0.06810668110847473,
195
+ 0.10921779274940491,
196
+ 0.34190261363983154,
197
+ 0.146615669131279,
198
+ 0.26654887199401855,
199
+ 0.22192324697971344,
200
+ 0.08192051947116852,
201
+ 0.147200345993042,
202
+ 0.7158040404319763
203
+ ],
204
+ "max": [
205
+ 0.559941291809082,
206
+ 0.26086756587028503,
207
+ 0.4504527747631073,
208
+ 0.9999247789382935,
209
+ 0.4198993146419525,
210
+ 0.3512286841869354,
211
+ 0.7522457242012024,
212
+ 1.0,
213
+ 0.8956095576286316,
214
+ 1.470957636833191,
215
+ 0.5437091588973999,
216
+ 0.32627788186073303,
217
+ 0.4945259988307953,
218
+ 0.9998428821563721,
219
+ 0.7737792730331421,
220
+ 0.4633983373641968,
221
+ 0.9018308520317078,
222
+ 1.0,
223
+ 0.9907073378562927,
224
+ 1.361535668373108
225
+ ],
226
+ "min": [
227
+ 0.16681896150112152,
228
+ -0.20499344170093536,
229
+ -0.0030731656588613987,
230
+ -0.4872298836708069,
231
+ -0.6995252966880798,
232
+ -0.999997615814209,
233
+ -0.988165020942688,
234
+ 0.14152538776397705,
235
+ -0.8483264446258545,
236
+ -1.2196638584136963,
237
+ 0.14598572254180908,
238
+ -0.2277291864156723,
239
+ 0.004666368011385202,
240
+ -0.5699886679649353,
241
+ -0.40678924322128296,
242
+ -0.9999999403953552,
243
+ -0.6972882151603699,
244
+ 0.13462646305561066,
245
+ -0.643044650554657,
246
+ -1.164451003074646
247
+ ],
248
+ "q01": [
249
+ 0.2053149801492691,
250
+ -0.17586381256580352,
251
+ 0.015469378884881736,
252
+ -0.2516648331284523,
253
+ -0.5193796420097351,
254
+ -0.9995058274269104,
255
+ -0.7092818850278855,
256
+ 0.608681161403656,
257
+ -0.578884813785553,
258
+ -1.1618710005283355,
259
+ 0.21638940930366515,
260
+ -0.1691040216386318,
261
+ 0.011891756923869252,
262
+ -0.29012590169906616,
263
+ -0.20126488715410232,
264
+ -0.9995589327812194,
265
+ -0.49963704913854595,
266
+ 0.533765652179718,
267
+ -0.18726778730750085,
268
+ -1.082753186225891
269
+ ],
270
+ "q99": [
271
+ 0.5071819436550137,
272
+ 0.165744510143995,
273
+ 0.40272374808788297,
274
+ 0.995180070400238,
275
+ 0.16266889929771197,
276
+ 0.09040380395948588,
277
+ 0.5001266032457347,
278
+ 0.9997656464576721,
279
+ 0.10759550034999843,
280
+ 1.4176189756393425,
281
+ 0.47452601760625834,
282
+ 0.1839943121373646,
283
+ 0.40895662158727647,
284
+ 0.995180070400238,
285
+ 0.5622373461723318,
286
+ 0.07441098906099738,
287
+ 0.7114433652162524,
288
+ 0.999856880903244,
289
+ 0.5974926966428754,
290
+ 1.321595377922058
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 11572,
316
+ "num_trajectories": 50
317
+ },
318
+ "aloha_lift_box_new": {
319
+ "action": {
320
+ "mean": [
321
+ 0.36333414912223816,
322
+ -0.018824385479092598,
323
+ 0.17903447151184082,
324
+ 0.40831902623176575,
325
+ -0.11689134687185287,
326
+ -0.8073355555534363,
327
+ -0.10573221743106842,
328
+ 0.9415335059165955,
329
+ -0.1624741405248642,
330
+ 0.619253396987915,
331
+ 0.360858291387558,
332
+ 0.013982057571411133,
333
+ 0.20412704348564148,
334
+ 0.5001598596572876,
335
+ 0.1113751009106636,
336
+ -0.7415224313735962,
337
+ 0.052125416696071625,
338
+ 0.9483603239059448,
339
+ 0.16254939138889313,
340
+ 0.7590736150741577
341
+ ],
342
+ "std": [
343
+ 0.0638059601187706,
344
+ 0.06317952275276184,
345
+ 0.11073730885982513,
346
+ 0.31736457347869873,
347
+ 0.13928908109664917,
348
+ 0.2184142768383026,
349
+ 0.22394300997257233,
350
+ 0.08008279651403427,
351
+ 0.13921108841896057,
352
+ 0.785220742225647,
353
+ 0.05687877535820007,
354
+ 0.059404969215393066,
355
+ 0.1170634776353836,
356
+ 0.3238433599472046,
357
+ 0.14080215990543365,
358
+ 0.2507486343383789,
359
+ 0.21644321084022522,
360
+ 0.07544828206300735,
361
+ 0.1375824511051178,
362
+ 0.6510355472564697
363
+ ],
364
+ "max": [
365
+ 0.5681452751159668,
366
+ 0.2437673658132553,
367
+ 0.45541316270828247,
368
+ 0.9999293088912964,
369
+ 0.523757815361023,
370
+ 0.4592168927192688,
371
+ 0.7756927013397217,
372
+ 0.9999935030937195,
373
+ 0.2805824279785156,
374
+ 1.0,
375
+ 0.5600330233573914,
376
+ 0.3342031240463257,
377
+ 0.4682213366031647,
378
+ 0.9998393058776855,
379
+ 0.7949740886688232,
380
+ 0.1664249449968338,
381
+ 0.9131186604499817,
382
+ 0.9999967813491821,
383
+ 0.7936055064201355,
384
+ 1.0
385
+ ],
386
+ "min": [
387
+ 0.1517709195613861,
388
+ -0.2900286316871643,
389
+ -0.07412093877792358,
390
+ -0.4022133946418762,
391
+ -0.7361933588981628,
392
+ -0.9999988079071045,
393
+ -0.9935019016265869,
394
+ 0.10709662735462189,
395
+ -0.8023554682731628,
396
+ -1.0,
397
+ 0.15366072952747345,
398
+ -0.23686714470386505,
399
+ 0.0008372184820473194,
400
+ -0.5509981513023376,
401
+ -0.35234102606773376,
402
+ -0.999956488609314,
403
+ -0.5318384766578674,
404
+ 0.3388061225414276,
405
+ -0.27330997586250305,
406
+ -1.0
407
+ ],
408
+ "q01": [
409
+ 0.21054682105779648,
410
+ -0.1866426882147789,
411
+ 0.008138886513188495,
412
+ -0.19710821226239203,
413
+ -0.5368945515155792,
414
+ -0.9981186389923096,
415
+ -0.6956261324882507,
416
+ 0.6267582887411117,
417
+ -0.5600040704011917,
418
+ -1.0,
419
+ 0.2190245844423771,
420
+ -0.15968348175287247,
421
+ 0.025033411756157874,
422
+ -0.23832830414175987,
423
+ -0.2097599548101425,
424
+ -0.9988620406389237,
425
+ -0.4039672353863716,
426
+ 0.6080100274085999,
427
+ -0.19206354618072508,
428
+ -1.0
429
+ ],
430
+ "q99": [
431
+ 0.5033414244651794,
432
+ 0.16928535521030416,
433
+ 0.41566276580095285,
434
+ 0.9899059218168258,
435
+ 0.15462822496891018,
436
+ 0.03764873944222882,
437
+ 0.4657947558164549,
438
+ 0.9995575082302094,
439
+ 0.12326683558523567,
440
+ 1.0,
441
+ 0.4801343524456022,
442
+ 0.1795493066310881,
443
+ 0.4235989159345625,
444
+ 0.9913575077056883,
445
+ 0.5356137681007356,
446
+ 0.044951977618036626,
447
+ 0.7084567189216593,
448
+ 0.9992782145738601,
449
+ 0.6150667482614517,
450
+ 1.0
451
+ ],
452
+ "mask": [
453
+ true,
454
+ true,
455
+ true,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ true,
464
+ true,
465
+ true,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false
473
+ ]
474
+ },
475
+ "proprio": {
476
+ "mean": [
477
+ 0.33022773265838623,
478
+ -0.022152910009026527,
479
+ 0.15129819512367249,
480
+ 0.2713927626609802,
481
+ -0.12320420891046524,
482
+ -0.8610075116157532,
483
+ -0.11408627033233643,
484
+ 0.939403772354126,
485
+ -0.15196889638900757,
486
+ 0.565762460231781,
487
+ 0.3260097801685333,
488
+ 0.01288874913007021,
489
+ 0.17190413177013397,
490
+ 0.3495618999004364,
491
+ 0.11379288882017136,
492
+ -0.8095750212669373,
493
+ 0.032405924052000046,
494
+ 0.9490125179290771,
495
+ 0.14386717975139618,
496
+ 0.7014499306678772
497
+ ],
498
+ "std": [
499
+ 0.0579490028321743,
500
+ 0.07013536244630814,
501
+ 0.09934848546981812,
502
+ 0.3117230236530304,
503
+ 0.13217779994010925,
504
+ 0.23493210971355438,
505
+ 0.2282971888780594,
506
+ 0.0823960080742836,
507
+ 0.1500341296195984,
508
+ 0.8515227437019348,
509
+ 0.04813272878527641,
510
+ 0.06810668110847473,
511
+ 0.10921779274940491,
512
+ 0.34190261363983154,
513
+ 0.146615669131279,
514
+ 0.26654887199401855,
515
+ 0.22192324697971344,
516
+ 0.08192051947116852,
517
+ 0.147200345993042,
518
+ 0.7158040404319763
519
+ ],
520
+ "max": [
521
+ 0.559941291809082,
522
+ 0.26086756587028503,
523
+ 0.4504527747631073,
524
+ 0.9999247789382935,
525
+ 0.4198993146419525,
526
+ 0.3512286841869354,
527
+ 0.7522457242012024,
528
+ 1.0,
529
+ 0.8956095576286316,
530
+ 1.470957636833191,
531
+ 0.5437091588973999,
532
+ 0.32627788186073303,
533
+ 0.4945259988307953,
534
+ 0.9998428821563721,
535
+ 0.7737792730331421,
536
+ 0.4633983373641968,
537
+ 0.9018308520317078,
538
+ 1.0,
539
+ 0.9907073378562927,
540
+ 1.361535668373108
541
+ ],
542
+ "min": [
543
+ 0.16681896150112152,
544
+ -0.20499344170093536,
545
+ -0.0030731656588613987,
546
+ -0.4872298836708069,
547
+ -0.6995252966880798,
548
+ -0.999997615814209,
549
+ -0.988165020942688,
550
+ 0.14152538776397705,
551
+ -0.8483264446258545,
552
+ -1.2196638584136963,
553
+ 0.14598572254180908,
554
+ -0.2277291864156723,
555
+ 0.004666368011385202,
556
+ -0.5699886679649353,
557
+ -0.40678924322128296,
558
+ -0.9999999403953552,
559
+ -0.6972882151603699,
560
+ 0.13462646305561066,
561
+ -0.643044650554657,
562
+ -1.164451003074646
563
+ ],
564
+ "q01": [
565
+ 0.2053149801492691,
566
+ -0.17586381256580352,
567
+ 0.015469378884881736,
568
+ -0.2516648331284523,
569
+ -0.5193796420097351,
570
+ -0.9995058274269104,
571
+ -0.7092818850278855,
572
+ 0.608681161403656,
573
+ -0.578884813785553,
574
+ -1.1618710005283355,
575
+ 0.21638940930366515,
576
+ -0.1691040216386318,
577
+ 0.011891756923869252,
578
+ -0.29012590169906616,
579
+ -0.20126488715410232,
580
+ -0.9995589327812194,
581
+ -0.49963704913854595,
582
+ 0.533765652179718,
583
+ -0.18726778730750085,
584
+ -1.082753186225891
585
+ ],
586
+ "q99": [
587
+ 0.5071819436550137,
588
+ 0.165744510143995,
589
+ 0.40272374808788297,
590
+ 0.995180070400238,
591
+ 0.16266889929771197,
592
+ 0.09040380395948588,
593
+ 0.5001266032457347,
594
+ 0.9997656464576721,
595
+ 0.10759550034999843,
596
+ 1.4176189756393425,
597
+ 0.47452601760625834,
598
+ 0.1839943121373646,
599
+ 0.40895662158727647,
600
+ 0.995180070400238,
601
+ 0.5622373461723318,
602
+ 0.07441098906099738,
603
+ 0.7114433652162524,
604
+ 0.999856880903244,
605
+ 0.5974926966428754,
606
+ 1.321595377922058
607
+ ],
608
+ "mask": [
609
+ true,
610
+ true,
611
+ true,
612
+ false,
613
+ false,
614
+ false,
615
+ false,
616
+ false,
617
+ false,
618
+ false,
619
+ true,
620
+ true,
621
+ true,
622
+ false,
623
+ false,
624
+ false,
625
+ false,
626
+ false,
627
+ false,
628
+ false
629
+ ]
630
+ },
631
+ "num_transitions": 11572,
632
+ "num_trajectories": 50
633
+ }
634
+ }
2e-5/twinvla-scratch-aloha_lift_box/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5291b6de1ebdbaf84f2a9addb3ee36fb3e83050b1196984e4534ece412e74aab
3
+ size 2889536104
2e-5/twinvla-scratch-aloha_lift_box/singlevla_config/config.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/data5/jellyho/twinvla-checkpoints/Eagle2_1B-Scratch-DiT-B",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "None",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "denoiser": "FM",
14
+ "diffusion_batch": 32,
15
+ "dit_size": "DiT-B",
16
+ "downsample_ratio": 0.5,
17
+ "dynamic_image_size": true,
18
+ "efficient_loss": true,
19
+ "enable_cfg": true,
20
+ "force_image_size": 448,
21
+ "global_normalization": true,
22
+ "image_size": 448,
23
+ "keep_aspect_ratio": false,
24
+ "knowledge_insulation": false,
25
+ "llm_config": {
26
+ "_attn_implementation_autoset": true,
27
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
28
+ "add_cross_attention": false,
29
+ "architectures": [
30
+ "Qwen2ForCausalLM"
31
+ ],
32
+ "attention_dropout": 0.0,
33
+ "auto_map": {
34
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
35
+ "AutoModel": "modeling_qwen2.Qwen2Model",
36
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
37
+ },
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 896,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4864,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 14,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": 32768,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "transformers_version": "4.50.0.dev0",
108
+ "typical_p": 1.0,
109
+ "use_bfloat16": false,
110
+ "use_cache": false,
111
+ "use_sliding_window": false,
112
+ "vocab_size": 151674
113
+ },
114
+ "loss_version": "v4",
115
+ "max_dynamic_patch": 12,
116
+ "min_dynamic_patch": 1,
117
+ "mlp_checkpoint": true,
118
+ "model_path": "nvidia/Eagle2-1B",
119
+ "model_type": "Eagle2_1BVLA",
120
+ "modeling": "denoising",
121
+ "normalization": "quantile",
122
+ "num_readouts": 1,
123
+ "pad2square": false,
124
+ "pre_feature_reduction": false,
125
+ "ps_version": "v2",
126
+ "readout_token_as_eos": true,
127
+ "return_text": null,
128
+ "select_layer": -1,
129
+ "state_dim": 10,
130
+ "stopping_token": "|",
131
+ "template": "qwen2-chat",
132
+ "test_denoising_steps": 10,
133
+ "torch_dtype": "bfloat16",
134
+ "train_denoising_steps": 100,
135
+ "transformers_version": null,
136
+ "use_backbone_lora": 0,
137
+ "use_llm_lora": 0,
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "_attn_implementation_autoset": true,
141
+ "_name_or_path": "",
142
+ "add_cross_attention": false,
143
+ "architectures": [
144
+ "SiglipVisionModel"
145
+ ],
146
+ "attention_dropout": 0.0,
147
+ "auto_map": {
148
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
149
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
150
+ },
151
+ "bad_words_ids": null,
152
+ "begin_suppress_tokens": null,
153
+ "bos_token_id": null,
154
+ "chunk_size_feed_forward": 0,
155
+ "cross_attention_hidden_size": null,
156
+ "decoder_start_token_id": null,
157
+ "diversity_penalty": 0.0,
158
+ "do_sample": false,
159
+ "drop_path_rate": 0.1,
160
+ "early_stopping": false,
161
+ "encoder_no_repeat_ngram_size": 0,
162
+ "eos_token_id": null,
163
+ "exponential_decay_length_penalty": null,
164
+ "finetuning_task": null,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "hidden_act": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "id2label": {
170
+ "0": "LABEL_0",
171
+ "1": "LABEL_1"
172
+ },
173
+ "image_size": 448,
174
+ "intermediate_size": 4304,
175
+ "is_decoder": false,
176
+ "is_encoder_decoder": false,
177
+ "label2id": {
178
+ "LABEL_0": 0,
179
+ "LABEL_1": 1
180
+ },
181
+ "layer_norm_eps": 1e-06,
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "siglip_vision_model",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_attention_heads": 16,
188
+ "num_beam_groups": 1,
189
+ "num_beams": 1,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "num_image_tokens": 1024,
193
+ "num_return_sequences": 1,
194
+ "output_attentions": false,
195
+ "output_hidden_states": false,
196
+ "output_scores": false,
197
+ "pad_token_id": null,
198
+ "patch_size": 14,
199
+ "prefix": null,
200
+ "problem_type": null,
201
+ "projection_dim": 2048,
202
+ "projector_hidden_act": "gelu_fast",
203
+ "pruned_heads": {},
204
+ "remove_invalid_values": false,
205
+ "repetition_penalty": 1.0,
206
+ "return_dict": true,
207
+ "return_dict_in_generate": false,
208
+ "sep_token_id": null,
209
+ "suppress_tokens": null,
210
+ "task_specific_params": null,
211
+ "temperature": 1.0,
212
+ "tf_legacy_loss": false,
213
+ "tie_encoder_decoder": false,
214
+ "tie_word_embeddings": true,
215
+ "tokenizer_class": null,
216
+ "top_k": 50,
217
+ "top_p": 1.0,
218
+ "torch_dtype": "bfloat16",
219
+ "torchscript": false,
220
+ "transformers_version": "4.50.0.dev0",
221
+ "typical_p": 1.0,
222
+ "use_bfloat16": false,
223
+ "vision_use_head": false
224
+ },
225
+ "vocab_size": 151674,
226
+ "vocab_start": null
227
+ }
2e-5/twinvla-scratch-aloha_lift_box/training_states.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c17b8c72a7a6ef1706774e683a3696ca8fbcfeaec772981c175ccce137870553
3
+ size 4126124658