jellyho commited on
Commit
401b0db
·
verified ·
1 Parent(s): 56b057c

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 10,
3
+ "action_head": "DiT",
4
+ "action_len": 20,
5
+ "architectures": [
6
+ "Eagle2_1BTwinVLA"
7
+ ],
8
+ "attn_reweighting": true,
9
+ "denoiser": "FM",
10
+ "dit_scratch": false,
11
+ "enable_joint_attn": true,
12
+ "enable_moe": true,
13
+ "global_normalization": false,
14
+ "hz_interpolate": null,
15
+ "interpolate_gripper": false,
16
+ "knowledge_insulation": false,
17
+ "model_path": null,
18
+ "model_type": "Eagle2_1BTwinVLA",
19
+ "modeling": "denoising",
20
+ "normalization": "quantile",
21
+ "num_readouts": 1,
22
+ "readout_token_as_eos": false,
23
+ "share_decoder": true,
24
+ "share_embed_tokens": true,
25
+ "share_vision": true,
26
+ "singlevla_config": {
27
+ "_attn_implementation_autoset": false,
28
+ "_attn_implementation_internal": null,
29
+ "_commit_hash": "428b4d21376ff21d70b8b8830db6f6ab3907bfd8",
30
+ "_name_or_path": "jellyho/TwinVLA",
31
+ "action_dim": 10,
32
+ "action_head": "DiT",
33
+ "action_head_hidden_dim": 1024,
34
+ "action_len": 20,
35
+ "add_cross_attention": false,
36
+ "aggregation": "false",
37
+ "architectures": [
38
+ "Eagle2_1BVLA"
39
+ ],
40
+ "auto_map": {},
41
+ "bad_words_ids": null,
42
+ "begin_suppress_tokens": null,
43
+ "bos_token_id": null,
44
+ "chunk_size_feed_forward": 0,
45
+ "cross_attention_hidden_size": null,
46
+ "dataset_statistics_path": null,
47
+ "decoder_start_token_id": null,
48
+ "denoiser": "FM",
49
+ "diffusion_batch": 32,
50
+ "dit_size": "DiT-B",
51
+ "diversity_penalty": 0.0,
52
+ "do_sample": false,
53
+ "downsample_ratio": 0.5,
54
+ "dynamic_image_size": true,
55
+ "early_stopping": false,
56
+ "efficient_loss": true,
57
+ "enable_cfg": true,
58
+ "encoder_no_repeat_ngram_size": 0,
59
+ "eos_token_id": null,
60
+ "exponential_decay_length_penalty": null,
61
+ "finetuning_task": null,
62
+ "force_image_size": 448,
63
+ "forced_bos_token_id": null,
64
+ "forced_eos_token_id": null,
65
+ "global_normalization": true,
66
+ "hz_interpolate": 20,
67
+ "id2label": {
68
+ "0": "LABEL_0",
69
+ "1": "LABEL_1"
70
+ },
71
+ "image_size": 224,
72
+ "interpolate_gripper": false,
73
+ "is_decoder": false,
74
+ "is_encoder_decoder": false,
75
+ "keep_aspect_ratio": false,
76
+ "knowledge_insulation": false,
77
+ "label2id": {
78
+ "LABEL_0": 0,
79
+ "LABEL_1": 1
80
+ },
81
+ "length_penalty": 1.0,
82
+ "llm_config": {
83
+ "_attn_implementation_autoset": true,
84
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
85
+ "add_cross_attention": false,
86
+ "architectures": [
87
+ "Qwen2ForCausalLM"
88
+ ],
89
+ "attention_dropout": 0.0,
90
+ "auto_map": {
91
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
92
+ "AutoModel": "modeling_qwen2.Qwen2Model",
93
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
94
+ },
95
+ "bad_words_ids": null,
96
+ "begin_suppress_tokens": null,
97
+ "bos_token_id": 151643,
98
+ "chunk_size_feed_forward": 0,
99
+ "cross_attention_hidden_size": null,
100
+ "decoder_start_token_id": null,
101
+ "diversity_penalty": 0.0,
102
+ "do_sample": false,
103
+ "early_stopping": false,
104
+ "encoder_no_repeat_ngram_size": 0,
105
+ "eos_token_id": 151645,
106
+ "exponential_decay_length_penalty": null,
107
+ "finetuning_task": null,
108
+ "forced_bos_token_id": null,
109
+ "forced_eos_token_id": null,
110
+ "hidden_act": "silu",
111
+ "hidden_size": 896,
112
+ "id2label": {
113
+ "0": "LABEL_0",
114
+ "1": "LABEL_1"
115
+ },
116
+ "initializer_range": 0.02,
117
+ "intermediate_size": 4864,
118
+ "is_decoder": false,
119
+ "is_encoder_decoder": false,
120
+ "label2id": {
121
+ "LABEL_0": 0,
122
+ "LABEL_1": 1
123
+ },
124
+ "length_penalty": 1.0,
125
+ "max_length": 20,
126
+ "max_position_embeddings": 32768,
127
+ "max_window_layers": 21,
128
+ "min_length": 0,
129
+ "model_type": "qwen2",
130
+ "no_repeat_ngram_size": 0,
131
+ "num_attention_heads": 14,
132
+ "num_beam_groups": 1,
133
+ "num_beams": 1,
134
+ "num_hidden_layers": 24,
135
+ "num_key_value_heads": 2,
136
+ "num_return_sequences": 1,
137
+ "output_attentions": false,
138
+ "output_hidden_states": false,
139
+ "output_scores": false,
140
+ "pad_token_id": null,
141
+ "prefix": null,
142
+ "problem_type": null,
143
+ "pruned_heads": {},
144
+ "remove_invalid_values": false,
145
+ "repetition_penalty": 1.0,
146
+ "return_dict": true,
147
+ "return_dict_in_generate": false,
148
+ "rms_norm_eps": 1e-06,
149
+ "rope_scaling": null,
150
+ "rope_theta": 1000000.0,
151
+ "sep_token_id": null,
152
+ "sliding_window": 32768,
153
+ "suppress_tokens": null,
154
+ "task_specific_params": null,
155
+ "temperature": 1.0,
156
+ "tf_legacy_loss": false,
157
+ "tie_encoder_decoder": false,
158
+ "tie_word_embeddings": true,
159
+ "tokenizer_class": null,
160
+ "top_k": 50,
161
+ "top_p": 1.0,
162
+ "torch_dtype": "bfloat16",
163
+ "torchscript": false,
164
+ "transformers_version": "4.50.0.dev0",
165
+ "typical_p": 1.0,
166
+ "use_bfloat16": false,
167
+ "use_cache": false,
168
+ "use_sliding_window": false,
169
+ "vocab_size": 151674
170
+ },
171
+ "loss_version": "v4",
172
+ "max_dynamic_patch": 12,
173
+ "max_length": 20,
174
+ "min_dynamic_patch": 1,
175
+ "min_length": 0,
176
+ "mlp_checkpoint": true,
177
+ "model_path": "nvidia/Eagle2-1B",
178
+ "model_type": "Eagle2_1BVLA",
179
+ "modeling": "denoising",
180
+ "no_repeat_ngram_size": 0,
181
+ "normalization": "quantile",
182
+ "num_beam_groups": 1,
183
+ "num_beams": 1,
184
+ "num_readouts": 1,
185
+ "num_return_sequences": 1,
186
+ "output_attentions": false,
187
+ "output_hidden_states": false,
188
+ "output_scores": false,
189
+ "pad2square": false,
190
+ "pad_token_id": null,
191
+ "pre_feature_reduction": false,
192
+ "prefix": null,
193
+ "problem_type": null,
194
+ "pruned_heads": {},
195
+ "ps_version": "v2",
196
+ "readout_token_as_eos": false,
197
+ "remove_invalid_values": false,
198
+ "repetition_penalty": 1.0,
199
+ "return_dict": true,
200
+ "return_dict_in_generate": false,
201
+ "return_text": null,
202
+ "select_layer": -1,
203
+ "sep_token_id": null,
204
+ "state_dim": 10,
205
+ "stopping_token": "|",
206
+ "suppress_tokens": null,
207
+ "task_specific_params": null,
208
+ "temperature": 1.0,
209
+ "template": "qwen2-chat",
210
+ "test_denoising_steps": 10,
211
+ "tf_legacy_loss": false,
212
+ "tie_encoder_decoder": false,
213
+ "tie_word_embeddings": true,
214
+ "tokenizer_class": null,
215
+ "top_k": 50,
216
+ "top_p": 1.0,
217
+ "torch_dtype": "bfloat16",
218
+ "torchscript": false,
219
+ "train_denoising_steps": 100,
220
+ "typical_p": 1.0,
221
+ "use_backbone_lora": 0,
222
+ "use_bfloat16": false,
223
+ "use_llm_lora": 0,
224
+ "use_thumbnail": true,
225
+ "vision_config": {
226
+ "_attn_implementation_autoset": true,
227
+ "_name_or_path": "",
228
+ "add_cross_attention": false,
229
+ "architectures": [
230
+ "SiglipVisionModel"
231
+ ],
232
+ "attention_dropout": 0.0,
233
+ "auto_map": {
234
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
235
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
236
+ },
237
+ "bad_words_ids": null,
238
+ "begin_suppress_tokens": null,
239
+ "bos_token_id": null,
240
+ "chunk_size_feed_forward": 0,
241
+ "cross_attention_hidden_size": null,
242
+ "decoder_start_token_id": null,
243
+ "diversity_penalty": 0.0,
244
+ "do_sample": false,
245
+ "drop_path_rate": 0.1,
246
+ "early_stopping": false,
247
+ "encoder_no_repeat_ngram_size": 0,
248
+ "eos_token_id": null,
249
+ "exponential_decay_length_penalty": null,
250
+ "finetuning_task": null,
251
+ "forced_bos_token_id": null,
252
+ "forced_eos_token_id": null,
253
+ "hidden_act": "gelu_pytorch_tanh",
254
+ "hidden_size": 1152,
255
+ "id2label": {
256
+ "0": "LABEL_0",
257
+ "1": "LABEL_1"
258
+ },
259
+ "image_size": 448,
260
+ "intermediate_size": 4304,
261
+ "is_decoder": false,
262
+ "is_encoder_decoder": false,
263
+ "label2id": {
264
+ "LABEL_0": 0,
265
+ "LABEL_1": 1
266
+ },
267
+ "layer_norm_eps": 1e-06,
268
+ "length_penalty": 1.0,
269
+ "max_length": 20,
270
+ "min_length": 0,
271
+ "model_type": "siglip_vision_model",
272
+ "no_repeat_ngram_size": 0,
273
+ "num_attention_heads": 16,
274
+ "num_beam_groups": 1,
275
+ "num_beams": 1,
276
+ "num_channels": 3,
277
+ "num_hidden_layers": 27,
278
+ "num_image_tokens": 1024,
279
+ "num_return_sequences": 1,
280
+ "output_attentions": false,
281
+ "output_hidden_states": false,
282
+ "output_scores": false,
283
+ "pad_token_id": null,
284
+ "patch_size": 14,
285
+ "prefix": null,
286
+ "problem_type": null,
287
+ "projection_dim": 2048,
288
+ "projector_hidden_act": "gelu_fast",
289
+ "pruned_heads": {},
290
+ "remove_invalid_values": false,
291
+ "repetition_penalty": 1.0,
292
+ "return_dict": true,
293
+ "return_dict_in_generate": false,
294
+ "sep_token_id": null,
295
+ "suppress_tokens": null,
296
+ "task_specific_params": null,
297
+ "temperature": 1.0,
298
+ "tf_legacy_loss": false,
299
+ "tie_encoder_decoder": false,
300
+ "tie_word_embeddings": true,
301
+ "tokenizer_class": null,
302
+ "top_k": 50,
303
+ "top_p": 1.0,
304
+ "torch_dtype": "bfloat16",
305
+ "torchscript": false,
306
+ "transformers_version": "4.50.0.dev0",
307
+ "typical_p": 1.0,
308
+ "use_bfloat16": false,
309
+ "vision_use_head": false
310
+ },
311
+ "vocab_size": 151674,
312
+ "vocab_start": null
313
+ },
314
+ "singlevla_config_path": "jellyho/TwinVLA",
315
+ "singlevla_pretrained_path": null,
316
+ "state_dim": 10,
317
+ "torch_dtype": "bfloat16",
318
+ "transformers_version": "4.50.0.dev0"
319
+ }
dataset_statistics.json ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "anubis_put_into_pot": {
3
+ "action": {
4
+ "mean": [
5
+ 0.3237164318561554,
6
+ 0.19323241710662842,
7
+ 0.17074844241142273,
8
+ -0.40655800700187683,
9
+ -0.8302565813064575,
10
+ 0.24439944326877594,
11
+ -0.6347360610961914,
12
+ 0.08541738986968994,
13
+ -0.6577622890472412,
14
+ -0.14733541011810303,
15
+ 0.3448340892791748,
16
+ -0.08713279664516449,
17
+ 0.16664977371692657,
18
+ 0.7354577779769897,
19
+ -0.4887841045856476,
20
+ 0.21303653717041016,
21
+ -0.09639851748943329,
22
+ -0.553676187992096,
23
+ -0.6703258156776428,
24
+ 0.6337327361106873
25
+ ],
26
+ "std": [
27
+ 0.047731053084135056,
28
+ 0.06773686408996582,
29
+ 0.03273946791887283,
30
+ 0.1042996421456337,
31
+ 0.11049824208021164,
32
+ 0.25012287497520447,
33
+ 0.21405042707920074,
34
+ 0.27174288034439087,
35
+ 0.19364885985851288,
36
+ 0.9816686511039734,
37
+ 0.07926628738641739,
38
+ 0.06496380269527435,
39
+ 0.05034809187054634,
40
+ 0.17572304606437683,
41
+ 0.33217287063598633,
42
+ 0.18327680230140686,
43
+ 0.19470512866973877,
44
+ 0.3387109041213989,
45
+ 0.2866652011871338,
46
+ 0.7686191201210022
47
+ ],
48
+ "max": [
49
+ 0.3899821639060974,
50
+ 0.40939435362815857,
51
+ 0.26672402024269104,
52
+ -0.08145993202924728,
53
+ -0.36822038888931274,
54
+ 0.8622857928276062,
55
+ 0.13092871010303497,
56
+ 0.7102192640304565,
57
+ -0.20073336362838745,
58
+ 1.0,
59
+ 0.549041748046875,
60
+ 0.13987885415554047,
61
+ 0.26468726992607117,
62
+ 0.9996138215065002,
63
+ 0.7353013753890991,
64
+ 0.7054949402809143,
65
+ 0.8297774195671082,
66
+ 0.3165386915206909,
67
+ 0.03685024008154869,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.14642871916294098,
72
+ 0.1112271100282669,
73
+ 0.10428614169359207,
74
+ -0.7812724709510803,
75
+ -0.9828734397888184,
76
+ -0.26845887303352356,
77
+ -0.9568612575531006,
78
+ -0.5920525789260864,
79
+ -0.9996806383132935,
80
+ -1.0,
81
+ 0.1487334966659546,
82
+ -0.16685600578784943,
83
+ 0.057327114045619965,
84
+ 0.2669093906879425,
85
+ -0.9613909125328064,
86
+ -0.4603269398212433,
87
+ -0.6394798159599304,
88
+ -0.999998152256012,
89
+ -0.9998581409454346,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.18516795575618744,
94
+ 0.12451772257685662,
95
+ 0.11509101435542107,
96
+ -0.6908658498525619,
97
+ -0.9580670577287674,
98
+ -0.145108138024807,
99
+ -0.9188619738817215,
100
+ -0.5034985786676407,
101
+ -0.9934796422719956,
102
+ -1.0,
103
+ 0.19577301174402237,
104
+ -0.16102446094155312,
105
+ 0.06668414495885372,
106
+ 0.3671954298019409,
107
+ -0.9094176268577576,
108
+ -0.2629955995082855,
109
+ -0.56613784968853,
110
+ -0.9951579666137695,
111
+ -0.9977369052171707,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.38379789978265755,
116
+ 0.3623714971542357,
117
+ 0.24623780891299243,
118
+ -0.13757481276989036,
119
+ -0.495397853255272,
120
+ 0.8110066300630565,
121
+ -0.08928897663950969,
122
+ 0.6101795786619182,
123
+ -0.2821595719456673,
124
+ 1.0,
125
+ 0.5267843568325042,
126
+ 0.08202718742191788,
127
+ 0.2494549164175987,
128
+ 0.9867255806922912,
129
+ 0.4620909711718558,
130
+ 0.5912377506494515,
131
+ 0.5964654320478437,
132
+ 0.16175421819090668,
133
+ -0.044016047082841944,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.32505786418914795,
162
+ 0.19214771687984467,
163
+ 0.1698455959558487,
164
+ -0.4101601243019104,
165
+ -0.8288484215736389,
166
+ 0.24628858268260956,
167
+ -0.639862060546875,
168
+ 0.08716083317995071,
169
+ -0.6574246287345886,
170
+ 0.1716156154870987,
171
+ 0.34529370069503784,
172
+ -0.08818725496530533,
173
+ 0.16610461473464966,
174
+ 0.7380656003952026,
175
+ -0.4896639585494995,
176
+ 0.21104076504707336,
177
+ -0.09965088963508606,
178
+ -0.5533883571624756,
179
+ -0.6711149215698242,
180
+ 0.7709687948226929
181
+ ],
182
+ "std": [
183
+ 0.04416600242257118,
184
+ 0.06409642845392227,
185
+ 0.030034804716706276,
186
+ 0.10197774320840836,
187
+ 0.11434831470251083,
188
+ 0.2462652176618576,
189
+ 0.2088947892189026,
190
+ 0.26520225405693054,
191
+ 0.19183579087257385,
192
+ 0.6926007866859436,
193
+ 0.07798658311367035,
194
+ 0.06319615244865417,
195
+ 0.049226533621549606,
196
+ 0.17331941425800323,
197
+ 0.3265608549118042,
198
+ 0.1851140856742859,
199
+ 0.1893804967403412,
200
+ 0.3403646647930145,
201
+ 0.28587019443511963,
202
+ 0.4660526514053345
203
+ ],
204
+ "max": [
205
+ 0.38955727219581604,
206
+ 0.3466160297393799,
207
+ 0.24623136222362518,
208
+ -0.08630989491939545,
209
+ -0.3043893277645111,
210
+ 0.8533033132553101,
211
+ 0.0015313735930249095,
212
+ 0.7063275575637817,
213
+ -0.20640863478183746,
214
+ 1.003541111946106,
215
+ 0.5462295413017273,
216
+ 0.07705596089363098,
217
+ 0.262810081243515,
218
+ 0.9995301365852356,
219
+ 0.5431351661682129,
220
+ 0.7067950367927551,
221
+ 0.6764119267463684,
222
+ 0.30139848589897156,
223
+ 0.025077033787965775,
224
+ 1.014369249343872
225
+ ],
226
+ "min": [
227
+ 0.1910482496023178,
228
+ 0.11194601655006409,
229
+ 0.10973816365003586,
230
+ -0.7810142636299133,
231
+ -0.9814901351928711,
232
+ -0.21161042153835297,
233
+ -0.9549692869186401,
234
+ -0.5408061146736145,
235
+ -0.999822199344635,
236
+ -0.45464301109313965,
237
+ 0.1878221333026886,
238
+ -0.16589593887329102,
239
+ 0.06183049455285072,
240
+ 0.2736285626888275,
241
+ -0.960009753704071,
242
+ -0.45437952876091003,
243
+ -0.6288766860961914,
244
+ -0.999998152256012,
245
+ -0.9999145269393921,
246
+ -0.49795544147491455
247
+ ],
248
+ "q01": [
249
+ 0.21300651252269745,
250
+ 0.12324244387447834,
251
+ 0.11845191389322281,
252
+ -0.6922157597541809,
253
+ -0.9584890645742417,
254
+ -0.1259118865430355,
255
+ -0.9182252979278565,
256
+ -0.46052598804235456,
257
+ -0.9951841980218887,
258
+ -0.44922900199890137,
259
+ 0.2136411526799202,
260
+ -0.16165925472974776,
261
+ 0.06899189889431,
262
+ 0.3749441310763359,
263
+ -0.9038005238771438,
264
+ -0.2544199496507645,
265
+ -0.5673457723855972,
266
+ -0.9951095831394196,
267
+ -0.9976086699962616,
268
+ -0.3860650062561035
269
+ ],
270
+ "q99": [
271
+ 0.38241307526826857,
272
+ 0.3363050788640975,
273
+ 0.23329319134354592,
274
+ -0.14138195231556894,
275
+ -0.46486153453588586,
276
+ 0.8059070360660553,
277
+ -0.0950157642364502,
278
+ 0.6086657553911209,
279
+ -0.28258277803659465,
280
+ 1.0017364025115967,
281
+ 0.5247526270151138,
282
+ 0.06677914083003997,
283
+ 0.24800570905208577,
284
+ 0.9867493206262589,
285
+ 0.43939185500144884,
286
+ 0.5892856574058531,
287
+ 0.5718294656276701,
288
+ 0.15633342444896692,
289
+ -0.04927438985556463,
290
+ 1.0017364025115967
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 12330,
316
+ "num_trajectories": 54
317
+ }
318
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c5793a679c97b60e4b4aa8ed772e5dc799e136aa661b6b66af2e3621134c96c
3
+ size 2889539864
singlevla_config/config.json ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "428b4d21376ff21d70b8b8830db6f6ab3907bfd8",
3
+ "_name_or_path": "jellyho/TwinVLA",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "false",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "dataset_statistics_path": null,
14
+ "denoiser": "FM",
15
+ "diffusion_batch": 32,
16
+ "dit_size": "DiT-B",
17
+ "downsample_ratio": 0.5,
18
+ "dynamic_image_size": true,
19
+ "efficient_loss": true,
20
+ "enable_cfg": true,
21
+ "force_image_size": 448,
22
+ "global_normalization": true,
23
+ "hz_interpolate": 20,
24
+ "image_size": 224,
25
+ "interpolate_gripper": false,
26
+ "keep_aspect_ratio": false,
27
+ "knowledge_insulation": false,
28
+ "llm_config": {
29
+ "_attn_implementation_autoset": true,
30
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
31
+ "add_cross_attention": false,
32
+ "architectures": [
33
+ "Qwen2ForCausalLM"
34
+ ],
35
+ "attention_dropout": 0.0,
36
+ "auto_map": {
37
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
38
+ "AutoModel": "modeling_qwen2.Qwen2Model",
39
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
40
+ },
41
+ "bad_words_ids": null,
42
+ "begin_suppress_tokens": null,
43
+ "bos_token_id": 151643,
44
+ "chunk_size_feed_forward": 0,
45
+ "cross_attention_hidden_size": null,
46
+ "decoder_start_token_id": null,
47
+ "diversity_penalty": 0.0,
48
+ "do_sample": false,
49
+ "early_stopping": false,
50
+ "encoder_no_repeat_ngram_size": 0,
51
+ "eos_token_id": 151645,
52
+ "exponential_decay_length_penalty": null,
53
+ "finetuning_task": null,
54
+ "forced_bos_token_id": null,
55
+ "forced_eos_token_id": null,
56
+ "hidden_act": "silu",
57
+ "hidden_size": 896,
58
+ "id2label": {
59
+ "0": "LABEL_0",
60
+ "1": "LABEL_1"
61
+ },
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 4864,
64
+ "is_decoder": false,
65
+ "is_encoder_decoder": false,
66
+ "label2id": {
67
+ "LABEL_0": 0,
68
+ "LABEL_1": 1
69
+ },
70
+ "length_penalty": 1.0,
71
+ "max_length": 20,
72
+ "max_position_embeddings": 32768,
73
+ "max_window_layers": 21,
74
+ "min_length": 0,
75
+ "model_type": "qwen2",
76
+ "no_repeat_ngram_size": 0,
77
+ "num_attention_heads": 14,
78
+ "num_beam_groups": 1,
79
+ "num_beams": 1,
80
+ "num_hidden_layers": 24,
81
+ "num_key_value_heads": 2,
82
+ "num_return_sequences": 1,
83
+ "output_attentions": false,
84
+ "output_hidden_states": false,
85
+ "output_scores": false,
86
+ "pad_token_id": null,
87
+ "prefix": null,
88
+ "problem_type": null,
89
+ "pruned_heads": {},
90
+ "remove_invalid_values": false,
91
+ "repetition_penalty": 1.0,
92
+ "return_dict": true,
93
+ "return_dict_in_generate": false,
94
+ "rms_norm_eps": 1e-06,
95
+ "rope_scaling": null,
96
+ "rope_theta": 1000000.0,
97
+ "sep_token_id": null,
98
+ "sliding_window": 32768,
99
+ "suppress_tokens": null,
100
+ "task_specific_params": null,
101
+ "temperature": 1.0,
102
+ "tf_legacy_loss": false,
103
+ "tie_encoder_decoder": false,
104
+ "tie_word_embeddings": true,
105
+ "tokenizer_class": null,
106
+ "top_k": 50,
107
+ "top_p": 1.0,
108
+ "torch_dtype": "bfloat16",
109
+ "torchscript": false,
110
+ "transformers_version": "4.50.0.dev0",
111
+ "typical_p": 1.0,
112
+ "use_bfloat16": false,
113
+ "use_cache": false,
114
+ "use_sliding_window": false,
115
+ "vocab_size": 151674
116
+ },
117
+ "loss_version": "v4",
118
+ "max_dynamic_patch": 12,
119
+ "min_dynamic_patch": 1,
120
+ "mlp_checkpoint": true,
121
+ "model_path": "nvidia/Eagle2-1B",
122
+ "model_type": "Eagle2_1BVLA",
123
+ "modeling": "denoising",
124
+ "normalization": "quantile",
125
+ "num_readouts": 1,
126
+ "pad2square": false,
127
+ "pre_feature_reduction": false,
128
+ "ps_version": "v2",
129
+ "readout_token_as_eos": false,
130
+ "return_text": null,
131
+ "select_layer": -1,
132
+ "state_dim": 10,
133
+ "stopping_token": "|",
134
+ "template": "qwen2-chat",
135
+ "test_denoising_steps": 10,
136
+ "torch_dtype": "bfloat16",
137
+ "train_denoising_steps": 100,
138
+ "transformers_version": null,
139
+ "use_backbone_lora": 0,
140
+ "use_llm_lora": 0,
141
+ "use_thumbnail": true,
142
+ "vision_config": {
143
+ "_attn_implementation_autoset": true,
144
+ "_name_or_path": "",
145
+ "add_cross_attention": false,
146
+ "architectures": [
147
+ "SiglipVisionModel"
148
+ ],
149
+ "attention_dropout": 0.0,
150
+ "auto_map": {
151
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
152
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
153
+ },
154
+ "bad_words_ids": null,
155
+ "begin_suppress_tokens": null,
156
+ "bos_token_id": null,
157
+ "chunk_size_feed_forward": 0,
158
+ "cross_attention_hidden_size": null,
159
+ "decoder_start_token_id": null,
160
+ "diversity_penalty": 0.0,
161
+ "do_sample": false,
162
+ "drop_path_rate": 0.1,
163
+ "early_stopping": false,
164
+ "encoder_no_repeat_ngram_size": 0,
165
+ "eos_token_id": null,
166
+ "exponential_decay_length_penalty": null,
167
+ "finetuning_task": null,
168
+ "forced_bos_token_id": null,
169
+ "forced_eos_token_id": null,
170
+ "hidden_act": "gelu_pytorch_tanh",
171
+ "hidden_size": 1152,
172
+ "id2label": {
173
+ "0": "LABEL_0",
174
+ "1": "LABEL_1"
175
+ },
176
+ "image_size": 448,
177
+ "intermediate_size": 4304,
178
+ "is_decoder": false,
179
+ "is_encoder_decoder": false,
180
+ "label2id": {
181
+ "LABEL_0": 0,
182
+ "LABEL_1": 1
183
+ },
184
+ "layer_norm_eps": 1e-06,
185
+ "length_penalty": 1.0,
186
+ "max_length": 20,
187
+ "min_length": 0,
188
+ "model_type": "siglip_vision_model",
189
+ "no_repeat_ngram_size": 0,
190
+ "num_attention_heads": 16,
191
+ "num_beam_groups": 1,
192
+ "num_beams": 1,
193
+ "num_channels": 3,
194
+ "num_hidden_layers": 27,
195
+ "num_image_tokens": 1024,
196
+ "num_return_sequences": 1,
197
+ "output_attentions": false,
198
+ "output_hidden_states": false,
199
+ "output_scores": false,
200
+ "pad_token_id": null,
201
+ "patch_size": 14,
202
+ "prefix": null,
203
+ "problem_type": null,
204
+ "projection_dim": 2048,
205
+ "projector_hidden_act": "gelu_fast",
206
+ "pruned_heads": {},
207
+ "remove_invalid_values": false,
208
+ "repetition_penalty": 1.0,
209
+ "return_dict": true,
210
+ "return_dict_in_generate": false,
211
+ "sep_token_id": null,
212
+ "suppress_tokens": null,
213
+ "task_specific_params": null,
214
+ "temperature": 1.0,
215
+ "tf_legacy_loss": false,
216
+ "tie_encoder_decoder": false,
217
+ "tie_word_embeddings": true,
218
+ "tokenizer_class": null,
219
+ "top_k": 50,
220
+ "top_p": 1.0,
221
+ "torch_dtype": "bfloat16",
222
+ "torchscript": false,
223
+ "transformers_version": "4.50.0.dev0",
224
+ "typical_p": 1.0,
225
+ "use_bfloat16": false,
226
+ "vision_use_head": false
227
+ },
228
+ "vocab_size": 151674,
229
+ "vocab_start": null
230
+ }