JianZhangAI committed on
Commit
233bc55
·
verified ·
1 Parent(s): 51f364d

Delete libero_spatial_dit-1B_action_text-siglip

Browse files
libero_spatial_dit-1B_action_text-siglip/config.yaml DELETED
@@ -1,302 +0,0 @@
1
- run_name: libero_4_qwen3-4b_l1_regression_wrist_proprio_ft_ah_lora_r8_llm_bs108_20250811_215843
2
- seed: 6198
3
- epoch: null
4
- dry_run: false
5
- model:
6
- d_model: 2560
7
- n_heads: 32
8
- n_kv_heads: 8
9
- qkv_bias: false
10
- clip_qkv: null
11
- n_layers: 36
12
- mlp_ratio: 4
13
- mlp_hidden_size: 9728
14
- activation_type: llama_swiglu
15
- block_type: qwen
16
- block_group_size: 1
17
- rope: true
18
- rope_full_precision: true
19
- rope_theta: 1000000.0
20
- vision_backbone:
21
- image_model_type: openai
22
- image_default_input_size:
23
- - 336
24
- - 336
25
- image_patch_size: 14
26
- image_pos_patch_size: 14
27
- image_emb_dim: 1024
28
- image_num_heads: 16
29
- image_num_key_value_heads: 16
30
- image_num_layers: 23
31
- image_head_dim: 64
32
- image_mlp_dim: 4096
33
- image_mlp_activations: quick_gelu
34
- image_dropout_rate: 0.0
35
- image_num_pos: 577
36
- image_norm_eps: 1.0e-05
37
- attention_dropout: 0.0
38
- residual_dropout: 0.0
39
- initializer_range: 0.02
40
- fsdp_wrap: false
41
- resize_mode: default
42
- vit_load_path: /mnt/data/zhangjian/molmo/pretrained_image_encoders/vit-l-14-336.pt
43
- llm_load_path: /mnt/data/zhangjian/molmo/pretrained_llms/qwen3-4b.pt
44
- low_cpu_fsdp: true
45
- attention_type: sdpa
46
- float32_attention: true
47
- attention_dropout: 0.0
48
- attention_layer_norm: false
49
- residual_dropout: 0.0
50
- response_residual_dropout: 0.1
51
- embedding_dropout: 0.0
52
- layer_norm_type: rms
53
- layer_norm_with_affine: true
54
- layer_norm_eps: 1.0e-06
55
- attention_layer_norm_with_affine: true
56
- max_sequence_length: 32768
57
- max_position_embeddings: null
58
- include_bias: false
59
- bias_for_layer_norm: null
60
- scale_logits: false
61
- vocab_size: 151936
62
- embedding_size: 151936
63
- additional_vocab_size: 128
64
- new_embedding_init_range: 0.02
65
- weight_tying: false
66
- init_device: null
67
- init_fn: normal
68
- init_std: 0.02
69
- init_cutoff_factor: null
70
- norm_after: false
71
- precision: amp_bf16
72
- max_crops: 12
73
- crop_mode: overlap-and-resize-c2
74
- use_col_tokens: true
75
- prompt_type: none
76
- system_prompt_kind: demo_or_style
77
- message_formatting: none
78
- always_start_with_space: true
79
- multi_annotation_weighting: null
80
- default_inference_len: 65
81
- overlap_margins:
82
- - 4
83
- - 4
84
- pad_value: 0.0
85
- image_padding_embed: pad_and_partial_pad
86
- fix_image_padding: true
87
- vit_layers:
88
- - -2
89
- - -9
90
- image_pooling_h: 2
91
- image_pooling_w: 2
92
- image_pooling_2d: attention_meanq
93
- image_projector: mlp
94
- image_feature_dropout: 0.0
95
- initializer_range: 0.02
96
- normalize_input_embeds: false
97
- use_position_ids: true
98
- head_dim: 128
99
- tokenizer:
100
- identifier: Qwen/Qwen3-4B
101
- tokenizer_dir: /mnt/data/zhangjian/hf_cache
102
- pad_tokenizer: true
103
- moe_num_experts: 8
104
- moe_top_k: 2
105
- moe_mlp_impl: sparse
106
- moe_log_expert_assignment: false
107
- moe_shared_expert: false
108
- moe_lbl_in_fp32: false
109
- moe_interleave: false
110
- moe_loss_weight: 0.1
111
- moe_zloss_weight: null
112
- moe_dropless: true
113
- moe_capacity_factor: 1.25
114
- action_head: l1_regression
115
- num_diffusion_steps: 1000
116
- num_diffusion_inference_steps: 30
117
- use_proprio: true
118
- action_head_dit_hidden_size: 1152
119
- action_head_dit_depth: 28
120
- action_head_dit_num_heads: 16
121
- llm_causal_attention: false
122
- action_use_left_eef: false
123
- action_use_mobile_base: false
124
- allow_resume: false
125
- ft_llm: false
126
- ft_vit: false
127
- ft_connector: false
128
- ft_embedding: lm_head
129
- lora: false
130
- use_lora: true
131
- lora_rank: 8
132
- lora_llm: true
133
- lora_vit: false
134
- lora_connector: false
135
- optimizer:
136
- name: adamw
137
- learning_rate: 0.0001
138
- weight_decay: 0.01
139
- betas:
140
- - 0.9
141
- - 0.95
142
- eps: 1.0e-05
143
- connector_learning_rate: 0.0002
144
- vit_learning_rate: 6.0e-06
145
- llm_learning_rate: 5.0e-05
146
- connector_weight_decay: 0.0
147
- vit_weight_decay: 0.0
148
- llm_weight_decay: 0.0
149
- connector_betas:
150
- - 0.9
151
- - 0.95
152
- vit_betas:
153
- - 0.9
154
- - 0.95
155
- llm_betas:
156
- - 0.9
157
- - 0.95
158
- connector_eps: 1.0e-06
159
- vit_eps: 1.0e-06
160
- llm_eps: 1.0e-06
161
- metrics_log_interval: 20
162
- scheduler:
163
- name: multimodal
164
- units: steps
165
- t_warmup: 100
166
- t_max: null
167
- alpha_f: 0.1
168
- connector_t_warmup: 200
169
- vit_t_warmup: 2000
170
- llm_t_warmup: 2000
171
- grad_clip_warmup_steps: null
172
- grad_clip_warmup_factor: null
173
- warmup_min_lr: 0.0
174
- data:
175
- dataset: rlds_dataset
176
- mixture: null
177
- root_size_mixture: null
178
- split: train
179
- seed: 95818
180
- shuffle_messages: false
181
- pad: to_max
182
- sequence_length: 768
183
- shuffle: true
184
- for_inference: false
185
- multi_modal: torch
186
- num_workers: 0
187
- drop_last: true
188
- pin_memory: true
189
- prefetch_factor: null
190
- persistent_workers: false
191
- timeout: 0
192
- rlds_dataset_name: libero_4_task_suites_no_noops
193
- rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
194
- use_wrist_image: true
195
- use_proprio: true
196
- restore_dataloader: true
197
- fast_forward_batches: null
198
- evaluators:
199
- - label: val
200
- data:
201
- dataset: rlds_dataset
202
- mixture: null
203
- root_size_mixture: null
204
- split: validation
205
- seed: null
206
- shuffle_messages: false
207
- pad: to_max
208
- sequence_length: 768
209
- shuffle: false
210
- for_inference: false
211
- multi_modal: torch
212
- num_workers: 0
213
- drop_last: true
214
- pin_memory: true
215
- prefetch_factor: null
216
- persistent_workers: true
217
- timeout: 0
218
- rlds_dataset_name: libero_4_task_suites_no_noops
219
- rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
220
- use_wrist_image: true
221
- use_proprio: true
222
- device_eval_batch_size: null
223
- subset_num_batches: 170
224
- max_examples: null
225
- max_new_tokens: 448
226
- mm_evaluator: null
227
- save_dir: null
228
- save_to_checkpoint_dir: false
229
- eval_name: null
230
- skip_if_metrics_cached: true
231
- eval_interval: 0
232
- inf_eval_interval: -1
233
- inf_evaluators: []
234
- save_folder: /mnt/data/zhangjian/a1/libero_4_qwen3-4b_l1_regression_wrist_proprio_ft_ah_lora_r8_llm_bs108
235
- remote_save_folder: null
236
- canceled_check_interval: 50
237
- save_interval: 500
238
- save_interval_unsharded: 500
239
- save_interval_ephemeral: null
240
- save_interval_action_head: 500
241
- save_num_checkpoints_to_keep: 1
242
- save_num_unsharded_checkpoints_to_keep: 1
243
- save_num_action_head_checkpoints_to_keep: 2
244
- save_overwrite: true
245
- force_save_unsharded: false
246
- no_pre_train_checkpoint: true
247
- initial_model_checkpoint: null
248
- load_model_config: null
249
- load_path: null
250
- load_path_sharded_checkpointer: null
251
- reset_optimizer_state: false
252
- reset_trainer_state: false
253
- save_dataloader_state: false
254
- reset_dataloader_state: false
255
- sharded_checkpointer: torch_legacy
256
- max_duration: 440800
257
- global_train_batch_size: 96
258
- device_train_batch_size: 32
259
- device_train_microbatch_size: 32
260
- device_eval_batch_size: 4
261
- eval_subset_num_batches: -1
262
- eval_on_load: false
263
- device_inf_eval_batch_size: 16
264
- inf_eval_subset_num_batches: -1
265
- device_train_grad_accum: 1
266
- max_grad_norm: 1.0
267
- multi_component_grad_norm: true
268
- batch_divisor: global_batch
269
- max_grad_norm_ratio: null
270
- precision: amp_bf16
271
- wandb:
272
- project: a1-vla-20.47
273
- entity: demo0
274
- group: null
275
- name: libero_4_qwen3-4b_l1_regression_wrist_proprio_ft_ah_lora_r8_llm_bs108_20250811_215843
276
- tags:
277
- - watching
278
- log_artifacts: false
279
- rank_zero_only: true
280
- log_interval: 1
281
- speed_monitor:
282
- window_size: 20
283
- gpu_flops_available: null
284
- console_log_interval: 1
285
- gen1_gc_interval: 1
286
- compile: null
287
- fsdp:
288
- use_orig_params: true
289
- sharding_strategy: FULL_SHARD
290
- wrapping_strategy: by_block_and_size
291
- precision: float
292
- hybrid_sharding_num_model_replicas: null
293
- softmax_auxiliary_loss: true
294
- softmax_auxiliary_loss_scale: 0.0001
295
- time_limit: null
296
- extra_steps_after_cancel: 10
297
- python_profiling: false
298
- torch_profiling: false
299
- stop_at: 440800
300
- stop_after: null
301
- activation_checkpointing: whole_layer
302
- fused_loss: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
libero_spatial_dit-1B_action_text-siglip/step_130000/model.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4ddfa081721faeb08de99dea9b1685a6baec710edcafc8d410c7541d931bb29
3
- size 7228114647