LucaFrat commited on
Commit
8b19a37
·
verified ·
1 Parent(s): 15f43aa

Upload folder using huggingface_hub

Browse files
checkpoint-4000/config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 40,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": false,
5
+ "architectures": [
6
+ "Gr00tN1d7"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_trainable_params_fp32": true,
12
+ "color_jitter_params": {
13
+ "brightness": 0.3,
14
+ "contrast": 0.4,
15
+ "hue": 0.08,
16
+ "saturation": 0.5
17
+ },
18
+ "crop_fraction": 0.95,
19
+ "diffusion_model_cfg": {
20
+ "attention_head_dim": 48,
21
+ "dropout": 0.2,
22
+ "final_dropout": true,
23
+ "interleave_self_attention": true,
24
+ "norm_type": "ada_norm",
25
+ "num_attention_heads": 32,
26
+ "num_layers": 32,
27
+ "output_dim": 1024,
28
+ "positional_embeddings": null
29
+ },
30
+ "dtype": "bfloat16",
31
+ "exclude_state": false,
32
+ "formalize_language": true,
33
+ "hidden_size": 1024,
34
+ "image_crop_size": [
35
+ 230,
36
+ 230
37
+ ],
38
+ "image_target_size": [
39
+ 256,
40
+ 256
41
+ ],
42
+ "letter_box_transform": false,
43
+ "load_bf16": false,
44
+ "max_action_dim": 132,
45
+ "max_num_embodiments": 32,
46
+ "max_seq_len": 1024,
47
+ "max_state_dim": 132,
48
+ "model_dtype": "bfloat16",
49
+ "model_name": "nvidia/Cosmos-Reason2-2B",
50
+ "model_type": "Gr00tN1d7",
51
+ "noise_beta_alpha": 1.5,
52
+ "noise_beta_beta": 1.0,
53
+ "noise_s": 0.999,
54
+ "num_inference_timesteps": 4,
55
+ "num_timestep_buckets": 1000,
56
+ "random_history_crop": true,
57
+ "random_rotation_angle": 0,
58
+ "reproject_vision": false,
59
+ "rtc_ramp_rate": 6.0,
60
+ "select_layer": 16,
61
+ "shortest_image_edge": 256,
62
+ "state_dropout_prob": 0.2,
63
+ "state_gaussian_noise_std": 0.0,
64
+ "transformers_version": "4.57.3",
65
+ "tune_diffusion_model": true,
66
+ "tune_linear": true,
67
+ "tune_llm": false,
68
+ "tune_projector": true,
69
+ "tune_top_llm_layers": 0,
70
+ "tune_visual": false,
71
+ "tune_vlln": true,
72
+ "use_albumentations": true,
73
+ "use_alternate_vl_dit": true,
74
+ "use_flash_attention": true,
75
+ "use_future_tokens": false,
76
+ "use_mean_std": false,
77
+ "use_percentiles": true,
78
+ "use_vl_self_attention": true,
79
+ "use_vlln": true,
80
+ "vl_self_attention_cfg": {
81
+ "attention_head_dim": 64,
82
+ "dropout": 0.2,
83
+ "final_dropout": true,
84
+ "num_attention_heads": 32,
85
+ "num_layers": 4,
86
+ "positional_embeddings": null
87
+ }
88
+ }
checkpoint-4000/embodiment_id.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "oxe_droid": 17,
4
+ "oxe_fractal": 18,
5
+ "oxe_language_table": 19,
6
+ "oxe_bridge": 20,
7
+ "unknown": 22,
8
+ "gr1_unified": 20,
9
+ "agibot": 26,
10
+ "sim_behavior_r1_pro": 23,
11
+ "xdof": 24,
12
+ "xdof_oss_data": 25,
13
+ "unitree_g1_full_body_with_waist_height_nav_cmd": 25,
14
+ "real_r1_pro_sharpa": 27,
15
+ "real_r1_pro_sharpa_add_view": 27,
16
+ "real_r1_pro_sharpa_relative_arm_joint": 26,
17
+ "real_r1_pro_sharpa_delta_eef": 26,
18
+ "real_r1_pro_sharpa_absolute_eef": 26,
19
+ "real_r1_pro_sharpa_meanstd": 26,
20
+ "real_r1_pro_sharpa_relative_eef": 26,
21
+ "real_r1_pro_sharpa_relative_eef_add_view": 26,
22
+ "real_r1_pro_sharpa_relative_eef_relative_hand": 26,
23
+ "real_r1_pro_sharpa_relative_eef_human": 26,
24
+ "real_r1_pro_sharpa_relative_eef_human_add_view": 26,
25
+ "real_r1_pro_sharpa_relative_eef_human_relative_hand": 26,
26
+ "real_r1_pro_sharpa_relative_eef_egodex": 26,
27
+ "real_r1_pro_sharpa_relative_eef_egodex_relative_hand": 26,
28
+ "real_r1_pro_sharpa_relative_eef_egodex_wrist_only": 26,
29
+ "real_r1_pro_sharpa_relative_eef_maxinsights": 26,
30
+ "real_r1_pro_sharpa_relative_eef_maxinsights_relative_hand": 26,
31
+ "real_r1_pro_sharpa_relative_eef_mecka": 26,
32
+ "real_r1_pro_sharpa_relative_eef_mecka_relative_hand": 26,
33
+ "real_g1_relative_eef_absolute_joints": 25,
34
+ "real_g1_relative_eef_absolute_joints_wrist_cam": 25,
35
+ "real_g1_relative_eef_relative_joints": 25,
36
+ "real_r1_pro_sharpa_relative_eef_relative_hand_relative_joint": 26,
37
+ "real_r1_pro_sharpa_relative_joint": 29,
38
+ "oxe_droid_relative_eef_relative_joint": 24,
39
+ "oxe_droid_relative_eef_relative_joint_swapped": 24,
40
+ "oxe_droid_relative_eef_relative_joint_upweight_z": 24,
41
+ "oxe_droid_relative_eef_relative_joint_upweight_z_swapped": 24,
42
+ "oxe_droid_relative_eef_relative_joint_3view": 24,
43
+ "oxe_droid_relative_eef_relative_joint_3view_swapped": 24,
44
+ "oxe_droid_relative_eef": 24,
45
+ "oxe_droid_joint_position_relative": 24,
46
+ "xdof_relative_eef_relative_joint": 27,
47
+ "xdof_relative_eef_relative_joint_subtask": 27,
48
+ "xdof_relative_eef": 27,
49
+ "xdof_relative_joint": 28,
50
+ "simpler_env_google": 0,
51
+ "simpler_env_widowx": 1,
52
+ "libero_sim": 2,
53
+ "droid_sim": 3,
54
+ "unitree_g1_sonic": 11,
55
+ "new_embodiment": 10,
56
+ "robocasa_gr1_tabletop": 10
57
+ }
checkpoint-4000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d7
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Cosmos-Reason2-2B
6
+ backbone_model_type: qwen
7
+ model_revision: null
8
+ tune_top_llm_layers: 0
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 12
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ backbone_trainable_params_fp32: true
17
+ image_crop_size:
18
+ - 230
19
+ - 230
20
+ image_target_size:
21
+ - 256
22
+ - 256
23
+ shortest_image_edge: null
24
+ crop_fraction: null
25
+ random_rotation_angle: null
26
+ color_jitter_params:
27
+ brightness: 0.3
28
+ contrast: 0.4
29
+ saturation: 0.5
30
+ hue: 0.08
31
+ use_albumentations_transforms: true
32
+ extra_augmentation_config: null
33
+ formalize_language: true
34
+ apply_sincos_state_encoding: false
35
+ use_percentiles: true
36
+ use_relative_action: true
37
+ max_state_dim: 132
38
+ max_action_dim: 132
39
+ action_horizon: 40
40
+ hidden_size: 1024
41
+ input_embedding_dim: 1536
42
+ state_history_length: 1
43
+ add_pos_embed: true
44
+ attn_dropout: 0.2
45
+ use_vlln: true
46
+ max_seq_len: 1024
47
+ use_alternate_vl_dit: true
48
+ attend_text_every_n_blocks: 2
49
+ diffusion_model_cfg:
50
+ positional_embeddings: null
51
+ num_layers: 16
52
+ num_attention_heads: 32
53
+ attention_head_dim: 48
54
+ norm_type: ada_norm
55
+ dropout: 0.2
56
+ final_dropout: true
57
+ output_dim: 1024
58
+ interleave_self_attention: true
59
+ num_inference_timesteps: 4
60
+ noise_beta_alpha: 1.5
61
+ noise_beta_beta: 1.0
62
+ noise_s: 0.999
63
+ num_timestep_buckets: 1000
64
+ tune_projector: true
65
+ tune_diffusion_model: true
66
+ tune_vlln: true
67
+ state_dropout_prob: 0.2
68
+ exclude_state: false
69
+ use_mean_std: false
70
+ max_num_embodiments: 32
71
+ data:
72
+ datasets:
73
+ - dataset_paths:
74
+ - /home/ubuntu/groot-files/dataset_train
75
+ embodiment_tag: unitree_g1_sonic
76
+ mix_ratio: 1.0
77
+ dataset_type: physical_embodiment
78
+ val_dataset_path: null
79
+ modality_configs:
80
+ unitree_g1_sonic:
81
+ video:
82
+ delta_indices:
83
+ - 0
84
+ modality_keys:
85
+ - ego_view
86
+ sin_cos_embedding_keys: null
87
+ mean_std_embedding_keys: null
88
+ action_configs: null
89
+ state:
90
+ delta_indices:
91
+ - 0
92
+ modality_keys:
93
+ - left_leg
94
+ - right_leg
95
+ - waist
96
+ - left_arm
97
+ - right_arm
98
+ - left_hand
99
+ - right_hand
100
+ - projected_gravity
101
+ sin_cos_embedding_keys: null
102
+ mean_std_embedding_keys: null
103
+ action_configs: null
104
+ action:
105
+ delta_indices:
106
+ - 0
107
+ - 1
108
+ - 2
109
+ - 3
110
+ - 4
111
+ - 5
112
+ - 6
113
+ - 7
114
+ - 8
115
+ - 9
116
+ - 10
117
+ - 11
118
+ - 12
119
+ - 13
120
+ - 14
121
+ - 15
122
+ - 16
123
+ - 17
124
+ - 18
125
+ - 19
126
+ - 20
127
+ - 21
128
+ - 22
129
+ - 23
130
+ - 24
131
+ - 25
132
+ - 26
133
+ - 27
134
+ - 28
135
+ - 29
136
+ - 30
137
+ - 31
138
+ - 32
139
+ - 33
140
+ - 34
141
+ - 35
142
+ - 36
143
+ - 37
144
+ - 38
145
+ - 39
146
+ modality_keys:
147
+ - motion_token
148
+ - left_hand_joints
149
+ - right_hand_joints
150
+ sin_cos_embedding_keys: null
151
+ mean_std_embedding_keys: null
152
+ action_configs:
153
+ - rep: ABSOLUTE
154
+ type: NON_EEF
155
+ format: DEFAULT
156
+ state_key: null
157
+ - rep: ABSOLUTE
158
+ type: NON_EEF
159
+ format: DEFAULT
160
+ state_key: null
161
+ - rep: ABSOLUTE
162
+ type: NON_EEF
163
+ format: DEFAULT
164
+ state_key: null
165
+ language:
166
+ delta_indices:
167
+ - 0
168
+ modality_keys:
169
+ - annotation.human.task_description
170
+ sin_cos_embedding_keys: null
171
+ mean_std_embedding_keys: null
172
+ action_configs: null
173
+ download_cache: false
174
+ shard_size: 1024
175
+ episode_sampling_rate: 0.1
176
+ num_shards_per_epoch: 100000
177
+ override_pretraining_statistics: true
178
+ mode: single_turn
179
+ random_chop: 0.0
180
+ mock_dataset_mode: false
181
+ shuffle: true
182
+ seed: 42
183
+ multiprocessing_context: fork
184
+ allow_padding: false
185
+ subsample_ratio: 1.0
186
+ image_crop_size:
187
+ - 244
188
+ - 244
189
+ image_target_size:
190
+ - 224
191
+ - 224
192
+ video_backend: torchcodec
193
+ training:
194
+ output_dir: /home/ubuntu/groot-files/checkpoints/g1_finetune-20260527-142325
195
+ experiment_name: null
196
+ max_steps: 20000
197
+ global_batch_size: 32
198
+ batch_size: null
199
+ gradient_accumulation_steps: 1
200
+ learning_rate: 0.0001
201
+ lr_scheduler_type: cosine
202
+ weight_decay: 1.0e-05
203
+ warmup_ratio: 0.05
204
+ warmup_steps: 0
205
+ max_grad_norm: 1.0
206
+ optim: adamw_torch
207
+ start_from_checkpoint: nvidia/GR00T-N1.7-3B
208
+ skip_weight_loading: false
209
+ tf32: true
210
+ fp16: false
211
+ bf16: true
212
+ eval_bf16: true
213
+ logging_steps: 10
214
+ save_steps: 2000
215
+ save_total_limit: 10
216
+ save_vl_model: false
217
+ save_only_model: false
218
+ upload_checkpoints: false
219
+ upload_every: 1000
220
+ upload_last_n_checkpoints: 5
221
+ max_concurrent_uploads: 2
222
+ eval_strategy: 'no'
223
+ eval_steps: 500
224
+ eval_set_split_ratio: 0.1
225
+ eval_batch_size: 2
226
+ save_best_eval_metric_name: ''
227
+ save_best_eval_metric_greater_is_better: true
228
+ eval_dataset_path: /home/ubuntu/groot-files/dataset_eval
229
+ eval_num_batches: 50
230
+ deepspeed_stage: 2
231
+ gradient_checkpointing: false
232
+ transformers_trust_remote_code: true
233
+ transformers_local_files_only: false
234
+ transformers_cache_dir: null
235
+ transformers_access_token: null
236
+ use_ddp: false
237
+ ddp_bucket_cap_mb: 100
238
+ num_gpus: 4
239
+ dataloader_num_workers: 6
240
+ remove_unused_columns: false
241
+ use_wandb: true
242
+ wandb_project: groot-finetune
243
+ enable_profiling: false
244
+ max_retries: 3
245
+ assert_loss_less_than: null
246
+ add_rl_callback: false
247
+ enable_open_loop_eval: false
248
+ open_loop_eval_traj_ids:
249
+ - 0
250
+ open_loop_eval_steps_per_traj: 100
251
+ open_loop_eval_plot_indices: null
252
+ max_steps: 20000
253
+ save_steps: 2000
checkpoint-4000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /home/ubuntu/groot-files/dataset_train
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: unitree_g1_sonic
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ unitree_g1_sonic:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ - !!python/object:gr00t.data.types.ActionConfig
39
+ format: *id001
40
+ rep: *id002
41
+ state_key: null
42
+ type: *id003
43
+ delta_indices:
44
+ - 0
45
+ - 1
46
+ - 2
47
+ - 3
48
+ - 4
49
+ - 5
50
+ - 6
51
+ - 7
52
+ - 8
53
+ - 9
54
+ - 10
55
+ - 11
56
+ - 12
57
+ - 13
58
+ - 14
59
+ - 15
60
+ - 16
61
+ - 17
62
+ - 18
63
+ - 19
64
+ - 20
65
+ - 21
66
+ - 22
67
+ - 23
68
+ - 24
69
+ - 25
70
+ - 26
71
+ - 27
72
+ - 28
73
+ - 29
74
+ - 30
75
+ - 31
76
+ - 32
77
+ - 33
78
+ - 34
79
+ - 35
80
+ - 36
81
+ - 37
82
+ - 38
83
+ - 39
84
+ mean_std_embedding_keys: null
85
+ modality_keys:
86
+ - motion_token
87
+ - left_hand_joints
88
+ - right_hand_joints
89
+ sin_cos_embedding_keys: null
90
+ language: !!python/object:gr00t.data.types.ModalityConfig
91
+ action_configs: null
92
+ delta_indices:
93
+ - 0
94
+ mean_std_embedding_keys: null
95
+ modality_keys:
96
+ - annotation.human.task_description
97
+ sin_cos_embedding_keys: null
98
+ state: !!python/object:gr00t.data.types.ModalityConfig
99
+ action_configs: null
100
+ delta_indices:
101
+ - 0
102
+ mean_std_embedding_keys: null
103
+ modality_keys:
104
+ - left_leg
105
+ - right_leg
106
+ - waist
107
+ - left_arm
108
+ - right_arm
109
+ - left_hand
110
+ - right_hand
111
+ - projected_gravity
112
+ sin_cos_embedding_keys: null
113
+ video: !!python/object:gr00t.data.types.ModalityConfig
114
+ action_configs: null
115
+ delta_indices:
116
+ - 0
117
+ mean_std_embedding_keys: null
118
+ modality_keys:
119
+ - ego_view
120
+ sin_cos_embedding_keys: null
121
+ mode: single_turn
122
+ multiprocessing_context: fork
123
+ num_shards_per_epoch: 100000
124
+ override_pretraining_statistics: true
125
+ random_chop: 0.0
126
+ seed: 42
127
+ shard_size: 1024
128
+ shuffle: true
129
+ subsample_ratio: 1.0
130
+ video_backend: torchcodec
131
+ load_config_path: null
132
+ model: !!python/object:gr00t.configs.model.gr00t_n1d7.Gr00tN1d7Config
133
+ _attn_implementation_internal: null
134
+ _commit_hash: null
135
+ _name_or_path: ''
136
+ _output_attentions: false
137
+ add_cross_attention: false
138
+ architectures: null
139
+ backbone_trainable_params_fp32: true
140
+ bad_words_ids: null
141
+ begin_suppress_tokens: null
142
+ bos_token_id: null
143
+ chunk_size_feed_forward: 0
144
+ color_jitter_params:
145
+ brightness: 0.3
146
+ contrast: 0.4
147
+ hue: 0.08
148
+ saturation: 0.5
149
+ cross_attention_hidden_size: null
150
+ decoder_start_token_id: null
151
+ diffusion_model_cfg:
152
+ attention_head_dim: 48
153
+ dropout: 0.2
154
+ final_dropout: true
155
+ interleave_self_attention: true
156
+ norm_type: ada_norm
157
+ num_attention_heads: 32
158
+ num_layers: 16
159
+ output_dim: 1024
160
+ positional_embeddings: null
161
+ diversity_penalty: 0.0
162
+ do_sample: false
163
+ dtype: null
164
+ early_stopping: false
165
+ encoder_no_repeat_ngram_size: 0
166
+ eos_token_id: null
167
+ exponential_decay_length_penalty: null
168
+ extra_augmentation_config: null
169
+ finetuning_task: null
170
+ forced_bos_token_id: null
171
+ forced_eos_token_id: null
172
+ id2label:
173
+ 0: LABEL_0
174
+ 1: LABEL_1
175
+ is_decoder: false
176
+ is_encoder_decoder: false
177
+ label2id:
178
+ LABEL_0: 0
179
+ LABEL_1: 1
180
+ length_penalty: 1.0
181
+ load_bf16: false
182
+ max_length: 20
183
+ min_length: 0
184
+ model_name: nvidia/Cosmos-Reason2-2B
185
+ no_repeat_ngram_size: 0
186
+ num_beam_groups: 1
187
+ num_beams: 1
188
+ num_return_sequences: 1
189
+ output_hidden_states: false
190
+ output_scores: false
191
+ pad_token_id: null
192
+ prefix: null
193
+ problem_type: null
194
+ pruned_heads: {}
195
+ random_rotation_angle: null
196
+ remove_invalid_values: false
197
+ repetition_penalty: 1.0
198
+ reproject_vision: false
199
+ return_dict: true
200
+ return_dict_in_generate: false
201
+ sep_token_id: null
202
+ state_dropout_prob: 0.2
203
+ suppress_tokens: null
204
+ task_specific_params: null
205
+ temperature: 1.0
206
+ tf_legacy_loss: false
207
+ tie_encoder_decoder: false
208
+ tie_word_embeddings: true
209
+ tokenizer_class: null
210
+ top_k: 50
211
+ top_p: 1.0
212
+ torchscript: false
213
+ transformers_version: null
214
+ tune_diffusion_model: true
215
+ tune_llm: false
216
+ tune_projector: true
217
+ tune_visual: false
218
+ typical_p: 1.0
219
+ use_bfloat16: false
220
+ use_relative_action: true
221
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
222
+ add_rl_callback: false
223
+ assert_loss_less_than: null
224
+ batch_size: null
225
+ bf16: true
226
+ dataloader_num_workers: 6
227
+ ddp_bucket_cap_mb: 100
228
+ deepspeed_stage: 2
229
+ enable_open_loop_eval: false
230
+ enable_profiling: false
231
+ eval_batch_size: 2
232
+ eval_bf16: true
233
+ eval_dataset_path: /home/ubuntu/groot-files/dataset_eval
234
+ eval_num_batches: 50
235
+ eval_set_split_ratio: 0.1
236
+ eval_steps: 500
237
+ eval_strategy: 'no'
238
+ experiment_name: null
239
+ fp16: false
240
+ global_batch_size: 32
241
+ gradient_accumulation_steps: 1
242
+ gradient_checkpointing: false
243
+ learning_rate: 0.0001
244
+ logging_steps: 10
245
+ lr_scheduler_type: cosine
246
+ max_concurrent_uploads: 2
247
+ max_grad_norm: 1.0
248
+ max_retries: 3
249
+ max_steps: 20000
250
+ num_gpus: 4
251
+ open_loop_eval_plot_indices: null
252
+ open_loop_eval_steps_per_traj: 100
253
+ open_loop_eval_traj_ids:
254
+ - 0
255
+ optim: adamw_torch
256
+ output_dir: /home/ubuntu/groot-files/checkpoints/g1_finetune-20260527-142325
257
+ remove_unused_columns: false
258
+ save_best_eval_metric_greater_is_better: true
259
+ save_best_eval_metric_name: ''
260
+ save_only_model: false
261
+ save_steps: 2000
262
+ save_total_limit: 10
263
+ save_vl_model: false
264
+ skip_weight_loading: false
265
+ start_from_checkpoint: nvidia/GR00T-N1.7-3B
266
+ tf32: true
267
+ transformers_access_token: null
268
+ transformers_cache_dir: null
269
+ transformers_local_files_only: false
270
+ transformers_trust_remote_code: true
271
+ upload_checkpoints: false
272
+ upload_every: 1000
273
+ upload_last_n_checkpoints: 5
274
+ use_ddp: false
275
+ use_wandb: true
276
+ wandb_project: groot-finetune
277
+ warmup_ratio: 0.05
278
+ warmup_steps: 0
279
+ weight_decay: 1.0e-05
checkpoint-4000/experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,907 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "unitree_g1_sonic": {
3
+ "state": {
4
+ "left_leg": {
5
+ "min": [
6
+ -0.4857776165008545,
7
+ -0.24391146004199982,
8
+ -0.8776780366897583,
9
+ 0.03597185015678406,
10
+ -0.4930363595485687,
11
+ -0.20340518653392792
12
+ ],
13
+ "max": [
14
+ 0.2887633442878723,
15
+ 0.31727835536003113,
16
+ 0.7570706605911255,
17
+ 0.8854392766952515,
18
+ 0.1892659068107605,
19
+ 0.15561887621879578
20
+ ],
21
+ "mean": [
22
+ 0.03709593042731285,
23
+ 0.017334647476673126,
24
+ 0.10126602649688721,
25
+ 0.21372011303901672,
26
+ -0.19937776029109955,
27
+ -0.009935487061738968
28
+ ],
29
+ "std": [
30
+ 0.12473253160715103,
31
+ 0.06594803929328918,
32
+ 0.16698330640792847,
33
+ 0.08503254503011703,
34
+ 0.054379407316446304,
35
+ 0.04406041279435158
36
+ ],
37
+ "q01": [
38
+ -0.34423858046531675,
39
+ -0.14141905188560486,
40
+ -0.4198744976520538,
41
+ 0.12339614242315293,
42
+ -0.2891655659675598,
43
+ -0.11299800157546996
44
+ ],
45
+ "q99": [
46
+ 0.1865586566925049,
47
+ 0.17292326152324677,
48
+ 0.4432717561721802,
49
+ 0.5340901875495913,
50
+ -0.01812098965048773,
51
+ 0.09286320090293888
52
+ ]
53
+ },
54
+ "right_leg": {
55
+ "min": [
56
+ -0.40723779797554016,
57
+ -0.30062612891197205,
58
+ -1.3307929039001465,
59
+ 0.07445771992206573,
60
+ -0.5606564879417419,
61
+ -0.1984633058309555
62
+ ],
63
+ "max": [
64
+ 0.25397607684135437,
65
+ 0.15654273331165314,
66
+ 0.24837948381900787,
67
+ 0.9999935626983643,
68
+ 0.10241366177797318,
69
+ 0.1306799054145813
70
+ ],
71
+ "mean": [
72
+ 0.05326543375849724,
73
+ -0.025633497163653374,
74
+ -0.21512727439403534,
75
+ 0.207839697599411,
76
+ -0.19957764446735382,
77
+ -0.003420396940782666
78
+ ],
79
+ "std": [
80
+ 0.07712754607200623,
81
+ 0.0601215623319149,
82
+ 0.18328991532325745,
83
+ 0.0994856134057045,
84
+ 0.06609486788511276,
85
+ 0.04335305467247963
86
+ ],
87
+ "q01": [
88
+ -0.19365620791912078,
89
+ -0.1937438142299652,
90
+ -0.9012520694732666,
91
+ 0.12708178162574768,
92
+ -0.43179439902305605,
93
+ -0.11342300474643707
94
+ ],
95
+ "q99": [
96
+ 0.19940897822380066,
97
+ 0.11470767110586166,
98
+ 0.14574560344219212,
99
+ 0.7404617261886605,
100
+ -0.08023603171110152,
101
+ 0.0866367721557618
102
+ ]
103
+ },
104
+ "waist": {
105
+ "min": [
106
+ -0.3889274299144745,
107
+ -0.3032904267311096,
108
+ -0.03526151925325394
109
+ ],
110
+ "max": [
111
+ 0.4404728412628174,
112
+ 0.11908156424760818,
113
+ 0.05046159774065018
114
+ ],
115
+ "mean": [
116
+ 0.09945842623710632,
117
+ -0.03563961386680603,
118
+ -0.00033498145057819784
119
+ ],
120
+ "std": [
121
+ 0.1114959642291069,
122
+ 0.04506837949156761,
123
+ 0.007683659438043833
124
+ ],
125
+ "q01": [
126
+ -0.09960892051458359,
127
+ -0.16775038778781892,
128
+ -0.019175979271531106
129
+ ],
130
+ "q99": [
131
+ 0.3722357642650606,
132
+ 0.042387988269329074,
133
+ 0.024506188780069386
134
+ ]
135
+ },
136
+ "left_arm": {
137
+ "min": [
138
+ -0.2003283053636551,
139
+ 0.09816279262304306,
140
+ -0.5412195920944214,
141
+ 0.7723593711853027,
142
+ -0.7412483096122742,
143
+ -0.09287774562835693,
144
+ -0.31815722584724426
145
+ ],
146
+ "max": [
147
+ 0.4134317934513092,
148
+ 0.38915175199508667,
149
+ 0.3470511734485626,
150
+ 1.363792896270752,
151
+ 0.38266828656196594,
152
+ 0.5152138471603394,
153
+ 0.42312705516815186
154
+ ],
155
+ "mean": [
156
+ 0.17440484464168549,
157
+ 0.21054533123970032,
158
+ 0.032854851335287094,
159
+ 1.0895949602127075,
160
+ -0.11395763605833054,
161
+ 0.20216572284698486,
162
+ 0.011326363310217857
163
+ ],
164
+ "std": [
165
+ 0.09000273048877716,
166
+ 0.04009358212351799,
167
+ 0.09271923452615738,
168
+ 0.0719800963997837,
169
+ 0.1966457962989807,
170
+ 0.11650487035512924,
171
+ 0.10442934930324554
172
+ ],
173
+ "q01": [
174
+ -0.06558151662349701,
175
+ 0.13350427150726318,
176
+ -0.20625522315502168,
177
+ 0.8954852747917176,
178
+ -0.5607327961921692,
179
+ -0.05204461485147476,
180
+ -0.23472590863704682
181
+ ],
182
+ "q99": [
183
+ 0.3320828676223755,
184
+ 0.31760211229324387,
185
+ 0.23645307958126077,
186
+ 1.2605007886886597,
187
+ 0.29088112711906433,
188
+ 0.4681393218040469,
189
+ 0.27189282655715985
190
+ ]
191
+ },
192
+ "right_arm": {
193
+ "min": [
194
+ -0.9314020276069641,
195
+ -0.7953690886497498,
196
+ -0.49778875708580017,
197
+ -0.7156979441642761,
198
+ -0.9786917567253113,
199
+ -0.1273084282875061,
200
+ -0.9962846040725708
201
+ ],
202
+ "max": [
203
+ 0.3628104329109192,
204
+ 0.07754991948604584,
205
+ 0.8718883395195007,
206
+ 1.3758729696273804,
207
+ 1.0651459693908691,
208
+ 0.6250013113021851,
209
+ 0.9581388235092163
210
+ ],
211
+ "mean": [
212
+ -0.13906176388263702,
213
+ -0.203518807888031,
214
+ 0.16250377893447876,
215
+ 0.3269632160663605,
216
+ 0.15917158126831055,
217
+ 0.20837940275669098,
218
+ 0.0921417772769928
219
+ ],
220
+ "std": [
221
+ 0.281828910112381,
222
+ 0.07468900084495544,
223
+ 0.18738719820976257,
224
+ 0.6238588690757751,
225
+ 0.21029023826122284,
226
+ 0.12883725762367249,
227
+ 0.2203560620546341
228
+ ],
229
+ "q01": [
230
+ -0.7701197338104248,
231
+ -0.3844673705101013,
232
+ -0.2184901690483093,
233
+ -0.5404818391799927,
234
+ -0.3923635482788086,
235
+ -0.0465836426615715,
236
+ -0.36222128629684447
237
+ ],
238
+ "q99": [
239
+ 0.2503907299041749,
240
+ 0.0012238291278481798,
241
+ 0.5864720344543457,
242
+ 1.3040913009643558,
243
+ 0.5952301049232496,
244
+ 0.5132987618446353,
245
+ 0.6925142097473146
246
+ ]
247
+ },
248
+ "left_hand": {
249
+ "min": [
250
+ 0.0,
251
+ 0.0,
252
+ 0.0,
253
+ 0.0,
254
+ 0.0,
255
+ 0.0,
256
+ 0.0
257
+ ],
258
+ "max": [
259
+ 0.0,
260
+ 0.0,
261
+ 0.0,
262
+ 0.0,
263
+ 0.0,
264
+ 0.0,
265
+ 0.0
266
+ ],
267
+ "mean": [
268
+ 0.0,
269
+ 0.0,
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0
275
+ ],
276
+ "std": [
277
+ 0.0,
278
+ 0.0,
279
+ 0.0,
280
+ 0.0,
281
+ 0.0,
282
+ 0.0,
283
+ 0.0
284
+ ],
285
+ "q01": [
286
+ 0.0,
287
+ 0.0,
288
+ 0.0,
289
+ 0.0,
290
+ 0.0,
291
+ 0.0,
292
+ 0.0
293
+ ],
294
+ "q99": [
295
+ 0.0,
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0
302
+ ]
303
+ },
304
+ "right_hand": {
305
+ "min": [
306
+ 0.0,
307
+ 0.0,
308
+ 0.0,
309
+ 0.0,
310
+ 0.0,
311
+ 0.0,
312
+ 0.0
313
+ ],
314
+ "max": [
315
+ 0.0,
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0
322
+ ],
323
+ "mean": [
324
+ 0.0,
325
+ 0.0,
326
+ 0.0,
327
+ 0.0,
328
+ 0.0,
329
+ 0.0,
330
+ 0.0
331
+ ],
332
+ "std": [
333
+ 0.0,
334
+ 0.0,
335
+ 0.0,
336
+ 0.0,
337
+ 0.0,
338
+ 0.0,
339
+ 0.0
340
+ ],
341
+ "q01": [
342
+ 0.0,
343
+ 0.0,
344
+ 0.0,
345
+ 0.0,
346
+ 0.0,
347
+ 0.0,
348
+ 0.0
349
+ ],
350
+ "q99": [
351
+ 0.0,
352
+ 0.0,
353
+ 0.0,
354
+ 0.0,
355
+ 0.0,
356
+ 0.0,
357
+ 0.0
358
+ ]
359
+ },
360
+ "projected_gravity": {
361
+ "min": [
362
+ -0.12354099005460739,
363
+ -0.13017292320728302,
364
+ -1.0
365
+ ],
366
+ "max": [
367
+ 0.22223307192325592,
368
+ 0.10705921798944473,
369
+ -0.9749471545219421
370
+ ],
371
+ "mean": [
372
+ -0.01829414628446102,
373
+ 0.002673766575753689,
374
+ -0.9988483190536499
375
+ ],
376
+ "std": [
377
+ 0.035847291350364685,
378
+ 0.03194916248321533,
379
+ 0.0014168791240174116
380
+ ],
381
+ "q01": [
382
+ -0.09324856609106064,
383
+ -0.08918023109436035,
384
+ -0.9999811720848083
385
+ ],
386
+ "q99": [
387
+ 0.06776312530040744,
388
+ 0.060245763361454124,
389
+ -0.9940099501609803
390
+ ]
391
+ }
392
+ },
393
+ "action": {
394
+ "motion_token": {
395
+ "min": [
396
+ -0.3125,
397
+ -0.5,
398
+ -0.5,
399
+ -0.4375,
400
+ -0.625,
401
+ -0.375,
402
+ -0.25,
403
+ -0.0625,
404
+ -0.4375,
405
+ -0.6875,
406
+ -0.375,
407
+ -0.375,
408
+ -0.3125,
409
+ -0.375,
410
+ -0.6875,
411
+ -0.25,
412
+ -0.375,
413
+ -0.25,
414
+ -0.375,
415
+ -0.5,
416
+ -0.5,
417
+ -0.5,
418
+ -0.625,
419
+ -0.5,
420
+ -0.3125,
421
+ -0.5625,
422
+ -0.125,
423
+ -0.5,
424
+ -0.3125,
425
+ -0.3125,
426
+ -0.125,
427
+ -0.375,
428
+ 0.0625,
429
+ -0.1875,
430
+ -0.1875,
431
+ -0.5625,
432
+ -0.625,
433
+ -0.6875,
434
+ -0.125,
435
+ -0.125,
436
+ -0.4375,
437
+ -0.5625,
438
+ -0.3125,
439
+ -0.375,
440
+ -0.5,
441
+ -0.4375,
442
+ -0.125,
443
+ -0.3125,
444
+ -0.5,
445
+ -0.25,
446
+ -0.375,
447
+ -0.625,
448
+ -0.0625,
449
+ -0.4375,
450
+ -0.0625,
451
+ -0.4375,
452
+ -0.5,
453
+ 0.0,
454
+ -0.25,
455
+ -0.5,
456
+ -0.375,
457
+ -0.1875,
458
+ 0.0,
459
+ -0.5
460
+ ],
461
+ "max": [
462
+ 0.125,
463
+ 0.25,
464
+ 0.25,
465
+ 0.125,
466
+ 0.1875,
467
+ 0.1875,
468
+ 0.5,
469
+ 0.4375,
470
+ 0.25,
471
+ 0.125,
472
+ 0.1875,
473
+ 0.0625,
474
+ 0.125,
475
+ 0.3125,
476
+ -0.0625,
477
+ 0.1875,
478
+ 0.25,
479
+ 0.25,
480
+ 0.125,
481
+ 0.0625,
482
+ 0.3125,
483
+ 0.125,
484
+ 0.25,
485
+ 0.25,
486
+ 0.3125,
487
+ 0.125,
488
+ 0.375,
489
+ 0.1875,
490
+ 0.375,
491
+ 0.375,
492
+ 0.375,
493
+ 0.25,
494
+ 0.4375,
495
+ 0.5,
496
+ 0.5,
497
+ 0.5625,
498
+ 0.25,
499
+ -0.0625,
500
+ 0.3125,
501
+ 0.4375,
502
+ 0.125,
503
+ 0.4375,
504
+ 0.4375,
505
+ 0.375,
506
+ -0.0625,
507
+ 0.1875,
508
+ 0.5625,
509
+ 0.1875,
510
+ 0.3125,
511
+ 0.1875,
512
+ 0.25,
513
+ 0.1875,
514
+ 0.375,
515
+ 0.3125,
516
+ 0.5625,
517
+ 0.1875,
518
+ 0.4375,
519
+ 0.5625,
520
+ 0.1875,
521
+ 0.375,
522
+ 0.3125,
523
+ 0.4375,
524
+ 0.5625,
525
+ 0.0
526
+ ],
527
+ "mean": [
528
+ -0.05909007787704468,
529
+ -0.12453675270080566,
530
+ -0.01780678704380989,
531
+ -0.16097033023834229,
532
+ -0.18779605627059937,
533
+ -0.08800307661294937,
534
+ 0.25408294796943665,
535
+ 0.19637517631053925,
536
+ -0.02542746439576149,
537
+ -0.21233749389648438,
538
+ -0.10003577917814255,
539
+ -0.2185034602880478,
540
+ -0.12148252129554749,
541
+ -0.0835186317563057,
542
+ -0.3871306777000427,
543
+ -0.000689013279043138,
544
+ -0.05048113688826561,
545
+ -0.05405506119132042,
546
+ -0.09686227142810822,
547
+ -0.29744818806648254,
548
+ -0.02790983021259308,
549
+ -0.22964908182621002,
550
+ -0.19564782083034515,
551
+ -0.233476459980011,
552
+ -0.09457159787416458,
553
+ -0.1688774675130844,
554
+ 0.12882205843925476,
555
+ -0.23097918927669525,
556
+ 0.07296938449144363,
557
+ 0.06667029112577438,
558
+ 0.14194951951503754,
559
+ -0.11792350560426712,
560
+ 0.2987644672393799,
561
+ 0.1758815497159958,
562
+ 0.14483655989170074,
563
+ 0.07765190303325653,
564
+ -0.1703023612499237,
565
+ -0.31876927614212036,
566
+ 0.11987659335136414,
567
+ 0.13128630816936493,
568
+ -0.08386579900979996,
569
+ -0.09540224820375443,
570
+ 0.058435142040252686,
571
+ -0.006829431280493736,
572
+ -0.31632739305496216,
573
+ -0.023000476881861687,
574
+ 0.21785704791545868,
575
+ -0.10807923972606659,
576
+ -0.01691969484090805,
577
+ -0.0048369369469583035,
578
+ -0.06753820925951004,
579
+ -0.14284193515777588,
580
+ 0.2129189521074295,
581
+ 0.015633253380656242,
582
+ 0.22632861137390137,
583
+ -0.14845414459705353,
584
+ 0.029541311785578728,
585
+ 0.29734915494918823,
586
+ -0.022820502519607544,
587
+ -0.14013485610485077,
588
+ 0.034568872302770615,
589
+ 0.0991114154458046,
590
+ 0.30694955587387085,
591
+ -0.22556506097316742
592
+ ],
593
+ "std": [
594
+ 0.0659003034234047,
595
+ 0.10037083178758621,
596
+ 0.07181278616189957,
597
+ 0.08773662894964218,
598
+ 0.17245645821094513,
599
+ 0.0585007518529892,
600
+ 0.09713392704725266,
601
+ 0.07830599695444107,
602
+ 0.1365525871515274,
603
+ 0.15700660645961761,
604
+ 0.09677070379257202,
605
+ 0.0688071995973587,
606
+ 0.05875665321946144,
607
+ 0.11583787202835083,
608
+ 0.11943700164556503,
609
+ 0.08188900351524353,
610
+ 0.06468821316957474,
611
+ 0.06906170397996902,
612
+ 0.09798441082239151,
613
+ 0.107750304043293,
614
+ 0.19952501356601715,
615
+ 0.1263345628976822,
616
+ 0.21770258247852325,
617
+ 0.14369164407253265,
618
+ 0.08769412338733673,
619
+ 0.1116272583603859,
620
+ 0.058447372168302536,
621
+ 0.18122409284114838,
622
+ 0.06928596645593643,
623
+ 0.16142737865447998,
624
+ 0.11108831316232681,
625
+ 0.06649158895015717,
626
+ 0.07022565603256226,
627
+ 0.10431908816099167,
628
+ 0.12881824374198914,
629
+ 0.32292255759239197,
630
+ 0.14586171507835388,
631
+ 0.08622725307941437,
632
+ 0.08811167627573013,
633
+ 0.06192754954099655,
634
+ 0.07485736161470413,
635
+ 0.27316814661026,
636
+ 0.13828617334365845,
637
+ 0.10271976888179779,
638
+ 0.06276937574148178,
639
+ 0.0647144541144371,
640
+ 0.08778728544712067,
641
+ 0.06371328979730606,
642
+ 0.1188734695315361,
643
+ 0.06812839955091476,
644
+ 0.09292089939117432,
645
+ 0.15349407494068146,
646
+ 0.069960817694664,
647
+ 0.08846410363912582,
648
+ 0.08715438097715378,
649
+ 0.07486134022474289,
650
+ 0.09785888344049454,
651
+ 0.1145324558019638,
652
+ 0.06591252237558365,
653
+ 0.20452545583248138,
654
+ 0.07274971902370453,
655
+ 0.09519387036561966,
656
+ 0.11427336931228638,
657
+ 0.11966678500175476
658
+ ],
659
+ "q01": [
660
+ -0.25,
661
+ -0.375,
662
+ -0.1875,
663
+ -0.375,
664
+ -0.5625,
665
+ -0.1875,
666
+ 0.0,
667
+ 0.0,
668
+ -0.3125,
669
+ -0.625,
670
+ -0.3125,
671
+ -0.3125,
672
+ -0.25,
673
+ -0.3125,
674
+ -0.625,
675
+ -0.1875,
676
+ -0.25,
677
+ -0.1875,
678
+ -0.3125,
679
+ -0.5,
680
+ -0.375,
681
+ -0.4375,
682
+ -0.5625,
683
+ -0.4375,
684
+ -0.3125,
685
+ -0.5,
686
+ -0.0625,
687
+ -0.5,
688
+ -0.0625,
689
+ -0.1875,
690
+ -0.0625,
691
+ -0.25,
692
+ 0.125,
693
+ -0.0625,
694
+ -0.0625,
695
+ -0.5,
696
+ -0.5625,
697
+ -0.5625,
698
+ -0.0625,
699
+ 0.0,
700
+ -0.25,
701
+ -0.5625,
702
+ -0.1875,
703
+ -0.25,
704
+ -0.4375,
705
+ -0.1875,
706
+ 0.0625,
707
+ -0.25,
708
+ -0.375,
709
+ -0.1875,
710
+ -0.3125,
711
+ -0.5,
712
+ 0.0625,
713
+ -0.1875,
714
+ 0.0625,
715
+ -0.375,
716
+ -0.375,
717
+ 0.0625,
718
+ -0.1875,
719
+ -0.4375,
720
+ -0.1875,
721
+ -0.0625,
722
+ 0.125,
723
+ -0.4375
724
+ ],
725
+ "q99": [
726
+ 0.125,
727
+ 0.0625,
728
+ 0.125,
729
+ 0.0625,
730
+ 0.125,
731
+ 0.0625,
732
+ 0.375,
733
+ 0.375,
734
+ 0.1875,
735
+ 0.0625,
736
+ 0.0625,
737
+ -0.0625,
738
+ 0.0625,
739
+ 0.1875,
740
+ -0.125,
741
+ 0.1875,
742
+ 0.125,
743
+ 0.125,
744
+ 0.0625,
745
+ -0.0625,
746
+ 0.3125,
747
+ 0.0,
748
+ 0.1875,
749
+ 0.125,
750
+ 0.0625,
751
+ 0.0,
752
+ 0.25,
753
+ 0.125,
754
+ 0.25,
755
+ 0.3125,
756
+ 0.3125,
757
+ 0.125,
758
+ 0.375,
759
+ 0.4375,
760
+ 0.4375,
761
+ 0.5,
762
+ 0.125,
763
+ -0.125,
764
+ 0.25,
765
+ 0.25,
766
+ 0.0625,
767
+ 0.3125,
768
+ 0.3125,
769
+ 0.25,
770
+ -0.125,
771
+ 0.125,
772
+ 0.4375,
773
+ 0.0,
774
+ 0.1875,
775
+ 0.125,
776
+ 0.125,
777
+ 0.125,
778
+ 0.375,
779
+ 0.1875,
780
+ 0.5,
781
+ 0.0625,
782
+ 0.25,
783
+ 0.5,
784
+ 0.125,
785
+ 0.1875,
786
+ 0.25,
787
+ 0.3125,
788
+ 0.5,
789
+ 0.0
790
+ ]
791
+ },
792
+ "left_hand_joints": {
793
+ "min": [
794
+ 0.0,
795
+ 0.0,
796
+ 0.0,
797
+ 0.0,
798
+ 0.0,
799
+ 0.0,
800
+ 0.0
801
+ ],
802
+ "max": [
803
+ 0.0,
804
+ 0.0,
805
+ 0.0,
806
+ 0.0,
807
+ 0.0,
808
+ 0.0,
809
+ 0.0
810
+ ],
811
+ "mean": [
812
+ 0.0,
813
+ 0.0,
814
+ 0.0,
815
+ 0.0,
816
+ 0.0,
817
+ 0.0,
818
+ 0.0
819
+ ],
820
+ "std": [
821
+ 0.0,
822
+ 0.0,
823
+ 0.0,
824
+ 0.0,
825
+ 0.0,
826
+ 0.0,
827
+ 0.0
828
+ ],
829
+ "q01": [
830
+ 0.0,
831
+ 0.0,
832
+ 0.0,
833
+ 0.0,
834
+ 0.0,
835
+ 0.0,
836
+ 0.0
837
+ ],
838
+ "q99": [
839
+ 0.0,
840
+ 0.0,
841
+ 0.0,
842
+ 0.0,
843
+ 0.0,
844
+ 0.0,
845
+ 0.0
846
+ ]
847
+ },
848
+ "right_hand_joints": {
849
+ "min": [
850
+ 0.0,
851
+ 0.0,
852
+ 0.0,
853
+ 0.0,
854
+ 0.0,
855
+ 0.0,
856
+ 0.0
857
+ ],
858
+ "max": [
859
+ 0.0,
860
+ 0.0,
861
+ 0.0,
862
+ 0.0,
863
+ 0.0,
864
+ 0.0,
865
+ 0.0
866
+ ],
867
+ "mean": [
868
+ 0.0,
869
+ 0.0,
870
+ 0.0,
871
+ 0.0,
872
+ 0.0,
873
+ 0.0,
874
+ 0.0
875
+ ],
876
+ "std": [
877
+ 0.0,
878
+ 0.0,
879
+ 0.0,
880
+ 0.0,
881
+ 0.0,
882
+ 0.0,
883
+ 0.0
884
+ ],
885
+ "q01": [
886
+ 0.0,
887
+ 0.0,
888
+ 0.0,
889
+ 0.0,
890
+ 0.0,
891
+ 0.0,
892
+ 0.0
893
+ ],
894
+ "q99": [
895
+ 0.0,
896
+ 0.0,
897
+ 0.0,
898
+ 0.0,
899
+ 0.0,
900
+ 0.0,
901
+ 0.0
902
+ ]
903
+ }
904
+ },
905
+ "relative_action": {}
906
+ }
907
+ }
checkpoint-4000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d7",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Cosmos-Reason2-2B",
5
+ "backbone_model_type": "qwen",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 0,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": false,
15
+ "backbone_trainable_params_fp32": true,
16
+ "extra_augmentation_config": null,
17
+ "apply_sincos_state_encoding": false,
18
+ "use_percentiles": true,
19
+ "use_relative_action": false,
20
+ "max_state_dim": 132,
21
+ "max_action_dim": 132,
22
+ "action_horizon": 40,
23
+ "hidden_size": 1024,
24
+ "input_embedding_dim": 1536,
25
+ "state_history_length": 1,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.2,
52
+ "exclude_state": false,
53
+ "use_mean_std": false,
54
+ "max_num_embodiments": 32
55
+ }
checkpoint-4000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:582e5f80e78bfb43635bc5bc857dc269739b5f124f23b7af101046312930f337
3
+ size 4861568625
checkpoint-4000/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:badcd116f10cc9aec2be7bfee24f6d33d5405e618319ddc552357e569f6bfa2a
3
+ size 4861568369
checkpoint-4000/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc24e5c7d6727603352ccadaeed4a3bdddd8fe8f582d8f5e395819ada10631c4
3
+ size 4861566321
checkpoint-4000/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2cea808ea4331565e6f32378673bd4c775d1c070a136d10a1b56594d4a9b4fb
3
+ size 4861563121
checkpoint-4000/global_step4000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0097b1f4f857f3a5f76b90cdeb3e0a5c1834135bb586a9e4dcdc1625a3cdb283
3
+ size 9335640879
checkpoint-4000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step4000
checkpoint-4000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:223253fd834ad957a6a9443ebd39b021d059f391df5ba0553b027999764f02c6
3
+ size 4990519232
checkpoint-4000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0927bcc7b2ff77a0272177c79cc53ec6a7b9f571b1007e32f414e94c8018950b
3
+ size 1919980184
checkpoint-4000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4000/processor_config.json ADDED
@@ -0,0 +1,1159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d7Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "real_g1_relative_eef_relative_joints": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -20,
9
+ 0
10
+ ],
11
+ "modality_keys": [
12
+ "ego_view"
13
+ ],
14
+ "sin_cos_embedding_keys": null,
15
+ "mean_std_embedding_keys": null,
16
+ "action_configs": null
17
+ },
18
+ "state": {
19
+ "delta_indices": [
20
+ 0
21
+ ],
22
+ "modality_keys": [
23
+ "left_wrist_eef_9d",
24
+ "right_wrist_eef_9d",
25
+ "left_hand",
26
+ "right_hand",
27
+ "left_arm",
28
+ "right_arm",
29
+ "waist"
30
+ ],
31
+ "sin_cos_embedding_keys": null,
32
+ "mean_std_embedding_keys": null,
33
+ "action_configs": null
34
+ },
35
+ "action": {
36
+ "delta_indices": [
37
+ 0,
38
+ 1,
39
+ 2,
40
+ 3,
41
+ 4,
42
+ 5,
43
+ 6,
44
+ 7,
45
+ 8,
46
+ 9,
47
+ 10,
48
+ 11,
49
+ 12,
50
+ 13,
51
+ 14,
52
+ 15,
53
+ 16,
54
+ 17,
55
+ 18,
56
+ 19,
57
+ 20,
58
+ 21,
59
+ 22,
60
+ 23,
61
+ 24,
62
+ 25,
63
+ 26,
64
+ 27,
65
+ 28,
66
+ 29,
67
+ 30,
68
+ 31,
69
+ 32,
70
+ 33,
71
+ 34,
72
+ 35,
73
+ 36,
74
+ 37,
75
+ 38,
76
+ 39
77
+ ],
78
+ "modality_keys": [
79
+ "left_wrist_eef_9d",
80
+ "right_wrist_eef_9d",
81
+ "left_hand",
82
+ "right_hand",
83
+ "left_arm",
84
+ "right_arm",
85
+ "waist",
86
+ "base_height_command",
87
+ "navigate_command"
88
+ ],
89
+ "sin_cos_embedding_keys": null,
90
+ "mean_std_embedding_keys": null,
91
+ "action_configs": [
92
+ {
93
+ "rep": "RELATIVE",
94
+ "type": "EEF",
95
+ "format": "XYZ_ROT6D",
96
+ "state_key": "left_wrist_eef_9d"
97
+ },
98
+ {
99
+ "rep": "RELATIVE",
100
+ "type": "EEF",
101
+ "format": "XYZ_ROT6D",
102
+ "state_key": "right_wrist_eef_9d"
103
+ },
104
+ {
105
+ "rep": "ABSOLUTE",
106
+ "type": "NON_EEF",
107
+ "format": "DEFAULT",
108
+ "state_key": "left_hand"
109
+ },
110
+ {
111
+ "rep": "ABSOLUTE",
112
+ "type": "NON_EEF",
113
+ "format": "DEFAULT",
114
+ "state_key": "right_hand"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "left_arm"
121
+ },
122
+ {
123
+ "rep": "RELATIVE",
124
+ "type": "NON_EEF",
125
+ "format": "DEFAULT",
126
+ "state_key": "right_arm"
127
+ },
128
+ {
129
+ "rep": "ABSOLUTE",
130
+ "type": "NON_EEF",
131
+ "format": "DEFAULT",
132
+ "state_key": "waist"
133
+ },
134
+ {
135
+ "rep": "ABSOLUTE",
136
+ "type": "NON_EEF",
137
+ "format": "DEFAULT",
138
+ "state_key": "base_height_command"
139
+ },
140
+ {
141
+ "rep": "ABSOLUTE",
142
+ "type": "NON_EEF",
143
+ "format": "DEFAULT",
144
+ "state_key": "navigate_command"
145
+ }
146
+ ]
147
+ },
148
+ "language": {
149
+ "delta_indices": [
150
+ 0
151
+ ],
152
+ "modality_keys": [
153
+ "annotation.human.task_description"
154
+ ],
155
+ "sin_cos_embedding_keys": null,
156
+ "mean_std_embedding_keys": null,
157
+ "action_configs": null
158
+ }
159
+ },
160
+ "real_r1_pro_sharpa_relative_eef_mecka": {
161
+ "video": {
162
+ "delta_indices": [
163
+ -30,
164
+ 0
165
+ ],
166
+ "modality_keys": [
167
+ "ego_view_cropratio_res320x240_freq30"
168
+ ],
169
+ "sin_cos_embedding_keys": null,
170
+ "mean_std_embedding_keys": null,
171
+ "action_configs": null
172
+ },
173
+ "state": {
174
+ "delta_indices": [
175
+ 0
176
+ ],
177
+ "modality_keys": [
178
+ "left_wrist_eef",
179
+ "right_wrist_eef",
180
+ "left_hand_joints",
181
+ "right_hand_joints"
182
+ ],
183
+ "sin_cos_embedding_keys": null,
184
+ "mean_std_embedding_keys": null,
185
+ "action_configs": null
186
+ },
187
+ "action": {
188
+ "delta_indices": [
189
+ 0,
190
+ 1,
191
+ 2,
192
+ 3,
193
+ 4,
194
+ 5,
195
+ 6,
196
+ 7,
197
+ 8,
198
+ 9,
199
+ 10,
200
+ 11,
201
+ 12,
202
+ 13,
203
+ 14,
204
+ 15,
205
+ 16,
206
+ 17,
207
+ 18,
208
+ 19,
209
+ 20,
210
+ 21,
211
+ 22,
212
+ 23,
213
+ 24,
214
+ 25,
215
+ 26,
216
+ 27,
217
+ 28,
218
+ 29,
219
+ 30,
220
+ 31,
221
+ 32,
222
+ 33,
223
+ 34,
224
+ 35,
225
+ 36,
226
+ 37,
227
+ 38,
228
+ 39
229
+ ],
230
+ "modality_keys": [
231
+ "left_wrist_eef",
232
+ "right_wrist_eef",
233
+ "left_hand_joints",
234
+ "right_hand_joints"
235
+ ],
236
+ "sin_cos_embedding_keys": null,
237
+ "mean_std_embedding_keys": null,
238
+ "action_configs": [
239
+ {
240
+ "rep": "RELATIVE",
241
+ "type": "EEF",
242
+ "format": "XYZ_ROT6D",
243
+ "state_key": "left_wrist_eef"
244
+ },
245
+ {
246
+ "rep": "RELATIVE",
247
+ "type": "EEF",
248
+ "format": "XYZ_ROT6D",
249
+ "state_key": "right_wrist_eef"
250
+ },
251
+ {
252
+ "rep": "ABSOLUTE",
253
+ "type": "NON_EEF",
254
+ "format": "DEFAULT",
255
+ "state_key": "left_hand_joints"
256
+ },
257
+ {
258
+ "rep": "ABSOLUTE",
259
+ "type": "NON_EEF",
260
+ "format": "DEFAULT",
261
+ "state_key": "right_hand_joints"
262
+ }
263
+ ]
264
+ },
265
+ "language": {
266
+ "delta_indices": [
267
+ 0
268
+ ],
269
+ "modality_keys": [
270
+ "annotation.human.coarse_action"
271
+ ],
272
+ "sin_cos_embedding_keys": null,
273
+ "mean_std_embedding_keys": null,
274
+ "action_configs": null
275
+ }
276
+ },
277
+ "real_r1_pro_sharpa_relative_eef_human": {
278
+ "video": {
279
+ "delta_indices": [
280
+ -20,
281
+ 0
282
+ ],
283
+ "modality_keys": [
284
+ "ego_view_res320x240_freq20",
285
+ "left_wrist_view_res320x240_freq20",
286
+ "right_wrist_view_res320x240_freq20"
287
+ ],
288
+ "sin_cos_embedding_keys": null,
289
+ "mean_std_embedding_keys": null,
290
+ "action_configs": null
291
+ },
292
+ "state": {
293
+ "delta_indices": [
294
+ 0
295
+ ],
296
+ "modality_keys": [
297
+ "left_wrist_eef",
298
+ "right_wrist_eef",
299
+ "left_hand_joints",
300
+ "right_hand_joints"
301
+ ],
302
+ "sin_cos_embedding_keys": null,
303
+ "mean_std_embedding_keys": null,
304
+ "action_configs": null
305
+ },
306
+ "action": {
307
+ "delta_indices": [
308
+ 0,
309
+ 1,
310
+ 2,
311
+ 3,
312
+ 4,
313
+ 5,
314
+ 6,
315
+ 7,
316
+ 8,
317
+ 9,
318
+ 10,
319
+ 11,
320
+ 12,
321
+ 13,
322
+ 14,
323
+ 15,
324
+ 16,
325
+ 17,
326
+ 18,
327
+ 19,
328
+ 20,
329
+ 21,
330
+ 22,
331
+ 23,
332
+ 24,
333
+ 25,
334
+ 26,
335
+ 27,
336
+ 28,
337
+ 29,
338
+ 30,
339
+ 31,
340
+ 32,
341
+ 33,
342
+ 34,
343
+ 35,
344
+ 36,
345
+ 37,
346
+ 38,
347
+ 39
348
+ ],
349
+ "modality_keys": [
350
+ "left_wrist_eef",
351
+ "right_wrist_eef",
352
+ "left_hand_joints",
353
+ "right_hand_joints"
354
+ ],
355
+ "sin_cos_embedding_keys": null,
356
+ "mean_std_embedding_keys": null,
357
+ "action_configs": [
358
+ {
359
+ "rep": "RELATIVE",
360
+ "type": "EEF",
361
+ "format": "XYZ_ROT6D",
362
+ "state_key": "left_wrist_eef"
363
+ },
364
+ {
365
+ "rep": "RELATIVE",
366
+ "type": "EEF",
367
+ "format": "XYZ_ROT6D",
368
+ "state_key": "right_wrist_eef"
369
+ },
370
+ {
371
+ "rep": "ABSOLUTE",
372
+ "type": "NON_EEF",
373
+ "format": "DEFAULT",
374
+ "state_key": "left_hand_joints"
375
+ },
376
+ {
377
+ "rep": "ABSOLUTE",
378
+ "type": "NON_EEF",
379
+ "format": "DEFAULT",
380
+ "state_key": "right_hand_joints"
381
+ }
382
+ ]
383
+ },
384
+ "language": {
385
+ "delta_indices": [
386
+ 0
387
+ ],
388
+ "modality_keys": [
389
+ "annotation.human.coarse_action"
390
+ ],
391
+ "sin_cos_embedding_keys": null,
392
+ "mean_std_embedding_keys": null,
393
+ "action_configs": null
394
+ }
395
+ },
396
+ "real_r1_pro_sharpa_relative_eef": {
397
+ "video": {
398
+ "delta_indices": [
399
+ -20,
400
+ 0
401
+ ],
402
+ "modality_keys": [
403
+ "ego_view_res320x240_freq20",
404
+ "left_wrist_view_res320x240_freq20",
405
+ "right_wrist_view_res320x240_freq20"
406
+ ],
407
+ "sin_cos_embedding_keys": null,
408
+ "mean_std_embedding_keys": null,
409
+ "action_configs": null
410
+ },
411
+ "state": {
412
+ "delta_indices": [
413
+ 0
414
+ ],
415
+ "modality_keys": [
416
+ "left_wrist_eef",
417
+ "right_wrist_eef",
418
+ "left_hand_joints",
419
+ "right_hand_joints"
420
+ ],
421
+ "sin_cos_embedding_keys": null,
422
+ "mean_std_embedding_keys": null,
423
+ "action_configs": null
424
+ },
425
+ "action": {
426
+ "delta_indices": [
427
+ 0,
428
+ 1,
429
+ 2,
430
+ 3,
431
+ 4,
432
+ 5,
433
+ 6,
434
+ 7,
435
+ 8,
436
+ 9,
437
+ 10,
438
+ 11,
439
+ 12,
440
+ 13,
441
+ 14,
442
+ 15,
443
+ 16,
444
+ 17,
445
+ 18,
446
+ 19,
447
+ 20,
448
+ 21,
449
+ 22,
450
+ 23,
451
+ 24,
452
+ 25,
453
+ 26,
454
+ 27,
455
+ 28,
456
+ 29,
457
+ 30,
458
+ 31,
459
+ 32,
460
+ 33,
461
+ 34,
462
+ 35,
463
+ 36,
464
+ 37,
465
+ 38,
466
+ 39
467
+ ],
468
+ "modality_keys": [
469
+ "left_wrist_eef",
470
+ "right_wrist_eef",
471
+ "left_hand_joints",
472
+ "right_hand_joints"
473
+ ],
474
+ "sin_cos_embedding_keys": null,
475
+ "mean_std_embedding_keys": null,
476
+ "action_configs": [
477
+ {
478
+ "rep": "RELATIVE",
479
+ "type": "EEF",
480
+ "format": "XYZ_ROT6D",
481
+ "state_key": "left_wrist_eef"
482
+ },
483
+ {
484
+ "rep": "RELATIVE",
485
+ "type": "EEF",
486
+ "format": "XYZ_ROT6D",
487
+ "state_key": "right_wrist_eef"
488
+ },
489
+ {
490
+ "rep": "ABSOLUTE",
491
+ "type": "NON_EEF",
492
+ "format": "DEFAULT",
493
+ "state_key": "left_hand_joints"
494
+ },
495
+ {
496
+ "rep": "ABSOLUTE",
497
+ "type": "NON_EEF",
498
+ "format": "DEFAULT",
499
+ "state_key": "right_hand_joints"
500
+ }
501
+ ]
502
+ },
503
+ "language": {
504
+ "delta_indices": [
505
+ 0
506
+ ],
507
+ "modality_keys": [
508
+ "annotation.human.coarse_action"
509
+ ],
510
+ "sin_cos_embedding_keys": null,
511
+ "mean_std_embedding_keys": null,
512
+ "action_configs": null
513
+ }
514
+ },
515
+ "xdof_relative_eef_relative_joint": {
516
+ "video": {
517
+ "delta_indices": [
518
+ -30,
519
+ 0
520
+ ],
521
+ "modality_keys": [
522
+ "top_camera-images-rgb_320_240",
523
+ "left_camera-images-rgb_320_240",
524
+ "right_camera-images-rgb_320_240"
525
+ ],
526
+ "sin_cos_embedding_keys": null,
527
+ "mean_std_embedding_keys": null,
528
+ "action_configs": null
529
+ },
530
+ "state": {
531
+ "delta_indices": [
532
+ 0
533
+ ],
534
+ "modality_keys": [
535
+ "left_wrist_eef",
536
+ "right_wrist_eef",
537
+ "left_gripper_pos",
538
+ "right_gripper_pos",
539
+ "left_joint_pos",
540
+ "right_joint_pos"
541
+ ],
542
+ "sin_cos_embedding_keys": null,
543
+ "mean_std_embedding_keys": null,
544
+ "action_configs": null
545
+ },
546
+ "action": {
547
+ "delta_indices": [
548
+ 0,
549
+ 1,
550
+ 2,
551
+ 3,
552
+ 4,
553
+ 5,
554
+ 6,
555
+ 7,
556
+ 8,
557
+ 9,
558
+ 10,
559
+ 11,
560
+ 12,
561
+ 13,
562
+ 14,
563
+ 15,
564
+ 16,
565
+ 17,
566
+ 18,
567
+ 19,
568
+ 20,
569
+ 21,
570
+ 22,
571
+ 23,
572
+ 24,
573
+ 25,
574
+ 26,
575
+ 27,
576
+ 28,
577
+ 29,
578
+ 30,
579
+ 31,
580
+ 32,
581
+ 33,
582
+ 34,
583
+ 35,
584
+ 36,
585
+ 37,
586
+ 38,
587
+ 39
588
+ ],
589
+ "modality_keys": [
590
+ "left_wrist_eef",
591
+ "right_wrist_eef",
592
+ "left_gripper_pos",
593
+ "right_gripper_pos",
594
+ "left_joint_pos",
595
+ "right_joint_pos"
596
+ ],
597
+ "sin_cos_embedding_keys": null,
598
+ "mean_std_embedding_keys": null,
599
+ "action_configs": [
600
+ {
601
+ "rep": "RELATIVE",
602
+ "type": "EEF",
603
+ "format": "XYZ_ROT6D",
604
+ "state_key": "left_wrist_eef"
605
+ },
606
+ {
607
+ "rep": "RELATIVE",
608
+ "type": "EEF",
609
+ "format": "XYZ_ROT6D",
610
+ "state_key": "right_wrist_eef"
611
+ },
612
+ {
613
+ "rep": "ABSOLUTE",
614
+ "type": "NON_EEF",
615
+ "format": "DEFAULT",
616
+ "state_key": "left_gripper_pos"
617
+ },
618
+ {
619
+ "rep": "ABSOLUTE",
620
+ "type": "NON_EEF",
621
+ "format": "DEFAULT",
622
+ "state_key": "right_gripper_pos"
623
+ },
624
+ {
625
+ "rep": "RELATIVE",
626
+ "type": "NON_EEF",
627
+ "format": "DEFAULT",
628
+ "state_key": "left_joint_pos"
629
+ },
630
+ {
631
+ "rep": "RELATIVE",
632
+ "type": "NON_EEF",
633
+ "format": "DEFAULT",
634
+ "state_key": "right_joint_pos"
635
+ }
636
+ ]
637
+ },
638
+ "language": {
639
+ "delta_indices": [
640
+ 0
641
+ ],
642
+ "modality_keys": [
643
+ "annotation.task"
644
+ ],
645
+ "sin_cos_embedding_keys": null,
646
+ "mean_std_embedding_keys": null,
647
+ "action_configs": null
648
+ }
649
+ },
650
+ "real_r1_pro_sharpa_relative_eef_maxinsights": {
651
+ "video": {
652
+ "delta_indices": [
653
+ -30,
654
+ 0
655
+ ],
656
+ "modality_keys": [
657
+ "ego_view_cropratio_res320x240_freq30"
658
+ ],
659
+ "sin_cos_embedding_keys": null,
660
+ "mean_std_embedding_keys": null,
661
+ "action_configs": null
662
+ },
663
+ "state": {
664
+ "delta_indices": [
665
+ 0
666
+ ],
667
+ "modality_keys": [
668
+ "left_wrist_eef",
669
+ "right_wrist_eef",
670
+ "left_hand_joints",
671
+ "right_hand_joints"
672
+ ],
673
+ "sin_cos_embedding_keys": null,
674
+ "mean_std_embedding_keys": null,
675
+ "action_configs": null
676
+ },
677
+ "action": {
678
+ "delta_indices": [
679
+ 0,
680
+ 1,
681
+ 2,
682
+ 3,
683
+ 4,
684
+ 5,
685
+ 6,
686
+ 7,
687
+ 8,
688
+ 9,
689
+ 10,
690
+ 11,
691
+ 12,
692
+ 13,
693
+ 14,
694
+ 15,
695
+ 16,
696
+ 17,
697
+ 18,
698
+ 19,
699
+ 20,
700
+ 21,
701
+ 22,
702
+ 23,
703
+ 24,
704
+ 25,
705
+ 26,
706
+ 27,
707
+ 28,
708
+ 29,
709
+ 30,
710
+ 31,
711
+ 32,
712
+ 33,
713
+ 34,
714
+ 35,
715
+ 36,
716
+ 37,
717
+ 38,
718
+ 39
719
+ ],
720
+ "modality_keys": [
721
+ "left_wrist_eef",
722
+ "right_wrist_eef",
723
+ "left_hand_joints",
724
+ "right_hand_joints"
725
+ ],
726
+ "sin_cos_embedding_keys": null,
727
+ "mean_std_embedding_keys": null,
728
+ "action_configs": [
729
+ {
730
+ "rep": "RELATIVE",
731
+ "type": "EEF",
732
+ "format": "XYZ_ROT6D",
733
+ "state_key": "left_wrist_eef"
734
+ },
735
+ {
736
+ "rep": "RELATIVE",
737
+ "type": "EEF",
738
+ "format": "XYZ_ROT6D",
739
+ "state_key": "right_wrist_eef"
740
+ },
741
+ {
742
+ "rep": "ABSOLUTE",
743
+ "type": "NON_EEF",
744
+ "format": "DEFAULT",
745
+ "state_key": "left_hand_joints"
746
+ },
747
+ {
748
+ "rep": "ABSOLUTE",
749
+ "type": "NON_EEF",
750
+ "format": "DEFAULT",
751
+ "state_key": "right_hand_joints"
752
+ }
753
+ ]
754
+ },
755
+ "language": {
756
+ "delta_indices": [
757
+ 0
758
+ ],
759
+ "modality_keys": [
760
+ "annotation.human.coarse_action"
761
+ ],
762
+ "sin_cos_embedding_keys": null,
763
+ "mean_std_embedding_keys": null,
764
+ "action_configs": null
765
+ }
766
+ },
767
+ "xdof_relative_eef_relative_joint_subtask": {
768
+ "video": {
769
+ "delta_indices": [
770
+ -30,
771
+ 0
772
+ ],
773
+ "modality_keys": [
774
+ "top_camera-images-rgb_320_240",
775
+ "left_camera-images-rgb_320_240",
776
+ "right_camera-images-rgb_320_240"
777
+ ],
778
+ "sin_cos_embedding_keys": null,
779
+ "mean_std_embedding_keys": null,
780
+ "action_configs": null
781
+ },
782
+ "state": {
783
+ "delta_indices": [
784
+ 0
785
+ ],
786
+ "modality_keys": [
787
+ "left_wrist_eef",
788
+ "right_wrist_eef",
789
+ "left_gripper_pos",
790
+ "right_gripper_pos",
791
+ "left_joint_pos",
792
+ "right_joint_pos"
793
+ ],
794
+ "sin_cos_embedding_keys": null,
795
+ "mean_std_embedding_keys": null,
796
+ "action_configs": null
797
+ },
798
+ "action": {
799
+ "delta_indices": [
800
+ 0,
801
+ 1,
802
+ 2,
803
+ 3,
804
+ 4,
805
+ 5,
806
+ 6,
807
+ 7,
808
+ 8,
809
+ 9,
810
+ 10,
811
+ 11,
812
+ 12,
813
+ 13,
814
+ 14,
815
+ 15,
816
+ 16,
817
+ 17,
818
+ 18,
819
+ 19,
820
+ 20,
821
+ 21,
822
+ 22,
823
+ 23,
824
+ 24,
825
+ 25,
826
+ 26,
827
+ 27,
828
+ 28,
829
+ 29,
830
+ 30,
831
+ 31,
832
+ 32,
833
+ 33,
834
+ 34,
835
+ 35,
836
+ 36,
837
+ 37,
838
+ 38,
839
+ 39
840
+ ],
841
+ "modality_keys": [
842
+ "left_wrist_eef",
843
+ "right_wrist_eef",
844
+ "left_gripper_pos",
845
+ "right_gripper_pos",
846
+ "left_joint_pos",
847
+ "right_joint_pos"
848
+ ],
849
+ "sin_cos_embedding_keys": null,
850
+ "mean_std_embedding_keys": null,
851
+ "action_configs": [
852
+ {
853
+ "rep": "RELATIVE",
854
+ "type": "EEF",
855
+ "format": "XYZ_ROT6D",
856
+ "state_key": "left_wrist_eef"
857
+ },
858
+ {
859
+ "rep": "RELATIVE",
860
+ "type": "EEF",
861
+ "format": "XYZ_ROT6D",
862
+ "state_key": "right_wrist_eef"
863
+ },
864
+ {
865
+ "rep": "ABSOLUTE",
866
+ "type": "NON_EEF",
867
+ "format": "DEFAULT",
868
+ "state_key": "left_gripper_pos"
869
+ },
870
+ {
871
+ "rep": "ABSOLUTE",
872
+ "type": "NON_EEF",
873
+ "format": "DEFAULT",
874
+ "state_key": "right_gripper_pos"
875
+ },
876
+ {
877
+ "rep": "RELATIVE",
878
+ "type": "NON_EEF",
879
+ "format": "DEFAULT",
880
+ "state_key": "left_joint_pos"
881
+ },
882
+ {
883
+ "rep": "RELATIVE",
884
+ "type": "NON_EEF",
885
+ "format": "DEFAULT",
886
+ "state_key": "right_joint_pos"
887
+ }
888
+ ]
889
+ },
890
+ "language": {
891
+ "delta_indices": [
892
+ 0
893
+ ],
894
+ "modality_keys": [
895
+ "annotation.sub_task"
896
+ ],
897
+ "sin_cos_embedding_keys": null,
898
+ "mean_std_embedding_keys": null,
899
+ "action_configs": null
900
+ }
901
+ },
902
+ "oxe_droid_relative_eef_relative_joint": {
903
+ "video": {
904
+ "delta_indices": [
905
+ -15,
906
+ 0
907
+ ],
908
+ "modality_keys": [
909
+ "exterior_image_1_left",
910
+ "wrist_image_left"
911
+ ],
912
+ "sin_cos_embedding_keys": null,
913
+ "mean_std_embedding_keys": null,
914
+ "action_configs": null
915
+ },
916
+ "state": {
917
+ "delta_indices": [
918
+ 0
919
+ ],
920
+ "modality_keys": [
921
+ "eef_9d",
922
+ "gripper_position",
923
+ "joint_position"
924
+ ],
925
+ "sin_cos_embedding_keys": null,
926
+ "mean_std_embedding_keys": null,
927
+ "action_configs": null
928
+ },
929
+ "action": {
930
+ "delta_indices": [
931
+ 0,
932
+ 1,
933
+ 2,
934
+ 3,
935
+ 4,
936
+ 5,
937
+ 6,
938
+ 7,
939
+ 8,
940
+ 9,
941
+ 10,
942
+ 11,
943
+ 12,
944
+ 13,
945
+ 14,
946
+ 15,
947
+ 16,
948
+ 17,
949
+ 18,
950
+ 19,
951
+ 20,
952
+ 21,
953
+ 22,
954
+ 23,
955
+ 24,
956
+ 25,
957
+ 26,
958
+ 27,
959
+ 28,
960
+ 29,
961
+ 30,
962
+ 31,
963
+ 32,
964
+ 33,
965
+ 34,
966
+ 35,
967
+ 36,
968
+ 37,
969
+ 38,
970
+ 39
971
+ ],
972
+ "modality_keys": [
973
+ "eef_9d",
974
+ "gripper_position",
975
+ "joint_position"
976
+ ],
977
+ "sin_cos_embedding_keys": null,
978
+ "mean_std_embedding_keys": null,
979
+ "action_configs": [
980
+ {
981
+ "rep": "RELATIVE",
982
+ "type": "EEF",
983
+ "format": "XYZ_ROT6D",
984
+ "state_key": "eef_9d"
985
+ },
986
+ {
987
+ "rep": "ABSOLUTE",
988
+ "type": "NON_EEF",
989
+ "format": "DEFAULT",
990
+ "state_key": "gripper_position"
991
+ },
992
+ {
993
+ "rep": "RELATIVE",
994
+ "type": "NON_EEF",
995
+ "format": "DEFAULT",
996
+ "state_key": "joint_position"
997
+ }
998
+ ]
999
+ },
1000
+ "language": {
1001
+ "delta_indices": [
1002
+ 0
1003
+ ],
1004
+ "modality_keys": [
1005
+ "annotation.language.language_instruction"
1006
+ ],
1007
+ "sin_cos_embedding_keys": null,
1008
+ "mean_std_embedding_keys": null,
1009
+ "action_configs": null
1010
+ }
1011
+ },
1012
+ "unitree_g1_sonic": {
1013
+ "video": {
1014
+ "delta_indices": [
1015
+ 0
1016
+ ],
1017
+ "modality_keys": [
1018
+ "ego_view"
1019
+ ],
1020
+ "sin_cos_embedding_keys": null,
1021
+ "mean_std_embedding_keys": null,
1022
+ "action_configs": null
1023
+ },
1024
+ "state": {
1025
+ "delta_indices": [
1026
+ 0
1027
+ ],
1028
+ "modality_keys": [
1029
+ "left_leg",
1030
+ "right_leg",
1031
+ "waist",
1032
+ "left_arm",
1033
+ "right_arm",
1034
+ "left_hand",
1035
+ "right_hand",
1036
+ "projected_gravity"
1037
+ ],
1038
+ "sin_cos_embedding_keys": null,
1039
+ "mean_std_embedding_keys": null,
1040
+ "action_configs": null
1041
+ },
1042
+ "action": {
1043
+ "delta_indices": [
1044
+ 0,
1045
+ 1,
1046
+ 2,
1047
+ 3,
1048
+ 4,
1049
+ 5,
1050
+ 6,
1051
+ 7,
1052
+ 8,
1053
+ 9,
1054
+ 10,
1055
+ 11,
1056
+ 12,
1057
+ 13,
1058
+ 14,
1059
+ 15,
1060
+ 16,
1061
+ 17,
1062
+ 18,
1063
+ 19,
1064
+ 20,
1065
+ 21,
1066
+ 22,
1067
+ 23,
1068
+ 24,
1069
+ 25,
1070
+ 26,
1071
+ 27,
1072
+ 28,
1073
+ 29,
1074
+ 30,
1075
+ 31,
1076
+ 32,
1077
+ 33,
1078
+ 34,
1079
+ 35,
1080
+ 36,
1081
+ 37,
1082
+ 38,
1083
+ 39
1084
+ ],
1085
+ "modality_keys": [
1086
+ "motion_token",
1087
+ "left_hand_joints",
1088
+ "right_hand_joints"
1089
+ ],
1090
+ "sin_cos_embedding_keys": null,
1091
+ "mean_std_embedding_keys": null,
1092
+ "action_configs": [
1093
+ {
1094
+ "rep": "ABSOLUTE",
1095
+ "type": "NON_EEF",
1096
+ "format": "DEFAULT",
1097
+ "state_key": null
1098
+ },
1099
+ {
1100
+ "rep": "ABSOLUTE",
1101
+ "type": "NON_EEF",
1102
+ "format": "DEFAULT",
1103
+ "state_key": null
1104
+ },
1105
+ {
1106
+ "rep": "ABSOLUTE",
1107
+ "type": "NON_EEF",
1108
+ "format": "DEFAULT",
1109
+ "state_key": null
1110
+ }
1111
+ ]
1112
+ },
1113
+ "language": {
1114
+ "delta_indices": [
1115
+ 0
1116
+ ],
1117
+ "modality_keys": [
1118
+ "annotation.human.task_description"
1119
+ ],
1120
+ "sin_cos_embedding_keys": null,
1121
+ "mean_std_embedding_keys": null,
1122
+ "action_configs": null
1123
+ }
1124
+ }
1125
+ },
1126
+ "image_crop_size": [
1127
+ 230,
1128
+ 230
1129
+ ],
1130
+ "image_target_size": [
1131
+ 256,
1132
+ 256
1133
+ ],
1134
+ "use_albumentations": true,
1135
+ "random_rotation_angle": 0,
1136
+ "color_jitter_params": {
1137
+ "brightness": 0.3,
1138
+ "contrast": 0.4,
1139
+ "saturation": 0.5,
1140
+ "hue": 0.08
1141
+ },
1142
+ "shortest_image_edge": 256,
1143
+ "crop_fraction": 0.95,
1144
+ "letter_box_transform": false,
1145
+ "model_name": "nvidia/Cosmos-Reason2-2B",
1146
+ "model_type": "qwen",
1147
+ "formalize_language": true,
1148
+ "max_state_dim": 132,
1149
+ "max_action_dim": 132,
1150
+ "max_action_horizon": 40,
1151
+ "use_percentiles": true,
1152
+ "use_mean_std": false,
1153
+ "clip_outliers": true,
1154
+ "apply_sincos_state_encoding": false,
1155
+ "use_relative_action": true,
1156
+ "exclude_state": false,
1157
+ "state_dropout_prob": 0.2
1158
+ }
1159
+ }
checkpoint-4000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd98c31f766a536cd34304ee0ff7a51a96c12559376b8a6052786fdd30f30f97
3
+ size 15429
checkpoint-4000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:896041d7bb6c5bf734f3eef8de14c82d0bb5e9e622b1dc59a63a5ba28022b8b2
3
+ size 15429
checkpoint-4000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddec98d587e0d9aac50d7c7143e3a1842f9222278150810fae0b6ba8c2a23c57
3
+ size 15429
checkpoint-4000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab81c9532f34b639f63aeaaad2363b16763904453d85cc3191bfa03c9da1ce9a
3
+ size 15429
checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e735feb89f2e1201272c210a226e7690cb2b92d37a1c1eb9a843518a2be8b38f
3
+ size 1465
checkpoint-4000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4000/trainer_state.json ADDED
@@ -0,0 +1,2466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2,
6
+ "eval_steps": 500,
7
+ "global_step": 4000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "grad_norm": 0.16940702497959137,
14
+ "learning_rate": 9e-07,
15
+ "loss": 1.208,
16
+ "step": 10
17
+ },
18
+ {
19
+ "grad_norm": 0.14846493303775787,
20
+ "learning_rate": 1.9e-06,
21
+ "loss": 1.207,
22
+ "step": 20
23
+ },
24
+ {
25
+ "grad_norm": 0.1575060486793518,
26
+ "learning_rate": 2.9e-06,
27
+ "loss": 1.2031,
28
+ "step": 30
29
+ },
30
+ {
31
+ "grad_norm": 0.18646641075611115,
32
+ "learning_rate": 3.9e-06,
33
+ "loss": 1.19,
34
+ "step": 40
35
+ },
36
+ {
37
+ "grad_norm": 0.30270645022392273,
38
+ "learning_rate": 4.9000000000000005e-06,
39
+ "loss": 1.1729,
40
+ "step": 50
41
+ },
42
+ {
43
+ "grad_norm": 0.39245566725730896,
44
+ "learning_rate": 5.9e-06,
45
+ "loss": 1.1582,
46
+ "step": 60
47
+ },
48
+ {
49
+ "grad_norm": 0.44334590435028076,
50
+ "learning_rate": 6.900000000000001e-06,
51
+ "loss": 1.1293,
52
+ "step": 70
53
+ },
54
+ {
55
+ "grad_norm": 0.30264705419540405,
56
+ "learning_rate": 7.9e-06,
57
+ "loss": 1.1102,
58
+ "step": 80
59
+ },
60
+ {
61
+ "grad_norm": 0.24947984516620636,
62
+ "learning_rate": 8.9e-06,
63
+ "loss": 1.1029,
64
+ "step": 90
65
+ },
66
+ {
67
+ "grad_norm": 0.3442651927471161,
68
+ "learning_rate": 9.900000000000002e-06,
69
+ "loss": 1.0967,
70
+ "step": 100
71
+ },
72
+ {
73
+ "grad_norm": 0.2968611717224121,
74
+ "learning_rate": 1.09e-05,
75
+ "loss": 1.092,
76
+ "step": 110
77
+ },
78
+ {
79
+ "grad_norm": 0.8635988235473633,
80
+ "learning_rate": 1.19e-05,
81
+ "loss": 1.0971,
82
+ "step": 120
83
+ },
84
+ {
85
+ "grad_norm": 0.549736738204956,
86
+ "learning_rate": 1.29e-05,
87
+ "loss": 1.0877,
88
+ "step": 130
89
+ },
90
+ {
91
+ "grad_norm": 0.3879706859588623,
92
+ "learning_rate": 1.3900000000000002e-05,
93
+ "loss": 1.0965,
94
+ "step": 140
95
+ },
96
+ {
97
+ "grad_norm": 0.8023251295089722,
98
+ "learning_rate": 1.49e-05,
99
+ "loss": 1.0902,
100
+ "step": 150
101
+ },
102
+ {
103
+ "grad_norm": 0.3089507222175598,
104
+ "learning_rate": 1.59e-05,
105
+ "loss": 1.0938,
106
+ "step": 160
107
+ },
108
+ {
109
+ "grad_norm": 0.2845019996166229,
110
+ "learning_rate": 1.69e-05,
111
+ "loss": 1.0857,
112
+ "step": 170
113
+ },
114
+ {
115
+ "grad_norm": 0.5286039710044861,
116
+ "learning_rate": 1.79e-05,
117
+ "loss": 1.0811,
118
+ "step": 180
119
+ },
120
+ {
121
+ "grad_norm": 0.3651806712150574,
122
+ "learning_rate": 1.8900000000000002e-05,
123
+ "loss": 1.0711,
124
+ "step": 190
125
+ },
126
+ {
127
+ "grad_norm": 0.49551165103912354,
128
+ "learning_rate": 1.9900000000000003e-05,
129
+ "loss": 1.0568,
130
+ "step": 200
131
+ },
132
+ {
133
+ "grad_norm": 0.48400798439979553,
134
+ "learning_rate": 2.09e-05,
135
+ "loss": 1.0261,
136
+ "step": 210
137
+ },
138
+ {
139
+ "grad_norm": 0.5731498599052429,
140
+ "learning_rate": 2.19e-05,
141
+ "loss": 1.0099,
142
+ "step": 220
143
+ },
144
+ {
145
+ "grad_norm": 0.5158259868621826,
146
+ "learning_rate": 2.29e-05,
147
+ "loss": 0.9947,
148
+ "step": 230
149
+ },
150
+ {
151
+ "grad_norm": 0.8482366800308228,
152
+ "learning_rate": 2.39e-05,
153
+ "loss": 0.9701,
154
+ "step": 240
155
+ },
156
+ {
157
+ "grad_norm": 0.668360710144043,
158
+ "learning_rate": 2.4900000000000002e-05,
159
+ "loss": 0.933,
160
+ "step": 250
161
+ },
162
+ {
163
+ "grad_norm": 1.0341784954071045,
164
+ "learning_rate": 2.5900000000000003e-05,
165
+ "loss": 0.9101,
166
+ "step": 260
167
+ },
168
+ {
169
+ "grad_norm": 0.8576267957687378,
170
+ "learning_rate": 2.6900000000000003e-05,
171
+ "loss": 0.8772,
172
+ "step": 270
173
+ },
174
+ {
175
+ "grad_norm": 1.177884578704834,
176
+ "learning_rate": 2.7900000000000004e-05,
177
+ "loss": 0.8447,
178
+ "step": 280
179
+ },
180
+ {
181
+ "grad_norm": 1.2616709470748901,
182
+ "learning_rate": 2.8899999999999998e-05,
183
+ "loss": 0.8277,
184
+ "step": 290
185
+ },
186
+ {
187
+ "grad_norm": 0.9310820698738098,
188
+ "learning_rate": 2.9900000000000002e-05,
189
+ "loss": 0.8179,
190
+ "step": 300
191
+ },
192
+ {
193
+ "grad_norm": 0.9291635751724243,
194
+ "learning_rate": 3.09e-05,
195
+ "loss": 0.7966,
196
+ "step": 310
197
+ },
198
+ {
199
+ "grad_norm": 0.9610940217971802,
200
+ "learning_rate": 3.19e-05,
201
+ "loss": 0.7595,
202
+ "step": 320
203
+ },
204
+ {
205
+ "grad_norm": 1.082502841949463,
206
+ "learning_rate": 3.29e-05,
207
+ "loss": 0.7442,
208
+ "step": 330
209
+ },
210
+ {
211
+ "grad_norm": 1.0246247053146362,
212
+ "learning_rate": 3.3900000000000004e-05,
213
+ "loss": 0.7153,
214
+ "step": 340
215
+ },
216
+ {
217
+ "grad_norm": 1.1535388231277466,
218
+ "learning_rate": 3.49e-05,
219
+ "loss": 0.7008,
220
+ "step": 350
221
+ },
222
+ {
223
+ "grad_norm": 1.1344460248947144,
224
+ "learning_rate": 3.59e-05,
225
+ "loss": 0.6781,
226
+ "step": 360
227
+ },
228
+ {
229
+ "grad_norm": 1.0874427556991577,
230
+ "learning_rate": 3.69e-05,
231
+ "loss": 0.6773,
232
+ "step": 370
233
+ },
234
+ {
235
+ "grad_norm": 1.1591013669967651,
236
+ "learning_rate": 3.79e-05,
237
+ "loss": 0.6567,
238
+ "step": 380
239
+ },
240
+ {
241
+ "grad_norm": 1.2492725849151611,
242
+ "learning_rate": 3.8900000000000004e-05,
243
+ "loss": 0.6557,
244
+ "step": 390
245
+ },
246
+ {
247
+ "grad_norm": 1.333844542503357,
248
+ "learning_rate": 3.99e-05,
249
+ "loss": 0.6557,
250
+ "step": 400
251
+ },
252
+ {
253
+ "grad_norm": 1.157532811164856,
254
+ "learning_rate": 4.09e-05,
255
+ "loss": 0.6327,
256
+ "step": 410
257
+ },
258
+ {
259
+ "grad_norm": 1.0811901092529297,
260
+ "learning_rate": 4.19e-05,
261
+ "loss": 0.6181,
262
+ "step": 420
263
+ },
264
+ {
265
+ "grad_norm": 1.211959958076477,
266
+ "learning_rate": 4.29e-05,
267
+ "loss": 0.6222,
268
+ "step": 430
269
+ },
270
+ {
271
+ "grad_norm": 1.1791653633117676,
272
+ "learning_rate": 4.39e-05,
273
+ "loss": 0.6018,
274
+ "step": 440
275
+ },
276
+ {
277
+ "grad_norm": 1.483304500579834,
278
+ "learning_rate": 4.49e-05,
279
+ "loss": 0.5883,
280
+ "step": 450
281
+ },
282
+ {
283
+ "grad_norm": 1.136581540107727,
284
+ "learning_rate": 4.5900000000000004e-05,
285
+ "loss": 0.5781,
286
+ "step": 460
287
+ },
288
+ {
289
+ "grad_norm": 1.0122281312942505,
290
+ "learning_rate": 4.69e-05,
291
+ "loss": 0.5833,
292
+ "step": 470
293
+ },
294
+ {
295
+ "grad_norm": 1.294203519821167,
296
+ "learning_rate": 4.79e-05,
297
+ "loss": 0.5811,
298
+ "step": 480
299
+ },
300
+ {
301
+ "grad_norm": 1.036759614944458,
302
+ "learning_rate": 4.89e-05,
303
+ "loss": 0.5826,
304
+ "step": 490
305
+ },
306
+ {
307
+ "eval/loss": 0.540949667096138,
308
+ "step": 500
309
+ },
310
+ {
311
+ "grad_norm": 1.4752445220947266,
312
+ "learning_rate": 4.99e-05,
313
+ "loss": 0.5589,
314
+ "step": 500
315
+ },
316
+ {
317
+ "grad_norm": 0.9996066093444824,
318
+ "learning_rate": 5.0900000000000004e-05,
319
+ "loss": 0.5726,
320
+ "step": 510
321
+ },
322
+ {
323
+ "grad_norm": 1.1314283609390259,
324
+ "learning_rate": 5.19e-05,
325
+ "loss": 0.5489,
326
+ "step": 520
327
+ },
328
+ {
329
+ "grad_norm": 1.0463645458221436,
330
+ "learning_rate": 5.2900000000000005e-05,
331
+ "loss": 0.537,
332
+ "step": 530
333
+ },
334
+ {
335
+ "grad_norm": 1.1870821714401245,
336
+ "learning_rate": 5.390000000000001e-05,
337
+ "loss": 0.5419,
338
+ "step": 540
339
+ },
340
+ {
341
+ "grad_norm": 1.0127766132354736,
342
+ "learning_rate": 5.4900000000000006e-05,
343
+ "loss": 0.5383,
344
+ "step": 550
345
+ },
346
+ {
347
+ "grad_norm": 1.1530522108078003,
348
+ "learning_rate": 5.590000000000001e-05,
349
+ "loss": 0.5255,
350
+ "step": 560
351
+ },
352
+ {
353
+ "grad_norm": 1.6963386535644531,
354
+ "learning_rate": 5.69e-05,
355
+ "loss": 0.5248,
356
+ "step": 570
357
+ },
358
+ {
359
+ "grad_norm": 1.5842453241348267,
360
+ "learning_rate": 5.79e-05,
361
+ "loss": 0.5243,
362
+ "step": 580
363
+ },
364
+ {
365
+ "grad_norm": 1.3649457693099976,
366
+ "learning_rate": 5.89e-05,
367
+ "loss": 0.5147,
368
+ "step": 590
369
+ },
370
+ {
371
+ "grad_norm": 1.018904447555542,
372
+ "learning_rate": 5.99e-05,
373
+ "loss": 0.5042,
374
+ "step": 600
375
+ },
376
+ {
377
+ "grad_norm": 1.252278208732605,
378
+ "learning_rate": 6.09e-05,
379
+ "loss": 0.5213,
380
+ "step": 610
381
+ },
382
+ {
383
+ "grad_norm": 1.2415512800216675,
384
+ "learning_rate": 6.19e-05,
385
+ "loss": 0.4769,
386
+ "step": 620
387
+ },
388
+ {
389
+ "grad_norm": 1.3829114437103271,
390
+ "learning_rate": 6.29e-05,
391
+ "loss": 0.4806,
392
+ "step": 630
393
+ },
394
+ {
395
+ "grad_norm": 1.2860313653945923,
396
+ "learning_rate": 6.390000000000001e-05,
397
+ "loss": 0.4687,
398
+ "step": 640
399
+ },
400
+ {
401
+ "grad_norm": 1.1453088521957397,
402
+ "learning_rate": 6.49e-05,
403
+ "loss": 0.477,
404
+ "step": 650
405
+ },
406
+ {
407
+ "grad_norm": 1.2535901069641113,
408
+ "learning_rate": 6.59e-05,
409
+ "loss": 0.4541,
410
+ "step": 660
411
+ },
412
+ {
413
+ "grad_norm": 1.2619575262069702,
414
+ "learning_rate": 6.690000000000001e-05,
415
+ "loss": 0.4565,
416
+ "step": 670
417
+ },
418
+ {
419
+ "grad_norm": 1.1378668546676636,
420
+ "learning_rate": 6.790000000000001e-05,
421
+ "loss": 0.4395,
422
+ "step": 680
423
+ },
424
+ {
425
+ "grad_norm": 1.0631095170974731,
426
+ "learning_rate": 6.89e-05,
427
+ "loss": 0.4185,
428
+ "step": 690
429
+ },
430
+ {
431
+ "grad_norm": 1.1509623527526855,
432
+ "learning_rate": 6.99e-05,
433
+ "loss": 0.437,
434
+ "step": 700
435
+ },
436
+ {
437
+ "grad_norm": 1.249911904335022,
438
+ "learning_rate": 7.09e-05,
439
+ "loss": 0.4273,
440
+ "step": 710
441
+ },
442
+ {
443
+ "grad_norm": 1.1548298597335815,
444
+ "learning_rate": 7.19e-05,
445
+ "loss": 0.4296,
446
+ "step": 720
447
+ },
448
+ {
449
+ "grad_norm": 1.0660429000854492,
450
+ "learning_rate": 7.29e-05,
451
+ "loss": 0.4438,
452
+ "step": 730
453
+ },
454
+ {
455
+ "grad_norm": 1.2336221933364868,
456
+ "learning_rate": 7.390000000000001e-05,
457
+ "loss": 0.407,
458
+ "step": 740
459
+ },
460
+ {
461
+ "grad_norm": 1.073397159576416,
462
+ "learning_rate": 7.49e-05,
463
+ "loss": 0.3936,
464
+ "step": 750
465
+ },
466
+ {
467
+ "grad_norm": 1.2548182010650635,
468
+ "learning_rate": 7.59e-05,
469
+ "loss": 0.3991,
470
+ "step": 760
471
+ },
472
+ {
473
+ "grad_norm": 1.4380117654800415,
474
+ "learning_rate": 7.69e-05,
475
+ "loss": 0.3957,
476
+ "step": 770
477
+ },
478
+ {
479
+ "grad_norm": 1.2932844161987305,
480
+ "learning_rate": 7.790000000000001e-05,
481
+ "loss": 0.4081,
482
+ "step": 780
483
+ },
484
+ {
485
+ "grad_norm": 1.1372441053390503,
486
+ "learning_rate": 7.890000000000001e-05,
487
+ "loss": 0.3812,
488
+ "step": 790
489
+ },
490
+ {
491
+ "grad_norm": 1.1620570421218872,
492
+ "learning_rate": 7.99e-05,
493
+ "loss": 0.3952,
494
+ "step": 800
495
+ },
496
+ {
497
+ "grad_norm": 1.1965490579605103,
498
+ "learning_rate": 8.090000000000001e-05,
499
+ "loss": 0.3677,
500
+ "step": 810
501
+ },
502
+ {
503
+ "grad_norm": 1.176527738571167,
504
+ "learning_rate": 8.19e-05,
505
+ "loss": 0.3798,
506
+ "step": 820
507
+ },
508
+ {
509
+ "grad_norm": 1.153993010520935,
510
+ "learning_rate": 8.29e-05,
511
+ "loss": 0.3629,
512
+ "step": 830
513
+ },
514
+ {
515
+ "grad_norm": 1.3327205181121826,
516
+ "learning_rate": 8.39e-05,
517
+ "loss": 0.3578,
518
+ "step": 840
519
+ },
520
+ {
521
+ "grad_norm": 1.1645392179489136,
522
+ "learning_rate": 8.49e-05,
523
+ "loss": 0.3542,
524
+ "step": 850
525
+ },
526
+ {
527
+ "grad_norm": 1.1183959245681763,
528
+ "learning_rate": 8.59e-05,
529
+ "loss": 0.3452,
530
+ "step": 860
531
+ },
532
+ {
533
+ "grad_norm": 1.4171571731567383,
534
+ "learning_rate": 8.69e-05,
535
+ "loss": 0.328,
536
+ "step": 870
537
+ },
538
+ {
539
+ "grad_norm": 1.2265501022338867,
540
+ "learning_rate": 8.790000000000001e-05,
541
+ "loss": 0.3427,
542
+ "step": 880
543
+ },
544
+ {
545
+ "grad_norm": 1.3434756994247437,
546
+ "learning_rate": 8.89e-05,
547
+ "loss": 0.3333,
548
+ "step": 890
549
+ },
550
+ {
551
+ "grad_norm": 1.3676091432571411,
552
+ "learning_rate": 8.99e-05,
553
+ "loss": 0.3142,
554
+ "step": 900
555
+ },
556
+ {
557
+ "grad_norm": 1.0545670986175537,
558
+ "learning_rate": 9.090000000000001e-05,
559
+ "loss": 0.3242,
560
+ "step": 910
561
+ },
562
+ {
563
+ "grad_norm": 1.1802937984466553,
564
+ "learning_rate": 9.190000000000001e-05,
565
+ "loss": 0.3419,
566
+ "step": 920
567
+ },
568
+ {
569
+ "grad_norm": 1.2357131242752075,
570
+ "learning_rate": 9.290000000000001e-05,
571
+ "loss": 0.2918,
572
+ "step": 930
573
+ },
574
+ {
575
+ "grad_norm": 1.2467869520187378,
576
+ "learning_rate": 9.39e-05,
577
+ "loss": 0.2812,
578
+ "step": 940
579
+ },
580
+ {
581
+ "grad_norm": 1.2177903652191162,
582
+ "learning_rate": 9.49e-05,
583
+ "loss": 0.2786,
584
+ "step": 950
585
+ },
586
+ {
587
+ "grad_norm": 1.2031254768371582,
588
+ "learning_rate": 9.59e-05,
589
+ "loss": 0.2801,
590
+ "step": 960
591
+ },
592
+ {
593
+ "grad_norm": 1.27996826171875,
594
+ "learning_rate": 9.69e-05,
595
+ "loss": 0.2944,
596
+ "step": 970
597
+ },
598
+ {
599
+ "grad_norm": 1.4937174320220947,
600
+ "learning_rate": 9.790000000000001e-05,
601
+ "loss": 0.259,
602
+ "step": 980
603
+ },
604
+ {
605
+ "grad_norm": 1.2263216972351074,
606
+ "learning_rate": 9.89e-05,
607
+ "loss": 0.269,
608
+ "step": 990
609
+ },
610
+ {
611
+ "eval/loss": 0.2459017077088356,
612
+ "step": 1000
613
+ },
614
+ {
615
+ "grad_norm": 1.1868503093719482,
616
+ "learning_rate": 9.99e-05,
617
+ "loss": 0.2673,
618
+ "step": 1000
619
+ },
620
+ {
621
+ "grad_norm": 1.2517995834350586,
622
+ "learning_rate": 9.999994463727085e-05,
623
+ "loss": 0.2957,
624
+ "step": 1010
625
+ },
626
+ {
627
+ "grad_norm": 1.1621079444885254,
628
+ "learning_rate": 9.999975326009292e-05,
629
+ "loss": 0.2406,
630
+ "step": 1020
631
+ },
632
+ {
633
+ "grad_norm": 1.2248700857162476,
634
+ "learning_rate": 9.999942518549879e-05,
635
+ "loss": 0.2588,
636
+ "step": 1030
637
+ },
638
+ {
639
+ "grad_norm": 1.1486198902130127,
640
+ "learning_rate": 9.999896041438544e-05,
641
+ "loss": 0.2869,
642
+ "step": 1040
643
+ },
644
+ {
645
+ "grad_norm": 1.1869938373565674,
646
+ "learning_rate": 9.999835894802353e-05,
647
+ "loss": 0.2613,
648
+ "step": 1050
649
+ },
650
+ {
651
+ "grad_norm": 1.2058380842208862,
652
+ "learning_rate": 9.999762078805743e-05,
653
+ "loss": 0.2367,
654
+ "step": 1060
655
+ },
656
+ {
657
+ "grad_norm": 1.2073358297348022,
658
+ "learning_rate": 9.999674593650526e-05,
659
+ "loss": 0.2343,
660
+ "step": 1070
661
+ },
662
+ {
663
+ "grad_norm": 1.3462257385253906,
664
+ "learning_rate": 9.99957343957588e-05,
665
+ "loss": 0.2043,
666
+ "step": 1080
667
+ },
668
+ {
669
+ "grad_norm": 1.21333646774292,
670
+ "learning_rate": 9.99945861685836e-05,
671
+ "loss": 0.22,
672
+ "step": 1090
673
+ },
674
+ {
675
+ "grad_norm": 1.172276496887207,
676
+ "learning_rate": 9.999330125811884e-05,
677
+ "loss": 0.2268,
678
+ "step": 1100
679
+ },
680
+ {
681
+ "grad_norm": 1.5802624225616455,
682
+ "learning_rate": 9.999187966787744e-05,
683
+ "loss": 0.2389,
684
+ "step": 1110
685
+ },
686
+ {
687
+ "grad_norm": 1.0722038745880127,
688
+ "learning_rate": 9.999032140174595e-05,
689
+ "loss": 0.2069,
690
+ "step": 1120
691
+ },
692
+ {
693
+ "grad_norm": 1.2428017854690552,
694
+ "learning_rate": 9.998862646398464e-05,
695
+ "loss": 0.2105,
696
+ "step": 1130
697
+ },
698
+ {
699
+ "grad_norm": 1.109406590461731,
700
+ "learning_rate": 9.998679485922739e-05,
701
+ "loss": 0.204,
702
+ "step": 1140
703
+ },
704
+ {
705
+ "grad_norm": 1.133062720298767,
706
+ "learning_rate": 9.998482659248174e-05,
707
+ "loss": 0.1862,
708
+ "step": 1150
709
+ },
710
+ {
711
+ "grad_norm": 1.185992956161499,
712
+ "learning_rate": 9.998272166912883e-05,
713
+ "loss": 0.1944,
714
+ "step": 1160
715
+ },
716
+ {
717
+ "grad_norm": 1.0539828538894653,
718
+ "learning_rate": 9.998048009492347e-05,
719
+ "loss": 0.1603,
720
+ "step": 1170
721
+ },
722
+ {
723
+ "grad_norm": 1.2745673656463623,
724
+ "learning_rate": 9.997810187599403e-05,
725
+ "loss": 0.1815,
726
+ "step": 1180
727
+ },
728
+ {
729
+ "grad_norm": 1.2294188737869263,
730
+ "learning_rate": 9.997558701884249e-05,
731
+ "loss": 0.1774,
732
+ "step": 1190
733
+ },
734
+ {
735
+ "grad_norm": 1.6289048194885254,
736
+ "learning_rate": 9.997293553034433e-05,
737
+ "loss": 0.171,
738
+ "step": 1200
739
+ },
740
+ {
741
+ "grad_norm": 1.2011067867279053,
742
+ "learning_rate": 9.997014741774866e-05,
743
+ "loss": 0.1657,
744
+ "step": 1210
745
+ },
746
+ {
747
+ "grad_norm": 1.1529210805892944,
748
+ "learning_rate": 9.996722268867803e-05,
749
+ "loss": 0.1642,
750
+ "step": 1220
751
+ },
752
+ {
753
+ "grad_norm": 0.8735513091087341,
754
+ "learning_rate": 9.996416135112858e-05,
755
+ "loss": 0.1393,
756
+ "step": 1230
757
+ },
758
+ {
759
+ "grad_norm": 1.3112437725067139,
760
+ "learning_rate": 9.996096341346988e-05,
761
+ "loss": 0.1392,
762
+ "step": 1240
763
+ },
764
+ {
765
+ "grad_norm": 1.2347687482833862,
766
+ "learning_rate": 9.995762888444495e-05,
767
+ "loss": 0.1674,
768
+ "step": 1250
769
+ },
770
+ {
771
+ "grad_norm": 1.539437174797058,
772
+ "learning_rate": 9.995415777317027e-05,
773
+ "loss": 0.1625,
774
+ "step": 1260
775
+ },
776
+ {
777
+ "grad_norm": 1.2333788871765137,
778
+ "learning_rate": 9.995055008913574e-05,
779
+ "loss": 0.1328,
780
+ "step": 1270
781
+ },
782
+ {
783
+ "grad_norm": 1.1541303396224976,
784
+ "learning_rate": 9.994680584220463e-05,
785
+ "loss": 0.1294,
786
+ "step": 1280
787
+ },
788
+ {
789
+ "grad_norm": 1.0528708696365356,
790
+ "learning_rate": 9.994292504261355e-05,
791
+ "loss": 0.1441,
792
+ "step": 1290
793
+ },
794
+ {
795
+ "grad_norm": 1.0454338788986206,
796
+ "learning_rate": 9.993890770097247e-05,
797
+ "loss": 0.1266,
798
+ "step": 1300
799
+ },
800
+ {
801
+ "grad_norm": 1.1280555725097656,
802
+ "learning_rate": 9.993475382826467e-05,
803
+ "loss": 0.1426,
804
+ "step": 1310
805
+ },
806
+ {
807
+ "grad_norm": 1.187239170074463,
808
+ "learning_rate": 9.993046343584664e-05,
809
+ "loss": 0.1422,
810
+ "step": 1320
811
+ },
812
+ {
813
+ "grad_norm": 1.0262149572372437,
814
+ "learning_rate": 9.992603653544816e-05,
815
+ "loss": 0.1161,
816
+ "step": 1330
817
+ },
818
+ {
819
+ "grad_norm": 1.1586066484451294,
820
+ "learning_rate": 9.992147313917222e-05,
821
+ "loss": 0.1408,
822
+ "step": 1340
823
+ },
824
+ {
825
+ "grad_norm": 0.9765651226043701,
826
+ "learning_rate": 9.991677325949497e-05,
827
+ "loss": 0.1611,
828
+ "step": 1350
829
+ },
830
+ {
831
+ "grad_norm": 0.9763075709342957,
832
+ "learning_rate": 9.991193690926568e-05,
833
+ "loss": 0.1464,
834
+ "step": 1360
835
+ },
836
+ {
837
+ "grad_norm": 1.2092801332473755,
838
+ "learning_rate": 9.990696410170678e-05,
839
+ "loss": 0.1466,
840
+ "step": 1370
841
+ },
842
+ {
843
+ "grad_norm": 1.0392274856567383,
844
+ "learning_rate": 9.990185485041371e-05,
845
+ "loss": 0.1263,
846
+ "step": 1380
847
+ },
848
+ {
849
+ "grad_norm": 1.0358021259307861,
850
+ "learning_rate": 9.989660916935498e-05,
851
+ "loss": 0.1282,
852
+ "step": 1390
853
+ },
854
+ {
855
+ "grad_norm": 1.0262398719787598,
856
+ "learning_rate": 9.989122707287208e-05,
857
+ "loss": 0.1391,
858
+ "step": 1400
859
+ },
860
+ {
861
+ "grad_norm": 1.1978421211242676,
862
+ "learning_rate": 9.988570857567945e-05,
863
+ "loss": 0.1218,
864
+ "step": 1410
865
+ },
866
+ {
867
+ "grad_norm": 0.9296699166297913,
868
+ "learning_rate": 9.988005369286446e-05,
869
+ "loss": 0.1331,
870
+ "step": 1420
871
+ },
872
+ {
873
+ "grad_norm": 1.0004020929336548,
874
+ "learning_rate": 9.987426243988734e-05,
875
+ "loss": 0.1372,
876
+ "step": 1430
877
+ },
878
+ {
879
+ "grad_norm": 1.0646557807922363,
880
+ "learning_rate": 9.986833483258114e-05,
881
+ "loss": 0.1334,
882
+ "step": 1440
883
+ },
884
+ {
885
+ "grad_norm": 0.9959461688995361,
886
+ "learning_rate": 9.986227088715173e-05,
887
+ "loss": 0.1187,
888
+ "step": 1450
889
+ },
890
+ {
891
+ "grad_norm": 1.0928994417190552,
892
+ "learning_rate": 9.98560706201777e-05,
893
+ "loss": 0.1455,
894
+ "step": 1460
895
+ },
896
+ {
897
+ "grad_norm": 1.1495130062103271,
898
+ "learning_rate": 9.984973404861036e-05,
899
+ "loss": 0.1098,
900
+ "step": 1470
901
+ },
902
+ {
903
+ "grad_norm": 1.3037567138671875,
904
+ "learning_rate": 9.984326118977361e-05,
905
+ "loss": 0.1255,
906
+ "step": 1480
907
+ },
908
+ {
909
+ "grad_norm": 0.923818051815033,
910
+ "learning_rate": 9.983665206136406e-05,
911
+ "loss": 0.1439,
912
+ "step": 1490
913
+ },
914
+ {
915
+ "eval/loss": 0.12131376132369041,
916
+ "step": 1500
917
+ },
918
+ {
919
+ "grad_norm": 0.9496746063232422,
920
+ "learning_rate": 9.982990668145075e-05,
921
+ "loss": 0.123,
922
+ "step": 1500
923
+ },
924
+ {
925
+ "grad_norm": 1.0832738876342773,
926
+ "learning_rate": 9.982302506847534e-05,
927
+ "loss": 0.1408,
928
+ "step": 1510
929
+ },
930
+ {
931
+ "grad_norm": 1.033626914024353,
932
+ "learning_rate": 9.981600724125189e-05,
933
+ "loss": 0.1108,
934
+ "step": 1520
935
+ },
936
+ {
937
+ "grad_norm": 1.0005450248718262,
938
+ "learning_rate": 9.980885321896685e-05,
939
+ "loss": 0.1274,
940
+ "step": 1530
941
+ },
942
+ {
943
+ "grad_norm": 1.0442663431167603,
944
+ "learning_rate": 9.980156302117905e-05,
945
+ "loss": 0.1238,
946
+ "step": 1540
947
+ },
948
+ {
949
+ "grad_norm": 0.8260471820831299,
950
+ "learning_rate": 9.979413666781963e-05,
951
+ "loss": 0.1231,
952
+ "step": 1550
953
+ },
954
+ {
955
+ "grad_norm": 0.884735107421875,
956
+ "learning_rate": 9.978657417919193e-05,
957
+ "loss": 0.1495,
958
+ "step": 1560
959
+ },
960
+ {
961
+ "grad_norm": 0.9236319661140442,
962
+ "learning_rate": 9.977887557597153e-05,
963
+ "loss": 0.1327,
964
+ "step": 1570
965
+ },
966
+ {
967
+ "grad_norm": 0.9460572004318237,
968
+ "learning_rate": 9.97710408792061e-05,
969
+ "loss": 0.1263,
970
+ "step": 1580
971
+ },
972
+ {
973
+ "grad_norm": 0.9749200344085693,
974
+ "learning_rate": 9.976307011031542e-05,
975
+ "loss": 0.12,
976
+ "step": 1590
977
+ },
978
+ {
979
+ "grad_norm": 1.1136820316314697,
980
+ "learning_rate": 9.975496329109126e-05,
981
+ "loss": 0.1323,
982
+ "step": 1600
983
+ },
984
+ {
985
+ "grad_norm": 0.8567096590995789,
986
+ "learning_rate": 9.974672044369732e-05,
987
+ "loss": 0.125,
988
+ "step": 1610
989
+ },
990
+ {
991
+ "grad_norm": 1.0884920358657837,
992
+ "learning_rate": 9.97383415906693e-05,
993
+ "loss": 0.115,
994
+ "step": 1620
995
+ },
996
+ {
997
+ "grad_norm": 1.0339338779449463,
998
+ "learning_rate": 9.97298267549146e-05,
999
+ "loss": 0.1386,
1000
+ "step": 1630
1001
+ },
1002
+ {
1003
+ "grad_norm": 0.9121850728988647,
1004
+ "learning_rate": 9.972117595971249e-05,
1005
+ "loss": 0.1249,
1006
+ "step": 1640
1007
+ },
1008
+ {
1009
+ "grad_norm": 0.9620202779769897,
1010
+ "learning_rate": 9.971238922871391e-05,
1011
+ "loss": 0.1322,
1012
+ "step": 1650
1013
+ },
1014
+ {
1015
+ "grad_norm": 1.0946760177612305,
1016
+ "learning_rate": 9.970346658594142e-05,
1017
+ "loss": 0.1123,
1018
+ "step": 1660
1019
+ },
1020
+ {
1021
+ "grad_norm": 0.9470517635345459,
1022
+ "learning_rate": 9.969440805578923e-05,
1023
+ "loss": 0.1308,
1024
+ "step": 1670
1025
+ },
1026
+ {
1027
+ "grad_norm": 0.7607911229133606,
1028
+ "learning_rate": 9.968521366302298e-05,
1029
+ "loss": 0.1212,
1030
+ "step": 1680
1031
+ },
1032
+ {
1033
+ "grad_norm": 1.0321109294891357,
1034
+ "learning_rate": 9.967588343277981e-05,
1035
+ "loss": 0.1351,
1036
+ "step": 1690
1037
+ },
1038
+ {
1039
+ "grad_norm": 1.2910419702529907,
1040
+ "learning_rate": 9.966641739056818e-05,
1041
+ "loss": 0.1594,
1042
+ "step": 1700
1043
+ },
1044
+ {
1045
+ "grad_norm": 0.7687619924545288,
1046
+ "learning_rate": 9.965681556226793e-05,
1047
+ "loss": 0.1442,
1048
+ "step": 1710
1049
+ },
1050
+ {
1051
+ "grad_norm": 0.882544755935669,
1052
+ "learning_rate": 9.964707797413006e-05,
1053
+ "loss": 0.1131,
1054
+ "step": 1720
1055
+ },
1056
+ {
1057
+ "grad_norm": 1.0736138820648193,
1058
+ "learning_rate": 9.963720465277679e-05,
1059
+ "loss": 0.0997,
1060
+ "step": 1730
1061
+ },
1062
+ {
1063
+ "grad_norm": 1.0465947389602661,
1064
+ "learning_rate": 9.96271956252014e-05,
1065
+ "loss": 0.1114,
1066
+ "step": 1740
1067
+ },
1068
+ {
1069
+ "grad_norm": 0.8517502546310425,
1070
+ "learning_rate": 9.961705091876816e-05,
1071
+ "loss": 0.1054,
1072
+ "step": 1750
1073
+ },
1074
+ {
1075
+ "grad_norm": 0.8722822666168213,
1076
+ "learning_rate": 9.960677056121235e-05,
1077
+ "loss": 0.1409,
1078
+ "step": 1760
1079
+ },
1080
+ {
1081
+ "grad_norm": 1.3352707624435425,
1082
+ "learning_rate": 9.959635458064005e-05,
1083
+ "loss": 0.1207,
1084
+ "step": 1770
1085
+ },
1086
+ {
1087
+ "grad_norm": 1.0375741720199585,
1088
+ "learning_rate": 9.958580300552815e-05,
1089
+ "loss": 0.1018,
1090
+ "step": 1780
1091
+ },
1092
+ {
1093
+ "grad_norm": 0.9663418531417847,
1094
+ "learning_rate": 9.957511586472426e-05,
1095
+ "loss": 0.1131,
1096
+ "step": 1790
1097
+ },
1098
+ {
1099
+ "grad_norm": 0.9925614595413208,
1100
+ "learning_rate": 9.956429318744662e-05,
1101
+ "loss": 0.0994,
1102
+ "step": 1800
1103
+ },
1104
+ {
1105
+ "grad_norm": 0.9052272439002991,
1106
+ "learning_rate": 9.955333500328404e-05,
1107
+ "loss": 0.1439,
1108
+ "step": 1810
1109
+ },
1110
+ {
1111
+ "grad_norm": 1.1061209440231323,
1112
+ "learning_rate": 9.95422413421957e-05,
1113
+ "loss": 0.099,
1114
+ "step": 1820
1115
+ },
1116
+ {
1117
+ "grad_norm": 0.9718905687332153,
1118
+ "learning_rate": 9.953101223451133e-05,
1119
+ "loss": 0.1334,
1120
+ "step": 1830
1121
+ },
1122
+ {
1123
+ "grad_norm": 1.041150450706482,
1124
+ "learning_rate": 9.951964771093085e-05,
1125
+ "loss": 0.102,
1126
+ "step": 1840
1127
+ },
1128
+ {
1129
+ "grad_norm": 0.7296974658966064,
1130
+ "learning_rate": 9.950814780252442e-05,
1131
+ "loss": 0.1216,
1132
+ "step": 1850
1133
+ },
1134
+ {
1135
+ "grad_norm": 0.956504225730896,
1136
+ "learning_rate": 9.949651254073236e-05,
1137
+ "loss": 0.1504,
1138
+ "step": 1860
1139
+ },
1140
+ {
1141
+ "grad_norm": 0.9494929909706116,
1142
+ "learning_rate": 9.948474195736504e-05,
1143
+ "loss": 0.1176,
1144
+ "step": 1870
1145
+ },
1146
+ {
1147
+ "grad_norm": 1.073912501335144,
1148
+ "learning_rate": 9.947283608460277e-05,
1149
+ "loss": 0.0968,
1150
+ "step": 1880
1151
+ },
1152
+ {
1153
+ "grad_norm": 0.9738394618034363,
1154
+ "learning_rate": 9.946079495499577e-05,
1155
+ "loss": 0.1157,
1156
+ "step": 1890
1157
+ },
1158
+ {
1159
+ "grad_norm": 0.9605513215065002,
1160
+ "learning_rate": 9.944861860146401e-05,
1161
+ "loss": 0.122,
1162
+ "step": 1900
1163
+ },
1164
+ {
1165
+ "grad_norm": 0.8335412740707397,
1166
+ "learning_rate": 9.943630705729719e-05,
1167
+ "loss": 0.0999,
1168
+ "step": 1910
1169
+ },
1170
+ {
1171
+ "grad_norm": 0.881373405456543,
1172
+ "learning_rate": 9.942386035615459e-05,
1173
+ "loss": 0.1014,
1174
+ "step": 1920
1175
+ },
1176
+ {
1177
+ "grad_norm": 0.7942837476730347,
1178
+ "learning_rate": 9.941127853206503e-05,
1179
+ "loss": 0.1095,
1180
+ "step": 1930
1181
+ },
1182
+ {
1183
+ "grad_norm": 1.0058091878890991,
1184
+ "learning_rate": 9.939856161942673e-05,
1185
+ "loss": 0.118,
1186
+ "step": 1940
1187
+ },
1188
+ {
1189
+ "grad_norm": 0.8951269388198853,
1190
+ "learning_rate": 9.938570965300724e-05,
1191
+ "loss": 0.1093,
1192
+ "step": 1950
1193
+ },
1194
+ {
1195
+ "grad_norm": 0.8100780844688416,
1196
+ "learning_rate": 9.937272266794335e-05,
1197
+ "loss": 0.1344,
1198
+ "step": 1960
1199
+ },
1200
+ {
1201
+ "grad_norm": 0.8485944867134094,
1202
+ "learning_rate": 9.935960069974096e-05,
1203
+ "loss": 0.1062,
1204
+ "step": 1970
1205
+ },
1206
+ {
1207
+ "grad_norm": 1.0462373495101929,
1208
+ "learning_rate": 9.934634378427506e-05,
1209
+ "loss": 0.1073,
1210
+ "step": 1980
1211
+ },
1212
+ {
1213
+ "grad_norm": 0.7918229103088379,
1214
+ "learning_rate": 9.933295195778954e-05,
1215
+ "loss": 0.0865,
1216
+ "step": 1990
1217
+ },
1218
+ {
1219
+ "eval/loss": 0.10448359854519368,
1220
+ "step": 2000
1221
+ },
1222
+ {
1223
+ "grad_norm": 1.0324554443359375,
1224
+ "learning_rate": 9.931942525689715e-05,
1225
+ "loss": 0.1235,
1226
+ "step": 2000
1227
+ },
1228
+ {
1229
+ "grad_norm": 0.9461175203323364,
1230
+ "learning_rate": 9.930576371857936e-05,
1231
+ "loss": 0.1044,
1232
+ "step": 2010
1233
+ },
1234
+ {
1235
+ "grad_norm": 0.909123420715332,
1236
+ "learning_rate": 9.929196738018629e-05,
1237
+ "loss": 0.1025,
1238
+ "step": 2020
1239
+ },
1240
+ {
1241
+ "grad_norm": 0.777509868144989,
1242
+ "learning_rate": 9.927803627943662e-05,
1243
+ "loss": 0.1044,
1244
+ "step": 2030
1245
+ },
1246
+ {
1247
+ "grad_norm": 1.0725702047348022,
1248
+ "learning_rate": 9.926397045441744e-05,
1249
+ "loss": 0.1066,
1250
+ "step": 2040
1251
+ },
1252
+ {
1253
+ "grad_norm": 0.9734055399894714,
1254
+ "learning_rate": 9.924976994358417e-05,
1255
+ "loss": 0.1145,
1256
+ "step": 2050
1257
+ },
1258
+ {
1259
+ "grad_norm": 0.9912729859352112,
1260
+ "learning_rate": 9.923543478576048e-05,
1261
+ "loss": 0.0964,
1262
+ "step": 2060
1263
+ },
1264
+ {
1265
+ "grad_norm": 0.7638317346572876,
1266
+ "learning_rate": 9.922096502013813e-05,
1267
+ "loss": 0.0896,
1268
+ "step": 2070
1269
+ },
1270
+ {
1271
+ "grad_norm": 0.8764864802360535,
1272
+ "learning_rate": 9.92063606862769e-05,
1273
+ "loss": 0.0934,
1274
+ "step": 2080
1275
+ },
1276
+ {
1277
+ "grad_norm": 0.8820127248764038,
1278
+ "learning_rate": 9.919162182410453e-05,
1279
+ "loss": 0.1327,
1280
+ "step": 2090
1281
+ },
1282
+ {
1283
+ "grad_norm": 0.9800596237182617,
1284
+ "learning_rate": 9.917674847391645e-05,
1285
+ "loss": 0.0862,
1286
+ "step": 2100
1287
+ },
1288
+ {
1289
+ "grad_norm": 0.844849705696106,
1290
+ "learning_rate": 9.916174067637584e-05,
1291
+ "loss": 0.1044,
1292
+ "step": 2110
1293
+ },
1294
+ {
1295
+ "grad_norm": 0.8713732957839966,
1296
+ "learning_rate": 9.914659847251348e-05,
1297
+ "loss": 0.0839,
1298
+ "step": 2120
1299
+ },
1300
+ {
1301
+ "grad_norm": 0.8485847115516663,
1302
+ "learning_rate": 9.913132190372753e-05,
1303
+ "loss": 0.1138,
1304
+ "step": 2130
1305
+ },
1306
+ {
1307
+ "grad_norm": 0.8496665358543396,
1308
+ "learning_rate": 9.911591101178359e-05,
1309
+ "loss": 0.097,
1310
+ "step": 2140
1311
+ },
1312
+ {
1313
+ "grad_norm": 0.936180830001831,
1314
+ "learning_rate": 9.910036583881443e-05,
1315
+ "loss": 0.1023,
1316
+ "step": 2150
1317
+ },
1318
+ {
1319
+ "grad_norm": 0.9755275249481201,
1320
+ "learning_rate": 9.908468642731995e-05,
1321
+ "loss": 0.1117,
1322
+ "step": 2160
1323
+ },
1324
+ {
1325
+ "grad_norm": 0.9466411471366882,
1326
+ "learning_rate": 9.906887282016707e-05,
1327
+ "loss": 0.1055,
1328
+ "step": 2170
1329
+ },
1330
+ {
1331
+ "grad_norm": 0.8077055811882019,
1332
+ "learning_rate": 9.90529250605896e-05,
1333
+ "loss": 0.1113,
1334
+ "step": 2180
1335
+ },
1336
+ {
1337
+ "grad_norm": 0.8465186953544617,
1338
+ "learning_rate": 9.903684319218809e-05,
1339
+ "loss": 0.1205,
1340
+ "step": 2190
1341
+ },
1342
+ {
1343
+ "grad_norm": 0.7864586114883423,
1344
+ "learning_rate": 9.902062725892976e-05,
1345
+ "loss": 0.1043,
1346
+ "step": 2200
1347
+ },
1348
+ {
1349
+ "grad_norm": 0.8424517512321472,
1350
+ "learning_rate": 9.900427730514834e-05,
1351
+ "loss": 0.1116,
1352
+ "step": 2210
1353
+ },
1354
+ {
1355
+ "grad_norm": 0.9575174450874329,
1356
+ "learning_rate": 9.8987793375544e-05,
1357
+ "loss": 0.1438,
1358
+ "step": 2220
1359
+ },
1360
+ {
1361
+ "grad_norm": 0.8532448410987854,
1362
+ "learning_rate": 9.897117551518318e-05,
1363
+ "loss": 0.1122,
1364
+ "step": 2230
1365
+ },
1366
+ {
1367
+ "grad_norm": 0.994422435760498,
1368
+ "learning_rate": 9.895442376949844e-05,
1369
+ "loss": 0.1269,
1370
+ "step": 2240
1371
+ },
1372
+ {
1373
+ "grad_norm": 0.7749019861221313,
1374
+ "learning_rate": 9.893753818428845e-05,
1375
+ "loss": 0.109,
1376
+ "step": 2250
1377
+ },
1378
+ {
1379
+ "grad_norm": 0.914318323135376,
1380
+ "learning_rate": 9.892051880571773e-05,
1381
+ "loss": 0.0931,
1382
+ "step": 2260
1383
+ },
1384
+ {
1385
+ "grad_norm": 0.9681793451309204,
1386
+ "learning_rate": 9.890336568031663e-05,
1387
+ "loss": 0.0974,
1388
+ "step": 2270
1389
+ },
1390
+ {
1391
+ "grad_norm": 0.7713805437088013,
1392
+ "learning_rate": 9.888607885498113e-05,
1393
+ "loss": 0.1163,
1394
+ "step": 2280
1395
+ },
1396
+ {
1397
+ "grad_norm": 0.9047562479972839,
1398
+ "learning_rate": 9.886865837697275e-05,
1399
+ "loss": 0.0984,
1400
+ "step": 2290
1401
+ },
1402
+ {
1403
+ "grad_norm": 0.9030624628067017,
1404
+ "learning_rate": 9.88511042939184e-05,
1405
+ "loss": 0.088,
1406
+ "step": 2300
1407
+ },
1408
+ {
1409
+ "grad_norm": 0.863299548625946,
1410
+ "learning_rate": 9.883341665381028e-05,
1411
+ "loss": 0.0892,
1412
+ "step": 2310
1413
+ },
1414
+ {
1415
+ "grad_norm": 0.6943339705467224,
1416
+ "learning_rate": 9.881559550500575e-05,
1417
+ "loss": 0.0984,
1418
+ "step": 2320
1419
+ },
1420
+ {
1421
+ "grad_norm": 0.8274405002593994,
1422
+ "learning_rate": 9.879764089622712e-05,
1423
+ "loss": 0.0853,
1424
+ "step": 2330
1425
+ },
1426
+ {
1427
+ "grad_norm": 1.09088134765625,
1428
+ "learning_rate": 9.87795528765616e-05,
1429
+ "loss": 0.0974,
1430
+ "step": 2340
1431
+ },
1432
+ {
1433
+ "grad_norm": 0.7990790605545044,
1434
+ "learning_rate": 9.876133149546118e-05,
1435
+ "loss": 0.1104,
1436
+ "step": 2350
1437
+ },
1438
+ {
1439
+ "grad_norm": 0.7597699165344238,
1440
+ "learning_rate": 9.874297680274238e-05,
1441
+ "loss": 0.1052,
1442
+ "step": 2360
1443
+ },
1444
+ {
1445
+ "grad_norm": 0.850264847278595,
1446
+ "learning_rate": 9.872448884858624e-05,
1447
+ "loss": 0.1015,
1448
+ "step": 2370
1449
+ },
1450
+ {
1451
+ "grad_norm": 0.779171347618103,
1452
+ "learning_rate": 9.870586768353815e-05,
1453
+ "loss": 0.1039,
1454
+ "step": 2380
1455
+ },
1456
+ {
1457
+ "grad_norm": 0.8852418661117554,
1458
+ "learning_rate": 9.868711335850764e-05,
1459
+ "loss": 0.0949,
1460
+ "step": 2390
1461
+ },
1462
+ {
1463
+ "grad_norm": 0.780746579170227,
1464
+ "learning_rate": 9.866822592476833e-05,
1465
+ "loss": 0.112,
1466
+ "step": 2400
1467
+ },
1468
+ {
1469
+ "grad_norm": 0.9411232471466064,
1470
+ "learning_rate": 9.86492054339577e-05,
1471
+ "loss": 0.1031,
1472
+ "step": 2410
1473
+ },
1474
+ {
1475
+ "grad_norm": 0.8456240296363831,
1476
+ "learning_rate": 9.863005193807711e-05,
1477
+ "loss": 0.0854,
1478
+ "step": 2420
1479
+ },
1480
+ {
1481
+ "grad_norm": 0.960747480392456,
1482
+ "learning_rate": 9.861076548949143e-05,
1483
+ "loss": 0.0884,
1484
+ "step": 2430
1485
+ },
1486
+ {
1487
+ "grad_norm": 0.8031632304191589,
1488
+ "learning_rate": 9.859134614092912e-05,
1489
+ "loss": 0.1086,
1490
+ "step": 2440
1491
+ },
1492
+ {
1493
+ "grad_norm": 0.918484628200531,
1494
+ "learning_rate": 9.857179394548191e-05,
1495
+ "loss": 0.1083,
1496
+ "step": 2450
1497
+ },
1498
+ {
1499
+ "grad_norm": 0.7876137495040894,
1500
+ "learning_rate": 9.855210895660477e-05,
1501
+ "loss": 0.1088,
1502
+ "step": 2460
1503
+ },
1504
+ {
1505
+ "grad_norm": 0.8716227412223816,
1506
+ "learning_rate": 9.853229122811568e-05,
1507
+ "loss": 0.1157,
1508
+ "step": 2470
1509
+ },
1510
+ {
1511
+ "grad_norm": 0.680898904800415,
1512
+ "learning_rate": 9.851234081419559e-05,
1513
+ "loss": 0.1339,
1514
+ "step": 2480
1515
+ },
1516
+ {
1517
+ "grad_norm": 0.8554558753967285,
1518
+ "learning_rate": 9.849225776938814e-05,
1519
+ "loss": 0.0945,
1520
+ "step": 2490
1521
+ },
1522
+ {
1523
+ "eval/loss": 0.10325490295886994,
1524
+ "step": 2500
1525
+ },
1526
+ {
1527
+ "grad_norm": 0.82692551612854,
1528
+ "learning_rate": 9.847204214859964e-05,
1529
+ "loss": 0.0895,
1530
+ "step": 2500
1531
+ },
1532
+ {
1533
+ "grad_norm": 0.7433873414993286,
1534
+ "learning_rate": 9.845169400709879e-05,
1535
+ "loss": 0.0943,
1536
+ "step": 2510
1537
+ },
1538
+ {
1539
+ "grad_norm": 0.9095647931098938,
1540
+ "learning_rate": 9.843121340051664e-05,
1541
+ "loss": 0.0963,
1542
+ "step": 2520
1543
+ },
1544
+ {
1545
+ "grad_norm": 0.8508788347244263,
1546
+ "learning_rate": 9.841060038484641e-05,
1547
+ "loss": 0.1102,
1548
+ "step": 2530
1549
+ },
1550
+ {
1551
+ "grad_norm": 0.8506011366844177,
1552
+ "learning_rate": 9.838985501644328e-05,
1553
+ "loss": 0.1057,
1554
+ "step": 2540
1555
+ },
1556
+ {
1557
+ "grad_norm": 0.8930569887161255,
1558
+ "learning_rate": 9.83689773520243e-05,
1559
+ "loss": 0.0895,
1560
+ "step": 2550
1561
+ },
1562
+ {
1563
+ "grad_norm": 0.696980357170105,
1564
+ "learning_rate": 9.834796744866819e-05,
1565
+ "loss": 0.1297,
1566
+ "step": 2560
1567
+ },
1568
+ {
1569
+ "grad_norm": 0.9346020221710205,
1570
+ "learning_rate": 9.832682536381525e-05,
1571
+ "loss": 0.1123,
1572
+ "step": 2570
1573
+ },
1574
+ {
1575
+ "grad_norm": 0.829645037651062,
1576
+ "learning_rate": 9.830555115526711e-05,
1577
+ "loss": 0.0952,
1578
+ "step": 2580
1579
+ },
1580
+ {
1581
+ "grad_norm": 0.6775968074798584,
1582
+ "learning_rate": 9.828414488118667e-05,
1583
+ "loss": 0.1127,
1584
+ "step": 2590
1585
+ },
1586
+ {
1587
+ "grad_norm": 0.7343615293502808,
1588
+ "learning_rate": 9.826260660009785e-05,
1589
+ "loss": 0.1101,
1590
+ "step": 2600
1591
+ },
1592
+ {
1593
+ "grad_norm": 0.9343723058700562,
1594
+ "learning_rate": 9.824093637088547e-05,
1595
+ "loss": 0.1,
1596
+ "step": 2610
1597
+ },
1598
+ {
1599
+ "grad_norm": 0.8306047320365906,
1600
+ "learning_rate": 9.821913425279514e-05,
1601
+ "loss": 0.0839,
1602
+ "step": 2620
1603
+ },
1604
+ {
1605
+ "grad_norm": 0.8243780732154846,
1606
+ "learning_rate": 9.8197200305433e-05,
1607
+ "loss": 0.0916,
1608
+ "step": 2630
1609
+ },
1610
+ {
1611
+ "grad_norm": 0.6883335709571838,
1612
+ "learning_rate": 9.817513458876564e-05,
1613
+ "loss": 0.0898,
1614
+ "step": 2640
1615
+ },
1616
+ {
1617
+ "grad_norm": 0.8698527216911316,
1618
+ "learning_rate": 9.815293716311987e-05,
1619
+ "loss": 0.0995,
1620
+ "step": 2650
1621
+ },
1622
+ {
1623
+ "grad_norm": 0.6949645280838013,
1624
+ "learning_rate": 9.813060808918262e-05,
1625
+ "loss": 0.0867,
1626
+ "step": 2660
1627
+ },
1628
+ {
1629
+ "grad_norm": 1.0080634355545044,
1630
+ "learning_rate": 9.810814742800069e-05,
1631
+ "loss": 0.1031,
1632
+ "step": 2670
1633
+ },
1634
+ {
1635
+ "grad_norm": 0.8581037521362305,
1636
+ "learning_rate": 9.808555524098074e-05,
1637
+ "loss": 0.0745,
1638
+ "step": 2680
1639
+ },
1640
+ {
1641
+ "grad_norm": 0.7769279479980469,
1642
+ "learning_rate": 9.806283158988887e-05,
1643
+ "loss": 0.0808,
1644
+ "step": 2690
1645
+ },
1646
+ {
1647
+ "grad_norm": 0.6746631264686584,
1648
+ "learning_rate": 9.803997653685072e-05,
1649
+ "loss": 0.0777,
1650
+ "step": 2700
1651
+ },
1652
+ {
1653
+ "grad_norm": 0.848559558391571,
1654
+ "learning_rate": 9.801699014435112e-05,
1655
+ "loss": 0.0877,
1656
+ "step": 2710
1657
+ },
1658
+ {
1659
+ "grad_norm": 0.8104966878890991,
1660
+ "learning_rate": 9.799387247523398e-05,
1661
+ "loss": 0.0807,
1662
+ "step": 2720
1663
+ },
1664
+ {
1665
+ "grad_norm": 0.7445914149284363,
1666
+ "learning_rate": 9.797062359270215e-05,
1667
+ "loss": 0.0924,
1668
+ "step": 2730
1669
+ },
1670
+ {
1671
+ "grad_norm": 0.7468881607055664,
1672
+ "learning_rate": 9.794724356031715e-05,
1673
+ "loss": 0.1078,
1674
+ "step": 2740
1675
+ },
1676
+ {
1677
+ "grad_norm": 0.8936640024185181,
1678
+ "learning_rate": 9.792373244199913e-05,
1679
+ "loss": 0.0991,
1680
+ "step": 2750
1681
+ },
1682
+ {
1683
+ "grad_norm": 0.6333166360855103,
1684
+ "learning_rate": 9.790009030202658e-05,
1685
+ "loss": 0.0737,
1686
+ "step": 2760
1687
+ },
1688
+ {
1689
+ "grad_norm": 0.7722197771072388,
1690
+ "learning_rate": 9.78763172050362e-05,
1691
+ "loss": 0.0841,
1692
+ "step": 2770
1693
+ },
1694
+ {
1695
+ "grad_norm": 0.8541356921195984,
1696
+ "learning_rate": 9.785241321602274e-05,
1697
+ "loss": 0.0873,
1698
+ "step": 2780
1699
+ },
1700
+ {
1701
+ "grad_norm": 0.7936137318611145,
1702
+ "learning_rate": 9.782837840033879e-05,
1703
+ "loss": 0.0894,
1704
+ "step": 2790
1705
+ },
1706
+ {
1707
+ "grad_norm": 0.6794918775558472,
1708
+ "learning_rate": 9.780421282369461e-05,
1709
+ "loss": 0.0933,
1710
+ "step": 2800
1711
+ },
1712
+ {
1713
+ "grad_norm": 0.8522564768791199,
1714
+ "learning_rate": 9.777991655215797e-05,
1715
+ "loss": 0.0938,
1716
+ "step": 2810
1717
+ },
1718
+ {
1719
+ "grad_norm": 0.6844787001609802,
1720
+ "learning_rate": 9.775548965215394e-05,
1721
+ "loss": 0.0723,
1722
+ "step": 2820
1723
+ },
1724
+ {
1725
+ "grad_norm": 0.8117145895957947,
1726
+ "learning_rate": 9.773093219046474e-05,
1727
+ "loss": 0.0949,
1728
+ "step": 2830
1729
+ },
1730
+ {
1731
+ "grad_norm": 0.7425336837768555,
1732
+ "learning_rate": 9.770624423422954e-05,
1733
+ "loss": 0.1161,
1734
+ "step": 2840
1735
+ },
1736
+ {
1737
+ "grad_norm": 0.780025839805603,
1738
+ "learning_rate": 9.768142585094426e-05,
1739
+ "loss": 0.0932,
1740
+ "step": 2850
1741
+ },
1742
+ {
1743
+ "grad_norm": 0.962777853012085,
1744
+ "learning_rate": 9.765647710846142e-05,
1745
+ "loss": 0.074,
1746
+ "step": 2860
1747
+ },
1748
+ {
1749
+ "grad_norm": 0.85185706615448,
1750
+ "learning_rate": 9.763139807498991e-05,
1751
+ "loss": 0.0777,
1752
+ "step": 2870
1753
+ },
1754
+ {
1755
+ "grad_norm": 0.6887856125831604,
1756
+ "learning_rate": 9.760618881909487e-05,
1757
+ "loss": 0.0797,
1758
+ "step": 2880
1759
+ },
1760
+ {
1761
+ "grad_norm": 0.752108097076416,
1762
+ "learning_rate": 9.758084940969744e-05,
1763
+ "loss": 0.0902,
1764
+ "step": 2890
1765
+ },
1766
+ {
1767
+ "grad_norm": 0.7339804172515869,
1768
+ "learning_rate": 9.755537991607459e-05,
1769
+ "loss": 0.0835,
1770
+ "step": 2900
1771
+ },
1772
+ {
1773
+ "grad_norm": 0.8898280262947083,
1774
+ "learning_rate": 9.752978040785895e-05,
1775
+ "loss": 0.0949,
1776
+ "step": 2910
1777
+ },
1778
+ {
1779
+ "grad_norm": 0.899470329284668,
1780
+ "learning_rate": 9.750405095503859e-05,
1781
+ "loss": 0.0803,
1782
+ "step": 2920
1783
+ },
1784
+ {
1785
+ "grad_norm": 0.8418382406234741,
1786
+ "learning_rate": 9.747819162795686e-05,
1787
+ "loss": 0.1044,
1788
+ "step": 2930
1789
+ },
1790
+ {
1791
+ "grad_norm": 0.7426697015762329,
1792
+ "learning_rate": 9.745220249731217e-05,
1793
+ "loss": 0.0953,
1794
+ "step": 2940
1795
+ },
1796
+ {
1797
+ "grad_norm": 0.8499985933303833,
1798
+ "learning_rate": 9.742608363415781e-05,
1799
+ "loss": 0.0967,
1800
+ "step": 2950
1801
+ },
1802
+ {
1803
+ "grad_norm": 0.8468800783157349,
1804
+ "learning_rate": 9.739983510990176e-05,
1805
+ "loss": 0.0888,
1806
+ "step": 2960
1807
+ },
1808
+ {
1809
+ "grad_norm": 0.7636969089508057,
1810
+ "learning_rate": 9.737345699630647e-05,
1811
+ "loss": 0.1128,
1812
+ "step": 2970
1813
+ },
1814
+ {
1815
+ "grad_norm": 0.7591195106506348,
1816
+ "learning_rate": 9.734694936548869e-05,
1817
+ "loss": 0.1036,
1818
+ "step": 2980
1819
+ },
1820
+ {
1821
+ "grad_norm": 0.756714403629303,
1822
+ "learning_rate": 9.732031228991932e-05,
1823
+ "loss": 0.0833,
1824
+ "step": 2990
1825
+ },
1826
+ {
1827
+ "eval/loss": 0.08892382547259331,
1828
+ "step": 3000
1829
+ },
1830
+ {
1831
+ "grad_norm": 0.8152308464050293,
1832
+ "learning_rate": 9.729354584242302e-05,
1833
+ "loss": 0.0923,
1834
+ "step": 3000
1835
+ },
1836
+ {
1837
+ "grad_norm": 0.6225714683532715,
1838
+ "learning_rate": 9.726665009617832e-05,
1839
+ "loss": 0.0941,
1840
+ "step": 3010
1841
+ },
1842
+ {
1843
+ "grad_norm": 0.9567254185676575,
1844
+ "learning_rate": 9.723962512471714e-05,
1845
+ "loss": 0.116,
1846
+ "step": 3020
1847
+ },
1848
+ {
1849
+ "grad_norm": 0.7935624122619629,
1850
+ "learning_rate": 9.72124710019247e-05,
1851
+ "loss": 0.0843,
1852
+ "step": 3030
1853
+ },
1854
+ {
1855
+ "grad_norm": 0.9391128420829773,
1856
+ "learning_rate": 9.718518780203934e-05,
1857
+ "loss": 0.0871,
1858
+ "step": 3040
1859
+ },
1860
+ {
1861
+ "grad_norm": 0.8182015419006348,
1862
+ "learning_rate": 9.715777559965228e-05,
1863
+ "loss": 0.0915,
1864
+ "step": 3050
1865
+ },
1866
+ {
1867
+ "grad_norm": 0.6948622465133667,
1868
+ "learning_rate": 9.713023446970746e-05,
1869
+ "loss": 0.0814,
1870
+ "step": 3060
1871
+ },
1872
+ {
1873
+ "grad_norm": 0.8153758645057678,
1874
+ "learning_rate": 9.710256448750126e-05,
1875
+ "loss": 0.089,
1876
+ "step": 3070
1877
+ },
1878
+ {
1879
+ "grad_norm": 0.6592750549316406,
1880
+ "learning_rate": 9.707476572868235e-05,
1881
+ "loss": 0.1341,
1882
+ "step": 3080
1883
+ },
1884
+ {
1885
+ "grad_norm": 0.760163426399231,
1886
+ "learning_rate": 9.704683826925149e-05,
1887
+ "loss": 0.0784,
1888
+ "step": 3090
1889
+ },
1890
+ {
1891
+ "grad_norm": 0.6800974607467651,
1892
+ "learning_rate": 9.701878218556129e-05,
1893
+ "loss": 0.0969,
1894
+ "step": 3100
1895
+ },
1896
+ {
1897
+ "grad_norm": 0.7948629260063171,
1898
+ "learning_rate": 9.699059755431598e-05,
1899
+ "loss": 0.0847,
1900
+ "step": 3110
1901
+ },
1902
+ {
1903
+ "grad_norm": 0.7480330467224121,
1904
+ "learning_rate": 9.696228445257132e-05,
1905
+ "loss": 0.0823,
1906
+ "step": 3120
1907
+ },
1908
+ {
1909
+ "grad_norm": 0.8076481223106384,
1910
+ "learning_rate": 9.693384295773419e-05,
1911
+ "loss": 0.0859,
1912
+ "step": 3130
1913
+ },
1914
+ {
1915
+ "grad_norm": 0.9288508892059326,
1916
+ "learning_rate": 9.690527314756259e-05,
1917
+ "loss": 0.1012,
1918
+ "step": 3140
1919
+ },
1920
+ {
1921
+ "grad_norm": 0.833281397819519,
1922
+ "learning_rate": 9.687657510016527e-05,
1923
+ "loss": 0.0921,
1924
+ "step": 3150
1925
+ },
1926
+ {
1927
+ "grad_norm": 0.786363422870636,
1928
+ "learning_rate": 9.684774889400161e-05,
1929
+ "loss": 0.0769,
1930
+ "step": 3160
1931
+ },
1932
+ {
1933
+ "grad_norm": 0.8035467863082886,
1934
+ "learning_rate": 9.681879460788135e-05,
1935
+ "loss": 0.1043,
1936
+ "step": 3170
1937
+ },
1938
+ {
1939
+ "grad_norm": 0.8117609620094299,
1940
+ "learning_rate": 9.67897123209644e-05,
1941
+ "loss": 0.0946,
1942
+ "step": 3180
1943
+ },
1944
+ {
1945
+ "grad_norm": 0.8063046932220459,
1946
+ "learning_rate": 9.676050211276062e-05,
1947
+ "loss": 0.0901,
1948
+ "step": 3190
1949
+ },
1950
+ {
1951
+ "grad_norm": 0.6972727179527283,
1952
+ "learning_rate": 9.673116406312962e-05,
1953
+ "loss": 0.0719,
1954
+ "step": 3200
1955
+ },
1956
+ {
1957
+ "grad_norm": 0.7159572839736938,
1958
+ "learning_rate": 9.67016982522805e-05,
1959
+ "loss": 0.0821,
1960
+ "step": 3210
1961
+ },
1962
+ {
1963
+ "grad_norm": 0.7346596717834473,
1964
+ "learning_rate": 9.667210476077164e-05,
1965
+ "loss": 0.0872,
1966
+ "step": 3220
1967
+ },
1968
+ {
1969
+ "grad_norm": 0.6855632662773132,
1970
+ "learning_rate": 9.664238366951055e-05,
1971
+ "loss": 0.089,
1972
+ "step": 3230
1973
+ },
1974
+ {
1975
+ "grad_norm": 0.8691261410713196,
1976
+ "learning_rate": 9.661253505975355e-05,
1977
+ "loss": 0.0848,
1978
+ "step": 3240
1979
+ },
1980
+ {
1981
+ "grad_norm": 0.7525714039802551,
1982
+ "learning_rate": 9.658255901310557e-05,
1983
+ "loss": 0.0898,
1984
+ "step": 3250
1985
+ },
1986
+ {
1987
+ "grad_norm": 0.7712537050247192,
1988
+ "learning_rate": 9.655245561152e-05,
1989
+ "loss": 0.0661,
1990
+ "step": 3260
1991
+ },
1992
+ {
1993
+ "grad_norm": 0.8987488746643066,
1994
+ "learning_rate": 9.65222249372984e-05,
1995
+ "loss": 0.0946,
1996
+ "step": 3270
1997
+ },
1998
+ {
1999
+ "grad_norm": 0.7688019871711731,
2000
+ "learning_rate": 9.649186707309026e-05,
2001
+ "loss": 0.1034,
2002
+ "step": 3280
2003
+ },
2004
+ {
2005
+ "grad_norm": 0.8300652503967285,
2006
+ "learning_rate": 9.646138210189283e-05,
2007
+ "loss": 0.0984,
2008
+ "step": 3290
2009
+ },
2010
+ {
2011
+ "grad_norm": 0.7578057646751404,
2012
+ "learning_rate": 9.643077010705087e-05,
2013
+ "loss": 0.0892,
2014
+ "step": 3300
2015
+ },
2016
+ {
2017
+ "grad_norm": 0.6529524326324463,
2018
+ "learning_rate": 9.640003117225637e-05,
2019
+ "loss": 0.0895,
2020
+ "step": 3310
2021
+ },
2022
+ {
2023
+ "grad_norm": 0.8418911695480347,
2024
+ "learning_rate": 9.636916538154846e-05,
2025
+ "loss": 0.0809,
2026
+ "step": 3320
2027
+ },
2028
+ {
2029
+ "grad_norm": 0.7712683081626892,
2030
+ "learning_rate": 9.633817281931296e-05,
2031
+ "loss": 0.0686,
2032
+ "step": 3330
2033
+ },
2034
+ {
2035
+ "grad_norm": 0.7736837863922119,
2036
+ "learning_rate": 9.630705357028242e-05,
2037
+ "loss": 0.0807,
2038
+ "step": 3340
2039
+ },
2040
+ {
2041
+ "grad_norm": 0.6950215101242065,
2042
+ "learning_rate": 9.627580771953563e-05,
2043
+ "loss": 0.0891,
2044
+ "step": 3350
2045
+ },
2046
+ {
2047
+ "grad_norm": 0.6912685632705688,
2048
+ "learning_rate": 9.624443535249759e-05,
2049
+ "loss": 0.0837,
2050
+ "step": 3360
2051
+ },
2052
+ {
2053
+ "grad_norm": 0.8387035727500916,
2054
+ "learning_rate": 9.621293655493913e-05,
2055
+ "loss": 0.0944,
2056
+ "step": 3370
2057
+ },
2058
+ {
2059
+ "grad_norm": 0.7013605237007141,
2060
+ "learning_rate": 9.618131141297675e-05,
2061
+ "loss": 0.0868,
2062
+ "step": 3380
2063
+ },
2064
+ {
2065
+ "grad_norm": 0.8324646353721619,
2066
+ "learning_rate": 9.614956001307242e-05,
2067
+ "loss": 0.0789,
2068
+ "step": 3390
2069
+ },
2070
+ {
2071
+ "grad_norm": 0.7250398993492126,
2072
+ "learning_rate": 9.611768244203321e-05,
2073
+ "loss": 0.0795,
2074
+ "step": 3400
2075
+ },
2076
+ {
2077
+ "grad_norm": 0.8336584568023682,
2078
+ "learning_rate": 9.60856787870112e-05,
2079
+ "loss": 0.0816,
2080
+ "step": 3410
2081
+ },
2082
+ {
2083
+ "grad_norm": 0.8211973309516907,
2084
+ "learning_rate": 9.605354913550318e-05,
2085
+ "loss": 0.089,
2086
+ "step": 3420
2087
+ },
2088
+ {
2089
+ "grad_norm": 0.9170548915863037,
2090
+ "learning_rate": 9.602129357535037e-05,
2091
+ "loss": 0.0747,
2092
+ "step": 3430
2093
+ },
2094
+ {
2095
+ "grad_norm": 0.7421762943267822,
2096
+ "learning_rate": 9.598891219473825e-05,
2097
+ "loss": 0.0767,
2098
+ "step": 3440
2099
+ },
2100
+ {
2101
+ "grad_norm": 0.7628731727600098,
2102
+ "learning_rate": 9.595640508219625e-05,
2103
+ "loss": 0.0889,
2104
+ "step": 3450
2105
+ },
2106
+ {
2107
+ "grad_norm": 0.5860986113548279,
2108
+ "learning_rate": 9.592377232659761e-05,
2109
+ "loss": 0.0737,
2110
+ "step": 3460
2111
+ },
2112
+ {
2113
+ "grad_norm": 0.6898937821388245,
2114
+ "learning_rate": 9.589101401715904e-05,
2115
+ "loss": 0.0938,
2116
+ "step": 3470
2117
+ },
2118
+ {
2119
+ "grad_norm": 0.7277891635894775,
2120
+ "learning_rate": 9.585813024344045e-05,
2121
+ "loss": 0.0868,
2122
+ "step": 3480
2123
+ },
2124
+ {
2125
+ "grad_norm": 0.6677371263504028,
2126
+ "learning_rate": 9.58251210953449e-05,
2127
+ "loss": 0.0785,
2128
+ "step": 3490
2129
+ },
2130
+ {
2131
+ "eval/loss": 0.09203485958278179,
2132
+ "step": 3500
2133
+ },
2134
+ {
2135
+ "grad_norm": 0.7086572051048279,
2136
+ "learning_rate": 9.579198666311809e-05,
2137
+ "loss": 0.1026,
2138
+ "step": 3500
2139
+ },
2140
+ {
2141
+ "grad_norm": 0.6426539421081543,
2142
+ "learning_rate": 9.575872703734832e-05,
2143
+ "loss": 0.072,
2144
+ "step": 3510
2145
+ },
2146
+ {
2147
+ "grad_norm": 0.705496609210968,
2148
+ "learning_rate": 9.572534230896611e-05,
2149
+ "loss": 0.0736,
2150
+ "step": 3520
2151
+ },
2152
+ {
2153
+ "grad_norm": 0.861102283000946,
2154
+ "learning_rate": 9.569183256924403e-05,
2155
+ "loss": 0.0712,
2156
+ "step": 3530
2157
+ },
2158
+ {
2159
+ "grad_norm": 0.8250711560249329,
2160
+ "learning_rate": 9.565819790979646e-05,
2161
+ "loss": 0.0907,
2162
+ "step": 3540
2163
+ },
2164
+ {
2165
+ "grad_norm": 0.76627117395401,
2166
+ "learning_rate": 9.562443842257925e-05,
2167
+ "loss": 0.0833,
2168
+ "step": 3550
2169
+ },
2170
+ {
2171
+ "grad_norm": 0.599372148513794,
2172
+ "learning_rate": 9.559055419988956e-05,
2173
+ "loss": 0.0809,
2174
+ "step": 3560
2175
+ },
2176
+ {
2177
+ "grad_norm": 0.8596682548522949,
2178
+ "learning_rate": 9.555654533436557e-05,
2179
+ "loss": 0.091,
2180
+ "step": 3570
2181
+ },
2182
+ {
2183
+ "grad_norm": 0.8261439800262451,
2184
+ "learning_rate": 9.552241191898621e-05,
2185
+ "loss": 0.0799,
2186
+ "step": 3580
2187
+ },
2188
+ {
2189
+ "grad_norm": 0.6706359386444092,
2190
+ "learning_rate": 9.548815404707092e-05,
2191
+ "loss": 0.0991,
2192
+ "step": 3590
2193
+ },
2194
+ {
2195
+ "grad_norm": 0.7521600723266602,
2196
+ "learning_rate": 9.545377181227942e-05,
2197
+ "loss": 0.0848,
2198
+ "step": 3600
2199
+ },
2200
+ {
2201
+ "grad_norm": 0.8199614882469177,
2202
+ "learning_rate": 9.541926530861145e-05,
2203
+ "loss": 0.0956,
2204
+ "step": 3610
2205
+ },
2206
+ {
2207
+ "grad_norm": 0.7299994230270386,
2208
+ "learning_rate": 9.538463463040645e-05,
2209
+ "loss": 0.0728,
2210
+ "step": 3620
2211
+ },
2212
+ {
2213
+ "grad_norm": 0.7606593370437622,
2214
+ "learning_rate": 9.534987987234337e-05,
2215
+ "loss": 0.0846,
2216
+ "step": 3630
2217
+ },
2218
+ {
2219
+ "grad_norm": 0.7459183931350708,
2220
+ "learning_rate": 9.53150011294404e-05,
2221
+ "loss": 0.0862,
2222
+ "step": 3640
2223
+ },
2224
+ {
2225
+ "grad_norm": 0.6445205807685852,
2226
+ "learning_rate": 9.527999849705471e-05,
2227
+ "loss": 0.0853,
2228
+ "step": 3650
2229
+ },
2230
+ {
2231
+ "grad_norm": 0.7891181707382202,
2232
+ "learning_rate": 9.524487207088213e-05,
2233
+ "loss": 0.0746,
2234
+ "step": 3660
2235
+ },
2236
+ {
2237
+ "grad_norm": 0.7532823085784912,
2238
+ "learning_rate": 9.520962194695698e-05,
2239
+ "loss": 0.0804,
2240
+ "step": 3670
2241
+ },
2242
+ {
2243
+ "grad_norm": 0.6685933470726013,
2244
+ "learning_rate": 9.517424822165175e-05,
2245
+ "loss": 0.0881,
2246
+ "step": 3680
2247
+ },
2248
+ {
2249
+ "grad_norm": 0.7136467695236206,
2250
+ "learning_rate": 9.513875099167685e-05,
2251
+ "loss": 0.0589,
2252
+ "step": 3690
2253
+ },
2254
+ {
2255
+ "grad_norm": 0.7880435585975647,
2256
+ "learning_rate": 9.510313035408035e-05,
2257
+ "loss": 0.0866,
2258
+ "step": 3700
2259
+ },
2260
+ {
2261
+ "grad_norm": 0.681725263595581,
2262
+ "learning_rate": 9.506738640624775e-05,
2263
+ "loss": 0.0821,
2264
+ "step": 3710
2265
+ },
2266
+ {
2267
+ "grad_norm": 0.7891600131988525,
2268
+ "learning_rate": 9.50315192459016e-05,
2269
+ "loss": 0.0673,
2270
+ "step": 3720
2271
+ },
2272
+ {
2273
+ "grad_norm": 0.8075012564659119,
2274
+ "learning_rate": 9.499552897110136e-05,
2275
+ "loss": 0.0909,
2276
+ "step": 3730
2277
+ },
2278
+ {
2279
+ "grad_norm": 0.5734759569168091,
2280
+ "learning_rate": 9.495941568024304e-05,
2281
+ "loss": 0.0831,
2282
+ "step": 3740
2283
+ },
2284
+ {
2285
+ "grad_norm": 0.6696135997772217,
2286
+ "learning_rate": 9.492317947205904e-05,
2287
+ "loss": 0.0726,
2288
+ "step": 3750
2289
+ },
2290
+ {
2291
+ "grad_norm": 0.6502518057823181,
2292
+ "learning_rate": 9.488682044561775e-05,
2293
+ "loss": 0.0813,
2294
+ "step": 3760
2295
+ },
2296
+ {
2297
+ "grad_norm": 0.7042556405067444,
2298
+ "learning_rate": 9.485033870032335e-05,
2299
+ "loss": 0.0871,
2300
+ "step": 3770
2301
+ },
2302
+ {
2303
+ "grad_norm": 0.666341245174408,
2304
+ "learning_rate": 9.481373433591556e-05,
2305
+ "loss": 0.0794,
2306
+ "step": 3780
2307
+ },
2308
+ {
2309
+ "grad_norm": 0.8834477663040161,
2310
+ "learning_rate": 9.47770074524693e-05,
2311
+ "loss": 0.0833,
2312
+ "step": 3790
2313
+ },
2314
+ {
2315
+ "grad_norm": 0.6137105226516724,
2316
+ "learning_rate": 9.474015815039446e-05,
2317
+ "loss": 0.0939,
2318
+ "step": 3800
2319
+ },
2320
+ {
2321
+ "grad_norm": 0.6735588312149048,
2322
+ "learning_rate": 9.470318653043565e-05,
2323
+ "loss": 0.076,
2324
+ "step": 3810
2325
+ },
2326
+ {
2327
+ "grad_norm": 0.7587777972221375,
2328
+ "learning_rate": 9.466609269367185e-05,
2329
+ "loss": 0.0701,
2330
+ "step": 3820
2331
+ },
2332
+ {
2333
+ "grad_norm": 0.6362771987915039,
2334
+ "learning_rate": 9.46288767415162e-05,
2335
+ "loss": 0.0718,
2336
+ "step": 3830
2337
+ },
2338
+ {
2339
+ "grad_norm": 0.7394053339958191,
2340
+ "learning_rate": 9.459153877571567e-05,
2341
+ "loss": 0.0756,
2342
+ "step": 3840
2343
+ },
2344
+ {
2345
+ "grad_norm": 0.7164848446846008,
2346
+ "learning_rate": 9.455407889835087e-05,
2347
+ "loss": 0.0734,
2348
+ "step": 3850
2349
+ },
2350
+ {
2351
+ "grad_norm": 0.6653488278388977,
2352
+ "learning_rate": 9.451649721183564e-05,
2353
+ "loss": 0.0755,
2354
+ "step": 3860
2355
+ },
2356
+ {
2357
+ "grad_norm": 0.7268480658531189,
2358
+ "learning_rate": 9.447879381891692e-05,
2359
+ "loss": 0.0937,
2360
+ "step": 3870
2361
+ },
2362
+ {
2363
+ "grad_norm": 0.6954826712608337,
2364
+ "learning_rate": 9.444096882267428e-05,
2365
+ "loss": 0.1,
2366
+ "step": 3880
2367
+ },
2368
+ {
2369
+ "grad_norm": 0.6395136117935181,
2370
+ "learning_rate": 9.440302232651988e-05,
2371
+ "loss": 0.0955,
2372
+ "step": 3890
2373
+ },
2374
+ {
2375
+ "grad_norm": 0.5530162453651428,
2376
+ "learning_rate": 9.436495443419795e-05,
2377
+ "loss": 0.0884,
2378
+ "step": 3900
2379
+ },
2380
+ {
2381
+ "grad_norm": 0.6270701885223389,
2382
+ "learning_rate": 9.432676524978466e-05,
2383
+ "loss": 0.0939,
2384
+ "step": 3910
2385
+ },
2386
+ {
2387
+ "grad_norm": 0.6683644652366638,
2388
+ "learning_rate": 9.42884548776878e-05,
2389
+ "loss": 0.0846,
2390
+ "step": 3920
2391
+ },
2392
+ {
2393
+ "grad_norm": 0.5946120619773865,
2394
+ "learning_rate": 9.425002342264646e-05,
2395
+ "loss": 0.0716,
2396
+ "step": 3930
2397
+ },
2398
+ {
2399
+ "grad_norm": 0.643402099609375,
2400
+ "learning_rate": 9.421147098973077e-05,
2401
+ "loss": 0.0779,
2402
+ "step": 3940
2403
+ },
2404
+ {
2405
+ "grad_norm": 0.604381263256073,
2406
+ "learning_rate": 9.41727976843416e-05,
2407
+ "loss": 0.0651,
2408
+ "step": 3950
2409
+ },
2410
+ {
2411
+ "grad_norm": 0.5524080991744995,
2412
+ "learning_rate": 9.413400361221029e-05,
2413
+ "loss": 0.0781,
2414
+ "step": 3960
2415
+ },
2416
+ {
2417
+ "grad_norm": 0.6096197366714478,
2418
+ "learning_rate": 9.409508887939835e-05,
2419
+ "loss": 0.1109,
2420
+ "step": 3970
2421
+ },
2422
+ {
2423
+ "grad_norm": 0.7158094048500061,
2424
+ "learning_rate": 9.40560535922972e-05,
2425
+ "loss": 0.0746,
2426
+ "step": 3980
2427
+ },
2428
+ {
2429
+ "grad_norm": 0.6933304667472839,
2430
+ "learning_rate": 9.40168978576278e-05,
2431
+ "loss": 0.0624,
2432
+ "step": 3990
2433
+ },
2434
+ {
2435
+ "eval/loss": 0.09422188766300678,
2436
+ "step": 4000
2437
+ },
2438
+ {
2439
+ "grad_norm": 0.690434455871582,
2440
+ "learning_rate": 9.397762178244043e-05,
2441
+ "loss": 0.0626,
2442
+ "step": 4000
2443
+ }
2444
+ ],
2445
+ "logging_steps": 10,
2446
+ "max_steps": 20000,
2447
+ "num_input_tokens_seen": 0,
2448
+ "num_train_epochs": 9223372036854775807,
2449
+ "save_steps": 2000,
2450
+ "stateful_callbacks": {
2451
+ "TrainerControl": {
2452
+ "args": {
2453
+ "should_epoch_stop": false,
2454
+ "should_evaluate": false,
2455
+ "should_log": false,
2456
+ "should_save": true,
2457
+ "should_training_stop": false
2458
+ },
2459
+ "attributes": {}
2460
+ }
2461
+ },
2462
+ "total_flos": 0.0,
2463
+ "train_batch_size": 8,
2464
+ "trial_name": null,
2465
+ "trial_params": null
2466
+ }
checkpoint-4000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:521ef611c73b3813b50d95976021bd03f553b27ba895bdffb88ecbbf626ff345
3
+ size 7825
checkpoint-4000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "groot-finetune", "run_id": "g1_finetune-20260527-142325"}
checkpoint-4000/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
@dataclass
class zero_model_state:
    """Model-state data extracted from one ``*_model_states.pt`` file.

    Instances are created by ``parse_model_states``. The entries that function
    reads with ``dict.get`` (``ds_version``, ``frozen_param_shapes``,
    ``frozen_param_fragments``) are ``None`` when the checkpoint lacks them.
    """
    # buffer name -> buffer tensor (converted with ``.float()`` when parsed)
    buffers: dict
    # sequence of ``{param name: shape}`` mappings; consumers iterate it and
    # zip it with the flat fp32 groups
    param_shapes: list
    # list of ``[key, value]`` pairs; a pair's first name is aliased to the
    # tensor stored under its second name
    shared_params: list
    # value stored under DS_VERSION, or None when absent
    ds_version: int
    # frozen param name -> shape, or None when absent
    frozen_param_shapes: dict
    # frozen param name -> tensor fragment held by this rank, or None when absent
    frozen_param_fragments: dict
47
+
48
+
49
# Verbose-diagnostics switch, off by default; the helpers in this file test it
# with ``if debug:`` and the command-line entry point overwrites it from ``--debug``.
debug = 0

# Checkpoint files are deserialized onto the CPU: this object is handed to
# ``torch.load`` as ``map_location``.
device = torch.device("cpu")
53
+
54
+
55
def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` itself."""
    if text.isdigit():
        return int(text)
    return text
57
+
58
+
59
def natural_keys(text):
    '''
    Sort key that orders digit runs numerically ("human" order), meant for
    ``alist.sort(key=natural_keys)``.

    The text is split around runs of digits; each run becomes an ``int`` and
    every other piece is kept as a string.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return [int(piece) if piece.isdigit() else piece for piece in pieces]
66
+
67
+
68
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the rank-0 model-states file for the given ZeRO stage.

    Args:
        checkpoint_dir: directory expected to contain the model-states file.
        zero_stage: ZeRO stage of the checkpoint (values <= 2, or 3).

    Returns:
        Path of the model-states file inside ``checkpoint_dir``.

    Raises:
        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the
            expected file is missing.
        ValueError: if ``zero_stage`` is neither <= 2 nor 3.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file_name = "mp_rank_00_model_states.pt"
    elif zero_stage == 3:
        file_name = "zero_pp_rank_0_mp_rank_00_model_states.pt"
    else:
        # without this branch the path variable would be unbound below
        raise ValueError(f"unknown zero stage {zero_stage}")

    file = os.path.join(checkpoint_dir, file_name)
    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
82
+
83
+
84
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural sort order.

    Raises:
        FileNotFoundError: if nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    search_pattern = os.path.join(checkpoint_dir, glob_pattern)
    matches = glob.glob(search_pattern)
    matches.sort(key=natural_keys)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return matches
92
+
93
+
94
def get_optim_files(checkpoint_dir):
    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
    optim_glob = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_glob)
96
+
97
+
98
def get_model_state_files(checkpoint_dir):
    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
    model_glob = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_glob)
100
+
101
+
102
def parse_model_states(files):
    """Load each model-states file and keep only what weight reconstruction needs.

    Args:
        files: paths of ``*_model_states.pt`` files.

    Returns:
        A list with one ``zero_model_state`` per file, in the order given.

    Raises:
        ValueError: if a file has no ``BUFFER_NAMES`` entry, i.e. it is not a
            model-state checkpoint.
    """
    zero_model_states = []
    for file in files:
        # NOTE(security): weights_only=False unpickles arbitrary objects, so only
        # run this on checkpoints from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
146
+
147
+
148
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer-state shards and extract the flat fp32 weight groups.

    Args:
        files: paths of the ``*_optim_states.pt`` files, one per rank.
        ds_checkpoint_dir: checkpoint folder, used only in the error message.

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups``
        holds one entry per file, in file order.

    Raises:
        ValueError: if the first file is not a ZeRO checkpoint, if the number of
            files differs from the recorded partition count, or if the ZeRO
            stage is unknown.
    """
    total_files = len(files)
    state_dicts = []
    for f in tqdm(files, desc='Loading checkpoint shards'):
        # NOTE(security): weights_only=False unpickles arbitrary objects, so only
        # run this on checkpoints from a trusted source.
        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Rebuild the fp32 state_dict from the files of one DeepSpeed checkpoint folder

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result

    Returns:
        - the state_dict produced by the stage-specific merge routine

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the merge routine matching the detected stage
    if zero_stage <= 2:
        merge = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        merge = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None
    return merge(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.

    For every param group the per-rank flat partitions are concatenated into
    one vector, and each parameter is then cut out of that vector at a running
    offset and viewed with its recorded shape.

    Args:
        state_dict: mapping that receives ``name -> tensor`` entries.
        world_size: number of ranks; also determines the alignment granularity.
        fp32_flat_groups: per-rank lists of flat fp32 partitions, one per param group.
        zero_model_states: parsed model states; rank 0's ``param_shapes`` is used.

    Raises:
        ValueError: if, after alignment, the consumed element count of a group
            does not match the elements available in it.
    """
    param_shapes = zero_model_states[0].param_shapes

    def shape_numel(shape):
        # a shape either offers ``numel()`` or is a plain sequence of ints
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        merged_single_partition_of_fp32_groups.append(torch.cat(merged_partitions, 0))

    if debug:
        avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        # use the same numel rule as the reconstruction loop so sequence shapes don't crash here
        wanted_numel = sum(shape_numel(shape) for shapes in param_shapes for shape in shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():
            unpartitioned_numel = shape_numel(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the state_dict of a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen parameters
    (unless ``exclude_frozen_parameters``), trainable parameters, and finally
    the shared-parameter aliases whose source tensor is present.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
346
+
347
+
348
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks.

    ``partitioned_numel`` is the element count divided by ``world_size`` and
    rounded up; ``padding_numel`` is how many elements are missing to make the
    count a multiple of ``world_size``.
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel
353
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    Only references to the per-rank flat groups are stored; the real tensor is
    built when ``contiguous()`` is called.

    Args:
        flat_groups: one list of flat tensors per rank (``flat_groups[rank][group]``).
        flat_groups_offset: cumulative element offsets of the groups within one
            rank, starting at 0 and holding one more entry than there are groups.
        offset: start of this parameter's partition within a rank's concatenated groups.
        partitioned_numel: number of elements each rank holds for this parameter.
        shape: full parameter shape; must provide ``numel()``.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def _locate_groups(self, end_idx):
        """Return ``(first, last)`` ids of the groups covering ``[self.offset, end_idx)``."""
        start_group_id = None
        end_group_id = None
        # stop one short of the last offset: every test looks at ``group_id + 1``
        for group_id in range(len(self.flat_groups_offset) - 1):
            group_start = self.flat_groups_offset[group_id]
            group_end = self.flat_groups_offset[group_id + 1]
            if group_start <= self.offset < group_end:
                start_group_id = group_id
            if group_start < end_idx <= group_end:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"partition [{self.offset}, {end_idx}) does not fall inside the flat groups "
                             f"(offsets: {list(self.flat_groups_offset)})")
        return start_group_id, end_group_id

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns a contiguous tensor of shape ``self.shape`` built from each
        rank's slice in rank order, with trailing padding dropped.
        """
        end_idx = self.offset + self.partitioned_numel
        # the covering groups depend only on the offsets, not on the rank
        start_group_id, end_group_id = self._locate_groups(end_idx)
        pad_flat_param_chunks = []

        for flat_groups_at_rank_i in self.flat_groups:
            # collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                group_start = self.flat_groups_offset[group_id]
                group_end = self.flat_groups_offset[group_id + 1]
                flat_tensor = flat_groups_at_rank_i[group_id]
                # clamp to the group's own range: for a group after the first one the
                # partition starts at the group's beginning, not at a negative index
                start_offset = max(self.offset, group_start) - group_start
                end_offset = min(end_idx, group_end) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
435
+
436
+
437
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Register the trainable parameters of a ZeRO stage 3 checkpoint in ``state_dict``.

    Each parameter is stored as a ``GatheredTensor`` that remembers where its
    per-rank partition lives inside the flat groups; the data is only merged
    when ``contiguous()`` is called on it.

    Args:
        state_dict: mapping that receives ``name -> GatheredTensor`` entries.
        world_size: number of ranks.
        fp32_flat_groups: per-rank lists of flat fp32 tensors.
        zero_model_states: parsed model states; rank 0's ``param_shapes`` is used.

    Raises:
        ValueError: if the partitions consumed across all ranks do not add up to
            the elements available in the flat groups.
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        # every rank holds a *list* of flat groups, so report each group separately
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # ``offset`` counted one rank's share; scale it to compare with the total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the state_dict of a ZeRO stage 3 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen parameters
    (unless ``exclude_frozen_parameters``), trainable parameters, and finally
    the shared-parameter aliases whose source tensor is present.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
511
+
512
+
513
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Turn every value of ``state_dict`` into a torch tensor via ``.contiguous()``.

    Values that are the very same object under several names are converted once
    and the result is shared between those names. With ``return_empty_tensor``
    an uninitialized tensor of the same shape and dtype is created instead of
    calling ``.contiguous()``.
    """
    torch_state_dict = {}
    first_name_by_id = {}
    for name, tensor in state_dict.items():
        key = id(tensor)
        if key in first_name_by_id:  # shared tensors
            torch_state_dict[name] = torch_state_dict[first_name_by_id[key]]
            continue

        first_name_by_id[key] = name
        if return_empty_tensor:
            converted = torch.empty(tensor.shape, dtype=tensor.dtype)
        else:
            converted = tensor.contiguous()
        torch_state_dict[name] = converted
    return torch_state_dict
531
+
532
+
533
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert a ZeRO 2 or 3 checkpoint into a single consolidated fp32 state_dict that can be loaded
    with ``load_state_dict()`` and used for training without DeepSpeed, or shared with others, for
    example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint, e.g. ``global_step14``. If not provided, the tag is read from the file named ``latest`` in ``checkpoint_dir``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: return the state_dict in lazy mode. The values are then pseudo tensors instead of torch tensors, which is more memory efficient.
          Convert a pseudo tensor to a torch tensor with ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and ``checkpoint_dir`` has no ``latest`` file
        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder doesn't exist

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load the state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it is no longer in use
    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    return state_dict if lazy_mode else to_torch_tensor(state_dict)
596
+
597
+
598
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert a ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    The weights are written shard by shard; when the result is sharded, an index json mapping each
    tensor to its file is written as well.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory that receives the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size of a checkpoint file before it gets sharded, default value is 5GB. ``None`` writes a single file
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, the tag is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    # Dependency pre-check
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    if safe_serialization:
        weights_name = "model.safetensors"
    else:
        weights_name = "pytorch_model.bin"

    if max_shard_size is None:
        # single file: mimic the two attributes of the shard-split result used below
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False, filename_to_tensors={weights_name: list(state_dict)})
    else:
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
        placeholder_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(placeholder_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    shard_plan = state_dict_split.filename_to_tensors.items()
    for shard_file, tensor_names in tqdm(shard_plan, desc="Saving checkpoint shards"):
        lazy_shard = {tensor_name: state_dict[tensor_name] for tensor_name in tensor_names}
        shard_state_dict = to_torch_tensor(lazy_shard)
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard_state_dict, output_path)
        # release the memory of current shard
        for tensor_name in list(shard_state_dict.keys()):
            del state_dict[tensor_name]
            del shard_state_dict[tensor_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if not state_dict_split.is_sharded:
        return

    index = {
        "metadata": state_dict_split.metadata,
        "weight_map": state_dict_split.tensor_to_filename,
    }
    index_name = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
    index_path = os.path.join(output_dir, index_name)
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
681
+
682
+
683
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Extract a single consolidated fp32 ``state_dict`` from the ZeRO 2 or 3 checkpoint
    2. Move the provided model to cpu
    3. Load the state_dict into it (with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, the tag is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
720
+
721
+
722
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and convert a ZeRO checkpoint
    # into fp32 state_dict file(s) written under ``output_dir``.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # NOTE: adjacent string literals are concatenated verbatim, so each fragment
    # below must carry its own trailing space or the help text runs words together.
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    # ``store_true`` already defaults to False, so no explicit default is needed.
    parser.add_argument(
        "--safe_serialization",
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # This runs at module scope, so it rebinds the module-level name ``debug``
    # (presumably consulted by helpers elsewhere in this file -- confirm).
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)