quao627 committed on
Commit
9463593
·
verified ·
1 Parent(s): 99976cb

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
args.json ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen2.5-7B",
3
+ "model_type": "qwen2_5",
4
+ "model_revision": null,
5
+ "task_type": "causal_lm",
6
+ "torch_dtype": "bfloat16",
7
+ "attn_impl": null,
8
+ "num_labels": null,
9
+ "problem_type": null,
10
+ "rope_scaling": null,
11
+ "device_map": null,
12
+ "max_memory": {},
13
+ "local_repo_path": null,
14
+ "template": "qwen2_5",
15
+ "system": null,
16
+ "max_length": 8192,
17
+ "truncation_strategy": "delete",
18
+ "max_pixels": null,
19
+ "agent_template": null,
20
+ "norm_bbox": null,
21
+ "response_prefix": null,
22
+ "padding_side": "right",
23
+ "loss_scale": "default",
24
+ "sequence_parallel_size": 1,
25
+ "use_chat_template": true,
26
+ "template_backend": "swift",
27
+ "dataset": [
28
+ "./RAG_train_sft.json"
29
+ ],
30
+ "val_dataset": [],
31
+ "split_dataset_ratio": 0.01,
32
+ "data_seed": 42,
33
+ "dataset_num_proc": 1,
34
+ "dataset_shuffle": true,
35
+ "val_dataset_shuffle": false,
36
+ "streaming": false,
37
+ "interleave_prob": null,
38
+ "stopping_strategy": "first_exhausted",
39
+ "shuffle_buffer_size": 1000,
40
+ "enable_cache": false,
41
+ "download_mode": "reuse_dataset_if_exists",
42
+ "columns": {},
43
+ "strict": false,
44
+ "remove_unused_columns": true,
45
+ "model_name": [
46
+ null,
47
+ null
48
+ ],
49
+ "model_author": [
50
+ null,
51
+ null
52
+ ],
53
+ "custom_dataset_info": [],
54
+ "quant_method": null,
55
+ "quant_bits": null,
56
+ "hqq_axis": null,
57
+ "bnb_4bit_compute_dtype": "bfloat16",
58
+ "bnb_4bit_quant_type": "nf4",
59
+ "bnb_4bit_use_double_quant": true,
60
+ "bnb_4bit_quant_storage": null,
61
+ "max_new_tokens": 64,
62
+ "temperature": 0.0,
63
+ "top_k": null,
64
+ "top_p": null,
65
+ "repetition_penalty": null,
66
+ "num_beams": 1,
67
+ "stream": false,
68
+ "stop_words": [],
69
+ "logprobs": false,
70
+ "top_logprobs": null,
71
+ "ckpt_dir": null,
72
+ "load_dataset_config": null,
73
+ "lora_modules": [],
74
+ "tuner_backend": "peft",
75
+ "train_type": "full",
76
+ "adapters": [],
77
+ "external_plugins": [],
78
+ "seed": 42,
79
+ "model_kwargs": {},
80
+ "load_args": false,
81
+ "load_data_args": false,
82
+ "use_hf": true,
83
+ "hub_token": null,
84
+ "custom_register_path": [],
85
+ "ignore_args_error": false,
86
+ "use_swift_lora": false,
87
+ "output_dir": "/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818",
88
+ "overwrite_output_dir": false,
89
+ "do_train": false,
90
+ "do_eval": false,
91
+ "do_predict": false,
92
+ "eval_strategy": "steps",
93
+ "prediction_loss_only": false,
94
+ "per_device_train_batch_size": 1,
95
+ "per_device_eval_batch_size": 1,
96
+ "per_gpu_train_batch_size": null,
97
+ "per_gpu_eval_batch_size": null,
98
+ "gradient_accumulation_steps": 16,
99
+ "eval_accumulation_steps": null,
100
+ "eval_delay": 0,
101
+ "torch_empty_cache_steps": null,
102
+ "learning_rate": 0.0001,
103
+ "weight_decay": 0.1,
104
+ "adam_beta1": 0.9,
105
+ "adam_beta2": 0.95,
106
+ "adam_epsilon": 1e-08,
107
+ "max_grad_norm": 1.0,
108
+ "num_train_epochs": 1.0,
109
+ "max_steps": -1,
110
+ "lr_scheduler_type": "cosine",
111
+ "lr_scheduler_kwargs": null,
112
+ "warmup_ratio": 0.0,
113
+ "warmup_steps": 0,
114
+ "log_level": "passive",
115
+ "log_level_replica": "warning",
116
+ "log_on_each_node": true,
117
+ "logging_dir": "/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818/runs",
118
+ "logging_strategy": "steps",
119
+ "logging_first_step": true,
120
+ "logging_steps": 5,
121
+ "logging_nan_inf_filter": true,
122
+ "save_strategy": "steps",
123
+ "save_steps": 50.0,
124
+ "save_total_limit": 2,
125
+ "save_safetensors": true,
126
+ "save_on_each_node": false,
127
+ "save_only_model": false,
128
+ "restore_callback_states_from_checkpoint": false,
129
+ "no_cuda": false,
130
+ "use_cpu": false,
131
+ "use_mps_device": false,
132
+ "jit_mode_eval": false,
133
+ "use_ipex": false,
134
+ "bf16": true,
135
+ "fp16": false,
136
+ "fp16_opt_level": "O1",
137
+ "half_precision_backend": "auto",
138
+ "bf16_full_eval": false,
139
+ "fp16_full_eval": false,
140
+ "tf32": null,
141
+ "local_rank": -1,
142
+ "ddp_backend": null,
143
+ "tpu_num_cores": null,
144
+ "tpu_metrics_debug": false,
145
+ "debug": null,
146
+ "dataloader_drop_last": false,
147
+ "eval_steps": 50.0,
148
+ "dataloader_num_workers": null,
149
+ "dataloader_prefetch_factor": null,
150
+ "past_index": -1,
151
+ "run_name": null,
152
+ "disable_tqdm": null,
153
+ "label_names": null,
154
+ "load_best_model_at_end": false,
155
+ "metric_for_best_model": "loss",
156
+ "greater_is_better": false,
157
+ "ignore_data_skip": false,
158
+ "fsdp": "",
159
+ "fsdp_min_num_params": 0,
160
+ "fsdp_config": null,
161
+ "tp_size": 0,
162
+ "fsdp_transformer_layer_cls_to_wrap": null,
163
+ "accelerator_config": {
164
+ "dispatch_batches": false
165
+ },
166
+ "deepspeed": null,
167
+ "label_smoothing_factor": 0.0,
168
+ "optim": "adamw_torch",
169
+ "optim_args": null,
170
+ "adafactor": false,
171
+ "group_by_length": false,
172
+ "length_column_name": "length",
173
+ "report_to": [
174
+ "tensorboard"
175
+ ],
176
+ "ddp_find_unused_parameters": null,
177
+ "ddp_bucket_cap_mb": null,
178
+ "ddp_broadcast_buffers": null,
179
+ "dataloader_pin_memory": true,
180
+ "dataloader_persistent_workers": false,
181
+ "skip_memory_metrics": true,
182
+ "use_legacy_prediction_loop": false,
183
+ "push_to_hub": false,
184
+ "resume_from_checkpoint": null,
185
+ "hub_model_id": null,
186
+ "hub_strategy": "every_save",
187
+ "hub_private_repo": null,
188
+ "hub_always_push": false,
189
+ "gradient_checkpointing": true,
190
+ "gradient_checkpointing_kwargs": null,
191
+ "include_inputs_for_metrics": false,
192
+ "include_for_metrics": [],
193
+ "eval_do_concat_batches": true,
194
+ "fp16_backend": "auto",
195
+ "push_to_hub_model_id": null,
196
+ "push_to_hub_organization": null,
197
+ "push_to_hub_token": null,
198
+ "mp_parameters": "",
199
+ "auto_find_batch_size": false,
200
+ "full_determinism": false,
201
+ "torchdynamo": null,
202
+ "ray_scope": "last",
203
+ "ddp_timeout": 1800,
204
+ "torch_compile": false,
205
+ "torch_compile_backend": null,
206
+ "torch_compile_mode": null,
207
+ "include_tokens_per_second": false,
208
+ "include_num_input_tokens_seen": false,
209
+ "neftune_noise_alpha": null,
210
+ "optim_target_modules": null,
211
+ "batch_eval_metrics": false,
212
+ "eval_on_start": false,
213
+ "use_liger_kernel": false,
214
+ "eval_use_gather_object": false,
215
+ "average_tokens_across_devices": false,
216
+ "sortish_sampler": false,
217
+ "predict_with_generate": false,
218
+ "generation_max_length": null,
219
+ "generation_num_beams": null,
220
+ "generation_config": null,
221
+ "check_model": true,
222
+ "acc_strategy": "token",
223
+ "train_dataloader_shuffle": true,
224
+ "metric_warmup_step": 0,
225
+ "fsdp_num": 1,
226
+ "acc_steps": 1,
227
+ "eval_use_evalscope": false,
228
+ "eval_datasets": [],
229
+ "eval_limit": null,
230
+ "eval_datasets_args": null,
231
+ "eval_generation_config": null,
232
+ "freeze_parameters": [],
233
+ "freeze_parameters_ratio": 0.0,
234
+ "trainable_parameters": [],
235
+ "freeze_llm": false,
236
+ "freeze_vit": true,
237
+ "freeze_aligner": true,
238
+ "target_modules": [
239
+ "all-linear"
240
+ ],
241
+ "target_regex": null,
242
+ "modules_to_save": [],
243
+ "lora_rank": 8,
244
+ "lora_alpha": 32,
245
+ "lora_dropout": 0.05,
246
+ "lora_bias": "none",
247
+ "lora_dtype": null,
248
+ "lorap_lr_ratio": null,
249
+ "use_rslora": false,
250
+ "use_dora": false,
251
+ "lora_ga_batch_size": 2,
252
+ "lora_ga_iters": 2,
253
+ "lora_ga_max_length": 1024,
254
+ "lora_ga_direction": "ArB2r",
255
+ "lora_ga_scale": "stable",
256
+ "lora_ga_stable_gamma": 16,
257
+ "init_weights": true,
258
+ "fourier_n_frequency": 2000,
259
+ "fourier_scaling": 300.0,
260
+ "boft_block_size": 4,
261
+ "boft_block_num": 0,
262
+ "boft_n_butterfly_factor": 1,
263
+ "boft_dropout": 0.0,
264
+ "vera_rank": 256,
265
+ "vera_projection_prng_key": 0,
266
+ "vera_dropout": 0.0,
267
+ "vera_d_initial": 0.1,
268
+ "adapter_act": "gelu",
269
+ "adapter_length": 128,
270
+ "use_galore": false,
271
+ "galore_target_modules": null,
272
+ "galore_rank": 128,
273
+ "galore_update_proj_gap": 50,
274
+ "galore_scale": 1.0,
275
+ "galore_proj_type": "std",
276
+ "galore_optim_per_parameter": false,
277
+ "galore_with_embedding": false,
278
+ "galore_quantization": false,
279
+ "galore_proj_quant": false,
280
+ "galore_proj_bits": 4,
281
+ "galore_proj_group_size": 256,
282
+ "galore_cos_threshold": 0.4,
283
+ "galore_gamma_proj": 2,
284
+ "galore_queue_size": 5,
285
+ "adalora_target_r": 8,
286
+ "adalora_init_r": 12,
287
+ "adalora_tinit": 0,
288
+ "adalora_tfinal": 0,
289
+ "adalora_deltaT": 1,
290
+ "adalora_beta1": 0.85,
291
+ "adalora_beta2": 0.85,
292
+ "adalora_orth_reg_weight": 0.5,
293
+ "llamapro_num_new_blocks": 4,
294
+ "llamapro_num_groups": null,
295
+ "lisa_activated_layers": 0,
296
+ "lisa_step_interval": 20,
297
+ "reft_layer_key": null,
298
+ "reft_layers": null,
299
+ "reft_rank": 4,
300
+ "reft_intervention_type": "LoreftIntervention",
301
+ "reft_args": null,
302
+ "swanlab_token": null,
303
+ "swanlab_project": null,
304
+ "swanlab_workspace": null,
305
+ "swanlab_exp_name": null,
306
+ "swanlab_mode": "cloud",
307
+ "add_version": true,
308
+ "resume_only_model": false,
309
+ "create_checkpoint_symlink": false,
310
+ "packing": false,
311
+ "lazy_tokenize": false,
312
+ "loss_type": null,
313
+ "optimizer": null,
314
+ "metric": null,
315
+ "zero_hpz_partition_size": null,
316
+ "rank": -1,
317
+ "global_world_size": 1,
318
+ "local_world_size": 1,
319
+ "model_suffix": "Qwen2.5-7B",
320
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/raid/zijian/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)",
321
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f2780bc5ee0>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
322
+ "model_dir": "/raid/zijian/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796",
323
+ "hub": "<class 'swift.hub.hub.HFHub'>",
324
+ "evaluation_strategy": "steps",
325
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.0, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=50, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=1, dataloader_prefetch_factor=10, past_index=-1, run_name='/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, 
eval_datasets_args=None, eval_generation_config=None, train_type='full', optimizer=None, local_repo_path=None, galore_config=None)"
326
+ }
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "max_position_embeddings": 131072,
13
+ "max_window_layers": 28,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 28,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 4,
18
+ "pad_token_id": 151643,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": 131072,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.51.3",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 152064
30
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.51.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae19c94702eeee54b765d3d132bf011a5ddfbceab365ad532f28c9827cb425fe
3
+ size 4877660776
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f06d9470b3a17d38e974a160e017bd783c08c95d8f0e842239fcda0a246e0f09
3
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e3fa594a0292195903b75e0ad28a12efe811837498692cb74a2ee94e48ad41f
3
+ size 4330865200
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d2773cc936219e062aabb0f6564fd5e76ba462a05e024b2ec929bab904f48ad
3
+ size 1089994880
model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 15231233024
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00003-of-00004.safetensors"
345
+ }
346
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c29adc3f04b16f4ac7f5b1dc3d9fcb19c78040ad671c4b2bf4a3cc4d244df933
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab20963721a28ffd13afffac9feb3fc873b9e7cb9599f6edd6f60f5fe6c39fa
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|endoftext|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
trainer_state.json ADDED
@@ -0,0 +1,3284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1485,
3
+ "best_metric": 0.3291038,
4
+ "best_model_checkpoint": "/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818/checkpoint-1485",
5
+ "epoch": 0.9994531611492029,
6
+ "eval_steps": 50,
7
+ "global_step": 1485,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0006730324317503049,
14
+ "grad_norm": 179.0,
15
+ "learning_rate": 9.999988811118231e-05,
16
+ "loss": 0.9820185899734497,
17
+ "memory(GiB)": 71.61,
18
+ "step": 1,
19
+ "token_acc": 0.8044692737430168,
20
+ "train_speed(iter/s)": 0.218302
21
+ },
22
+ {
23
+ "epoch": 0.003365162158751525,
24
+ "grad_norm": 2928.0,
25
+ "learning_rate": 9.999720280459576e-05,
26
+ "loss": 5.42672061920166,
27
+ "memory(GiB)": 73.26,
28
+ "step": 5,
29
+ "token_acc": 0.3954864964853866,
30
+ "train_speed(iter/s)": 0.397179
31
+ },
32
+ {
33
+ "epoch": 0.00673032431750305,
34
+ "grad_norm": 35.75,
35
+ "learning_rate": 9.99888115313551e-05,
36
+ "loss": 1.4307238578796386,
37
+ "memory(GiB)": 73.26,
38
+ "step": 10,
39
+ "token_acc": 0.7211833231146536,
40
+ "train_speed(iter/s)": 0.450251
41
+ },
42
+ {
43
+ "epoch": 0.010095486476254575,
44
+ "grad_norm": 7.40625,
45
+ "learning_rate": 9.997482711915927e-05,
46
+ "loss": 1.1120232582092284,
47
+ "memory(GiB)": 73.26,
48
+ "step": 15,
49
+ "token_acc": 0.7599109131403118,
50
+ "train_speed(iter/s)": 0.467479
51
+ },
52
+ {
53
+ "epoch": 0.0134606486350061,
54
+ "grad_norm": 5.9375,
55
+ "learning_rate": 9.99552511326936e-05,
56
+ "loss": 0.819661808013916,
57
+ "memory(GiB)": 73.26,
58
+ "step": 20,
59
+ "token_acc": 0.7947040995374063,
60
+ "train_speed(iter/s)": 0.482418
61
+ },
62
+ {
63
+ "epoch": 0.016825810793757626,
64
+ "grad_norm": 5.4375,
65
+ "learning_rate": 9.993008576227247e-05,
66
+ "loss": 0.9263886451721192,
67
+ "memory(GiB)": 73.26,
68
+ "step": 25,
69
+ "token_acc": 0.7752791563275434,
70
+ "train_speed(iter/s)": 0.490393
71
+ },
72
+ {
73
+ "epoch": 0.02019097295250915,
74
+ "grad_norm": 4.1875,
75
+ "learning_rate": 9.989933382359422e-05,
76
+ "loss": 0.7815323352813721,
77
+ "memory(GiB)": 73.26,
78
+ "step": 30,
79
+ "token_acc": 0.8033576869267838,
80
+ "train_speed(iter/s)": 0.499449
81
+ },
82
+ {
83
+ "epoch": 0.023556135111260673,
84
+ "grad_norm": 4.0625,
85
+ "learning_rate": 9.986299875742613e-05,
86
+ "loss": 0.713004732131958,
87
+ "memory(GiB)": 73.26,
88
+ "step": 35,
89
+ "token_acc": 0.8128159139083646,
90
+ "train_speed(iter/s)": 0.503737
91
+ },
92
+ {
93
+ "epoch": 0.0269212972700122,
94
+ "grad_norm": 9.4375,
95
+ "learning_rate": 9.982108462921937e-05,
96
+ "loss": 0.7091834068298339,
97
+ "memory(GiB)": 73.26,
98
+ "step": 40,
99
+ "token_acc": 0.82666015625,
100
+ "train_speed(iter/s)": 0.502146
101
+ },
102
+ {
103
+ "epoch": 0.030286459428763724,
104
+ "grad_norm": 6.1875,
105
+ "learning_rate": 9.977359612865423e-05,
106
+ "loss": 0.6785173892974854,
107
+ "memory(GiB)": 73.26,
108
+ "step": 45,
109
+ "token_acc": 0.8104975044276284,
110
+ "train_speed(iter/s)": 0.505359
111
+ },
112
+ {
113
+ "epoch": 0.03365162158751525,
114
+ "grad_norm": 4.65625,
115
+ "learning_rate": 9.972053856911534e-05,
116
+ "loss": 0.7147142887115479,
117
+ "memory(GiB)": 73.26,
118
+ "step": 50,
119
+ "token_acc": 0.8093519535540562,
120
+ "train_speed(iter/s)": 0.508235
121
+ },
122
+ {
123
+ "epoch": 0.03365162158751525,
124
+ "eval_loss": 0.6807255148887634,
125
+ "eval_runtime": 6.4656,
126
+ "eval_samples_per_second": 37.119,
127
+ "eval_steps_per_second": 37.119,
128
+ "eval_token_acc": 0.8191331923890064,
129
+ "step": 50
130
+ },
131
+ {
132
+ "epoch": 0.037016783746266775,
133
+ "grad_norm": 3.875,
134
+ "learning_rate": 9.966191788709716e-05,
135
+ "loss": 0.7501668453216552,
136
+ "memory(GiB)": 73.26,
137
+ "step": 55,
138
+ "token_acc": 0.81598110608148,
139
+ "train_speed(iter/s)": 0.312271
140
+ },
141
+ {
142
+ "epoch": 0.0403819459050183,
143
+ "grad_norm": 5.15625,
144
+ "learning_rate": 9.959774064153977e-05,
145
+ "loss": 0.7556098461151123,
146
+ "memory(GiB)": 73.26,
147
+ "step": 60,
148
+ "token_acc": 0.812223746380125,
149
+ "train_speed(iter/s)": 0.323476
150
+ },
151
+ {
152
+ "epoch": 0.04374710806376982,
153
+ "grad_norm": 3.28125,
154
+ "learning_rate": 9.952801401309503e-05,
155
+ "loss": 0.6282004833221435,
156
+ "memory(GiB)": 73.26,
157
+ "step": 65,
158
+ "token_acc": 0.8382288469969311,
159
+ "train_speed(iter/s)": 0.332657
160
+ },
161
+ {
162
+ "epoch": 0.047112270222521346,
163
+ "grad_norm": 3.84375,
164
+ "learning_rate": 9.945274580332316e-05,
165
+ "loss": 0.6862215042114258,
166
+ "memory(GiB)": 73.26,
167
+ "step": 70,
168
+ "token_acc": 0.8230115830115831,
169
+ "train_speed(iter/s)": 0.341959
170
+ },
171
+ {
172
+ "epoch": 0.05047743238127287,
173
+ "grad_norm": 4.96875,
174
+ "learning_rate": 9.937194443381972e-05,
175
+ "loss": 0.8654034614562989,
176
+ "memory(GiB)": 73.26,
177
+ "step": 75,
178
+ "token_acc": 0.787591859807801,
179
+ "train_speed(iter/s)": 0.351409
180
+ },
181
+ {
182
+ "epoch": 0.0538425945400244,
183
+ "grad_norm": 3.734375,
184
+ "learning_rate": 9.928561894527353e-05,
185
+ "loss": 0.7333785057067871,
186
+ "memory(GiB)": 73.26,
187
+ "step": 80,
188
+ "token_acc": 0.8105247240284289,
189
+ "train_speed(iter/s)": 0.358757
190
+ },
191
+ {
192
+ "epoch": 0.057207756698775925,
193
+ "grad_norm": 10.6875,
194
+ "learning_rate": 9.919377899645497e-05,
195
+ "loss": 0.6422113418579102,
196
+ "memory(GiB)": 73.26,
197
+ "step": 85,
198
+ "token_acc": 0.8280427771176371,
199
+ "train_speed(iter/s)": 0.366331
200
+ },
201
+ {
202
+ "epoch": 0.06057291885752745,
203
+ "grad_norm": 3.265625,
204
+ "learning_rate": 9.909643486313533e-05,
205
+ "loss": 0.7867285251617432,
206
+ "memory(GiB)": 73.26,
207
+ "step": 90,
208
+ "token_acc": 0.8057151496824917,
209
+ "train_speed(iter/s)": 0.373092
210
+ },
211
+ {
212
+ "epoch": 0.06393808101627897,
213
+ "grad_norm": 3.0625,
214
+ "learning_rate": 9.899359743693714e-05,
215
+ "loss": 0.675608491897583,
216
+ "memory(GiB)": 73.26,
217
+ "step": 95,
218
+ "token_acc": 0.8164676304211188,
219
+ "train_speed(iter/s)": 0.379215
220
+ },
221
+ {
222
+ "epoch": 0.0673032431750305,
223
+ "grad_norm": 3.390625,
224
+ "learning_rate": 9.888527822411543e-05,
225
+ "loss": 0.7589282989501953,
226
+ "memory(GiB)": 73.26,
227
+ "step": 100,
228
+ "token_acc": 0.810183048761729,
229
+ "train_speed(iter/s)": 0.385325
230
+ },
231
+ {
232
+ "epoch": 0.0673032431750305,
233
+ "eval_loss": 0.6772852540016174,
234
+ "eval_runtime": 6.3754,
235
+ "eval_samples_per_second": 37.645,
236
+ "eval_steps_per_second": 37.645,
237
+ "eval_token_acc": 0.8261627906976744,
238
+ "step": 100
239
+ },
240
+ {
241
+ "epoch": 0.07066840533378202,
242
+ "grad_norm": 3.859375,
243
+ "learning_rate": 9.877148934427037e-05,
244
+ "loss": 0.6617954730987549,
245
+ "memory(GiB)": 73.26,
246
+ "step": 105,
247
+ "token_acc": 0.8273892727345341,
248
+ "train_speed(iter/s)": 0.311491
249
+ },
250
+ {
251
+ "epoch": 0.07403356749253355,
252
+ "grad_norm": 3.15625,
253
+ "learning_rate": 9.865224352899119e-05,
254
+ "loss": 0.7310012340545654,
255
+ "memory(GiB)": 73.26,
256
+ "step": 110,
257
+ "token_acc": 0.8089593596059114,
258
+ "train_speed(iter/s)": 0.317239
259
+ },
260
+ {
261
+ "epoch": 0.07739872965128507,
262
+ "grad_norm": 3.828125,
263
+ "learning_rate": 9.85275541204318e-05,
264
+ "loss": 0.703582763671875,
265
+ "memory(GiB)": 73.26,
266
+ "step": 115,
267
+ "token_acc": 0.8229212819376753,
268
+ "train_speed(iter/s)": 0.32253
269
+ },
270
+ {
271
+ "epoch": 0.0807638918100366,
272
+ "grad_norm": 4.1875,
273
+ "learning_rate": 9.839743506981782e-05,
274
+ "loss": 0.7135389804840088,
275
+ "memory(GiB)": 73.26,
276
+ "step": 120,
277
+ "token_acc": 0.8188603416104493,
278
+ "train_speed(iter/s)": 0.327653
279
+ },
280
+ {
281
+ "epoch": 0.08412905396878811,
282
+ "grad_norm": 3.5,
283
+ "learning_rate": 9.826190093588563e-05,
284
+ "loss": 0.6105506420135498,
285
+ "memory(GiB)": 73.26,
286
+ "step": 125,
287
+ "token_acc": 0.8411754713776117,
288
+ "train_speed(iter/s)": 0.332914
289
+ },
290
+ {
291
+ "epoch": 0.08749421612753965,
292
+ "grad_norm": 3.0625,
293
+ "learning_rate": 9.812096688325354e-05,
294
+ "loss": 0.7001046657562255,
295
+ "memory(GiB)": 73.26,
296
+ "step": 130,
297
+ "token_acc": 0.8249736406085254,
298
+ "train_speed(iter/s)": 0.337969
299
+ },
300
+ {
301
+ "epoch": 0.09085937828629118,
302
+ "grad_norm": 2.9375,
303
+ "learning_rate": 9.797464868072488e-05,
304
+ "loss": 0.6970377922058105,
305
+ "memory(GiB)": 73.26,
306
+ "step": 135,
307
+ "token_acc": 0.8269230769230769,
308
+ "train_speed(iter/s)": 0.342842
309
+ },
310
+ {
311
+ "epoch": 0.09422454044504269,
312
+ "grad_norm": 3.59375,
313
+ "learning_rate": 9.78229626995238e-05,
314
+ "loss": 0.6484662055969238,
315
+ "memory(GiB)": 73.26,
316
+ "step": 140,
317
+ "token_acc": 0.8294907944932824,
318
+ "train_speed(iter/s)": 0.347497
319
+ },
320
+ {
321
+ "epoch": 0.09758970260379422,
322
+ "grad_norm": 2.9375,
323
+ "learning_rate": 9.766592591146352e-05,
324
+ "loss": 0.6710952281951904,
325
+ "memory(GiB)": 73.26,
326
+ "step": 145,
327
+ "token_acc": 0.8295923041685753,
328
+ "train_speed(iter/s)": 0.352009
329
+ },
330
+ {
331
+ "epoch": 0.10095486476254574,
332
+ "grad_norm": 3.140625,
333
+ "learning_rate": 9.750355588704727e-05,
334
+ "loss": 0.6715181350708008,
335
+ "memory(GiB)": 73.26,
336
+ "step": 150,
337
+ "token_acc": 0.8266999559406668,
338
+ "train_speed(iter/s)": 0.355725
339
+ },
340
+ {
341
+ "epoch": 0.10095486476254574,
342
+ "eval_loss": 0.65446937084198,
343
+ "eval_runtime": 6.2927,
344
+ "eval_samples_per_second": 38.139,
345
+ "eval_steps_per_second": 38.139,
346
+ "eval_token_acc": 0.8305496828752643,
347
+ "step": 150
348
+ },
349
+ {
350
+ "epoch": 0.10432002692129727,
351
+ "grad_norm": 2.859375,
352
+ "learning_rate": 9.733587079350252e-05,
353
+ "loss": 0.6584550857543945,
354
+ "memory(GiB)": 73.26,
355
+ "step": 155,
356
+ "token_acc": 0.8323802163833076,
357
+ "train_speed(iter/s)": 0.310504
358
+ },
359
+ {
360
+ "epoch": 0.1076851890800488,
361
+ "grad_norm": 3.109375,
362
+ "learning_rate": 9.716288939274819e-05,
363
+ "loss": 0.7138989925384521,
364
+ "memory(GiB)": 73.26,
365
+ "step": 160,
366
+ "token_acc": 0.8222054380664653,
367
+ "train_speed(iter/s)": 0.314403
368
+ },
369
+ {
370
+ "epoch": 0.11105035123880032,
371
+ "grad_norm": 22.25,
372
+ "learning_rate": 9.698463103929542e-05,
373
+ "loss": 0.6830341339111328,
374
+ "memory(GiB)": 73.26,
375
+ "step": 165,
376
+ "token_acc": 0.8247949233864726,
377
+ "train_speed(iter/s)": 0.318566
378
+ },
379
+ {
380
+ "epoch": 0.11441551339755185,
381
+ "grad_norm": 2.859375,
382
+ "learning_rate": 9.680111567808213e-05,
383
+ "loss": 0.6824192047119141,
384
+ "memory(GiB)": 73.26,
385
+ "step": 170,
386
+ "token_acc": 0.836785661818716,
387
+ "train_speed(iter/s)": 0.3222
388
+ },
389
+ {
390
+ "epoch": 0.11778067555630337,
391
+ "grad_norm": 2.734375,
392
+ "learning_rate": 9.661236384224129e-05,
393
+ "loss": 0.6676050186157226,
394
+ "memory(GiB)": 73.26,
395
+ "step": 175,
396
+ "token_acc": 0.8297106664747373,
397
+ "train_speed(iter/s)": 0.325905
398
+ },
399
+ {
400
+ "epoch": 0.1211458377150549,
401
+ "grad_norm": 3.5,
402
+ "learning_rate": 9.641839665080363e-05,
403
+ "loss": 0.6630958557128906,
404
+ "memory(GiB)": 73.26,
405
+ "step": 180,
406
+ "token_acc": 0.8245426829268293,
407
+ "train_speed(iter/s)": 0.329211
408
+ },
409
+ {
410
+ "epoch": 0.12451099987380641,
411
+ "grad_norm": 3.140625,
412
+ "learning_rate": 9.62192358063346e-05,
413
+ "loss": 0.7024449348449707,
414
+ "memory(GiB)": 73.26,
415
+ "step": 185,
416
+ "token_acc": 0.8313099041533546,
417
+ "train_speed(iter/s)": 0.332825
418
+ },
419
+ {
420
+ "epoch": 0.12787616203255794,
421
+ "grad_norm": 3.953125,
422
+ "learning_rate": 9.601490359250615e-05,
423
+ "loss": 0.6541357517242432,
424
+ "memory(GiB)": 73.26,
425
+ "step": 190,
426
+ "token_acc": 0.829295154185022,
427
+ "train_speed(iter/s)": 0.336139
428
+ },
429
+ {
430
+ "epoch": 0.13124132419130946,
431
+ "grad_norm": 3.515625,
432
+ "learning_rate": 9.580542287160348e-05,
433
+ "loss": 0.6423999786376953,
434
+ "memory(GiB)": 73.26,
435
+ "step": 195,
436
+ "token_acc": 0.8379697413372377,
437
+ "train_speed(iter/s)": 0.339594
438
+ },
439
+ {
440
+ "epoch": 0.134606486350061,
441
+ "grad_norm": 3.515625,
442
+ "learning_rate": 9.559081708196696e-05,
443
+ "loss": 0.7582132339477539,
444
+ "memory(GiB)": 73.26,
445
+ "step": 200,
446
+ "token_acc": 0.8089741740008657,
447
+ "train_speed(iter/s)": 0.342609
448
+ },
449
+ {
450
+ "epoch": 0.134606486350061,
451
+ "eval_loss": 0.6314957737922668,
452
+ "eval_runtime": 6.3575,
453
+ "eval_samples_per_second": 37.75,
454
+ "eval_steps_per_second": 37.75,
455
+ "eval_token_acc": 0.8350422832980973,
456
+ "step": 200
457
+ },
458
+ {
459
+ "epoch": 0.13797164850881252,
460
+ "grad_norm": 3.96875,
461
+ "learning_rate": 9.537111023536973e-05,
462
+ "loss": 0.6429227352142334,
463
+ "memory(GiB)": 73.26,
464
+ "step": 205,
465
+ "token_acc": 0.8360507956416086,
466
+ "train_speed(iter/s)": 0.306958
467
+ },
468
+ {
469
+ "epoch": 0.14133681066756404,
470
+ "grad_norm": 3.109375,
471
+ "learning_rate": 9.514632691433107e-05,
472
+ "loss": 0.6970784187316894,
473
+ "memory(GiB)": 73.26,
474
+ "step": 210,
475
+ "token_acc": 0.8279309788743751,
476
+ "train_speed(iter/s)": 0.310052
477
+ },
478
+ {
479
+ "epoch": 0.14470197282631556,
480
+ "grad_norm": 2.84375,
481
+ "learning_rate": 9.491649226936585e-05,
482
+ "loss": 0.7154839515686036,
483
+ "memory(GiB)": 73.26,
484
+ "step": 215,
485
+ "token_acc": 0.8265466495213601,
486
+ "train_speed(iter/s)": 0.313148
487
+ },
488
+ {
489
+ "epoch": 0.1480671349850671,
490
+ "grad_norm": 3.5,
491
+ "learning_rate": 9.468163201617062e-05,
492
+ "loss": 0.6052781105041504,
493
+ "memory(GiB)": 73.26,
494
+ "step": 220,
495
+ "token_acc": 0.8488794669897032,
496
+ "train_speed(iter/s)": 0.315973
497
+ },
498
+ {
499
+ "epoch": 0.15143229714381862,
500
+ "grad_norm": 2.828125,
501
+ "learning_rate": 9.444177243274618e-05,
502
+ "loss": 0.5735151290893554,
503
+ "memory(GiB)": 73.26,
504
+ "step": 225,
505
+ "token_acc": 0.8521346213773762,
506
+ "train_speed(iter/s)": 0.318863
507
+ },
508
+ {
509
+ "epoch": 0.15479745930257013,
510
+ "grad_norm": 3.140625,
511
+ "learning_rate": 9.419694035645751e-05,
512
+ "loss": 0.6527684211730957,
513
+ "memory(GiB)": 73.26,
514
+ "step": 230,
515
+ "token_acc": 0.8357325655790148,
516
+ "train_speed(iter/s)": 0.32165
517
+ },
518
+ {
519
+ "epoch": 0.15816262146132168,
520
+ "grad_norm": 3.015625,
521
+ "learning_rate": 9.394716318103098e-05,
522
+ "loss": 0.672496223449707,
523
+ "memory(GiB)": 73.26,
524
+ "step": 235,
525
+ "token_acc": 0.8255225893459204,
526
+ "train_speed(iter/s)": 0.324423
527
+ },
528
+ {
529
+ "epoch": 0.1615277836200732,
530
+ "grad_norm": 2.5,
531
+ "learning_rate": 9.369246885348926e-05,
532
+ "loss": 0.5730775356292724,
533
+ "memory(GiB)": 73.26,
534
+ "step": 240,
535
+ "token_acc": 0.8565380231232699,
536
+ "train_speed(iter/s)": 0.327159
537
+ },
538
+ {
539
+ "epoch": 0.1648929457788247,
540
+ "grad_norm": 2.859375,
541
+ "learning_rate": 9.343288587102443e-05,
542
+ "loss": 0.6417149543762207,
543
+ "memory(GiB)": 73.26,
544
+ "step": 245,
545
+ "token_acc": 0.8340062808434275,
546
+ "train_speed(iter/s)": 0.329689
547
+ },
548
+ {
549
+ "epoch": 0.16825810793757623,
550
+ "grad_norm": 2.671875,
551
+ "learning_rate": 9.316844327780955e-05,
552
+ "loss": 0.6126539707183838,
553
+ "memory(GiB)": 73.26,
554
+ "step": 250,
555
+ "token_acc": 0.8416943761746422,
556
+ "train_speed(iter/s)": 0.332312
557
+ },
558
+ {
559
+ "epoch": 0.16825810793757623,
560
+ "eval_loss": 0.6005221009254456,
561
+ "eval_runtime": 6.383,
562
+ "eval_samples_per_second": 37.6,
563
+ "eval_steps_per_second": 37.6,
564
+ "eval_token_acc": 0.8414904862579281,
565
+ "step": 250
566
+ },
567
+ {
568
+ "epoch": 0.17162327009632777,
569
+ "grad_norm": 3.640625,
570
+ "learning_rate": 9.289917066174886e-05,
571
+ "loss": 0.5673539161682128,
572
+ "memory(GiB)": 73.78,
573
+ "step": 255,
574
+ "token_acc": 0.8447506770750358,
575
+ "train_speed(iter/s)": 0.305561
576
+ },
577
+ {
578
+ "epoch": 0.1749884322550793,
579
+ "grad_norm": 2.9375,
580
+ "learning_rate": 9.262509815116732e-05,
581
+ "loss": 0.696702241897583,
582
+ "memory(GiB)": 73.78,
583
+ "step": 260,
584
+ "token_acc": 0.8255023183925811,
585
+ "train_speed(iter/s)": 0.308253
586
+ },
587
+ {
588
+ "epoch": 0.1783535944138308,
589
+ "grad_norm": 3.34375,
590
+ "learning_rate": 9.23462564114396e-05,
591
+ "loss": 0.6081646919250489,
592
+ "memory(GiB)": 73.78,
593
+ "step": 265,
594
+ "token_acc": 0.83946592144077,
595
+ "train_speed(iter/s)": 0.310782
596
+ },
597
+ {
598
+ "epoch": 0.18171875657258235,
599
+ "grad_norm": 3.09375,
600
+ "learning_rate": 9.206267664155907e-05,
601
+ "loss": 0.6581857681274415,
602
+ "memory(GiB)": 73.78,
603
+ "step": 270,
604
+ "token_acc": 0.8402509652509652,
605
+ "train_speed(iter/s)": 0.31337
606
+ },
607
+ {
608
+ "epoch": 0.18508391873133387,
609
+ "grad_norm": 2.625,
610
+ "learning_rate": 9.177439057064683e-05,
611
+ "loss": 0.5923350334167481,
612
+ "memory(GiB)": 74.6,
613
+ "step": 275,
614
+ "token_acc": 0.8534911648653285,
615
+ "train_speed(iter/s)": 0.315773
616
+ },
617
+ {
618
+ "epoch": 0.18844908089008539,
619
+ "grad_norm": 3.640625,
620
+ "learning_rate": 9.14814304544018e-05,
621
+ "loss": 0.636703634262085,
622
+ "memory(GiB)": 74.6,
623
+ "step": 280,
624
+ "token_acc": 0.8385491895361901,
625
+ "train_speed(iter/s)": 0.318259
626
+ },
627
+ {
628
+ "epoch": 0.1918142430488369,
629
+ "grad_norm": 3.109375,
630
+ "learning_rate": 9.118382907149165e-05,
631
+ "loss": 0.6206272125244141,
632
+ "memory(GiB)": 74.6,
633
+ "step": 285,
634
+ "token_acc": 0.8444787644787645,
635
+ "train_speed(iter/s)": 0.320478
636
+ },
637
+ {
638
+ "epoch": 0.19517940520758845,
639
+ "grad_norm": 2.546875,
640
+ "learning_rate": 9.088161971988516e-05,
641
+ "loss": 0.6622869491577148,
642
+ "memory(GiB)": 74.6,
643
+ "step": 290,
644
+ "token_acc": 0.8280620155038759,
645
+ "train_speed(iter/s)": 0.322612
646
+ },
647
+ {
648
+ "epoch": 0.19854456736633996,
649
+ "grad_norm": 2.546875,
650
+ "learning_rate": 9.057483621312671e-05,
651
+ "loss": 0.5659195899963378,
652
+ "memory(GiB)": 74.6,
653
+ "step": 295,
654
+ "token_acc": 0.8537543198240654,
655
+ "train_speed(iter/s)": 0.324748
656
+ },
657
+ {
658
+ "epoch": 0.20190972952509148,
659
+ "grad_norm": 2.78125,
660
+ "learning_rate": 9.026351287655294e-05,
661
+ "loss": 0.576479721069336,
662
+ "memory(GiB)": 74.6,
663
+ "step": 300,
664
+ "token_acc": 0.8472282845918813,
665
+ "train_speed(iter/s)": 0.326837
666
+ },
667
+ {
668
+ "epoch": 0.20190972952509148,
669
+ "eval_loss": 0.6099406480789185,
670
+ "eval_runtime": 6.5229,
671
+ "eval_samples_per_second": 36.793,
672
+ "eval_steps_per_second": 36.793,
673
+ "eval_token_acc": 0.8406448202959831,
674
+ "step": 300
675
+ },
676
+ {
677
+ "epoch": 0.20527489168384302,
678
+ "grad_norm": 2.828125,
679
+ "learning_rate": 8.994768454345206e-05,
680
+ "loss": 0.6150260448455811,
681
+ "memory(GiB)": 74.6,
682
+ "step": 305,
683
+ "token_acc": 0.8427843137254902,
684
+ "train_speed(iter/s)": 0.30445
685
+ },
686
+ {
687
+ "epoch": 0.20864005384259454,
688
+ "grad_norm": 3.140625,
689
+ "learning_rate": 8.962738655116658e-05,
690
+ "loss": 0.6955391883850097,
691
+ "memory(GiB)": 74.6,
692
+ "step": 310,
693
+ "token_acc": 0.8351194121249235,
694
+ "train_speed(iter/s)": 0.306547
695
+ },
696
+ {
697
+ "epoch": 0.21200521600134606,
698
+ "grad_norm": 3.375,
699
+ "learning_rate": 8.930265473713938e-05,
700
+ "loss": 0.5725995063781738,
701
+ "memory(GiB)": 74.6,
702
+ "step": 315,
703
+ "token_acc": 0.854539641943734,
704
+ "train_speed(iter/s)": 0.308729
705
+ },
706
+ {
707
+ "epoch": 0.2153703781600976,
708
+ "grad_norm": 2.78125,
709
+ "learning_rate": 8.897352543490395e-05,
710
+ "loss": 0.5337778568267822,
711
+ "memory(GiB)": 74.6,
712
+ "step": 320,
713
+ "token_acc": 0.856396866840731,
714
+ "train_speed(iter/s)": 0.310883
715
+ },
716
+ {
717
+ "epoch": 0.21873554031884912,
718
+ "grad_norm": 3.0625,
719
+ "learning_rate": 8.864003547001915e-05,
720
+ "loss": 0.6348609447479248,
721
+ "memory(GiB)": 75.82,
722
+ "step": 325,
723
+ "token_acc": 0.8423275457531675,
724
+ "train_speed(iter/s)": 0.312767
725
+ },
726
+ {
727
+ "epoch": 0.22210070247760064,
728
+ "grad_norm": 3.09375,
729
+ "learning_rate": 8.83022221559489e-05,
730
+ "loss": 0.6023256778717041,
731
+ "memory(GiB)": 75.82,
732
+ "step": 330,
733
+ "token_acc": 0.8492975734355045,
734
+ "train_speed(iter/s)": 0.314836
735
+ },
736
+ {
737
+ "epoch": 0.22546586463635215,
738
+ "grad_norm": 3.171875,
739
+ "learning_rate": 8.796012328988716e-05,
740
+ "loss": 0.7502017498016358,
741
+ "memory(GiB)": 75.82,
742
+ "step": 335,
743
+ "token_acc": 0.8177310293012773,
744
+ "train_speed(iter/s)": 0.316755
745
+ },
746
+ {
747
+ "epoch": 0.2288310267951037,
748
+ "grad_norm": 3.125,
749
+ "learning_rate": 8.761377714852899e-05,
750
+ "loss": 0.5663125038146972,
751
+ "memory(GiB)": 75.82,
752
+ "step": 340,
753
+ "token_acc": 0.8523967726625534,
754
+ "train_speed(iter/s)": 0.318692
755
+ },
756
+ {
757
+ "epoch": 0.23219618895385521,
758
+ "grad_norm": 2.6875,
759
+ "learning_rate": 8.726322248378775e-05,
760
+ "loss": 0.6224043846130372,
761
+ "memory(GiB)": 75.82,
762
+ "step": 345,
763
+ "token_acc": 0.8415334471519479,
764
+ "train_speed(iter/s)": 0.32067
765
+ },
766
+ {
767
+ "epoch": 0.23556135111260673,
768
+ "grad_norm": 3.0625,
769
+ "learning_rate": 8.690849851845933e-05,
770
+ "loss": 0.6304502010345459,
771
+ "memory(GiB)": 75.82,
772
+ "step": 350,
773
+ "token_acc": 0.8429706005294691,
774
+ "train_speed(iter/s)": 0.322382
775
+ },
776
+ {
777
+ "epoch": 0.23556135111260673,
778
+ "eval_loss": 0.584740936756134,
779
+ "eval_runtime": 6.3513,
780
+ "eval_samples_per_second": 37.787,
781
+ "eval_steps_per_second": 37.787,
782
+ "eval_token_acc": 0.8461945031712473,
783
+ "step": 350
784
+ },
785
+ {
786
+ "epoch": 0.23892651327135828,
787
+ "grad_norm": 3.65625,
788
+ "learning_rate": 8.654964494183358e-05,
789
+ "loss": 0.6737657070159913,
790
+ "memory(GiB)": 75.82,
791
+ "step": 355,
792
+ "token_acc": 0.8434381584701626,
793
+ "train_speed(iter/s)": 0.303665
794
+ },
795
+ {
796
+ "epoch": 0.2422916754301098,
797
+ "grad_norm": 2.765625,
798
+ "learning_rate": 8.618670190525352e-05,
799
+ "loss": 0.6179668426513671,
800
+ "memory(GiB)": 75.82,
801
+ "step": 360,
802
+ "token_acc": 0.8392533779077866,
803
+ "train_speed(iter/s)": 0.305585
804
+ },
805
+ {
806
+ "epoch": 0.2456568375888613,
807
+ "grad_norm": 3.15625,
808
+ "learning_rate": 8.581971001762286e-05,
809
+ "loss": 0.626660680770874,
810
+ "memory(GiB)": 75.82,
811
+ "step": 365,
812
+ "token_acc": 0.8430995837335895,
813
+ "train_speed(iter/s)": 0.307436
814
+ },
815
+ {
816
+ "epoch": 0.24902199974761283,
817
+ "grad_norm": 3.265625,
818
+ "learning_rate": 8.54487103408625e-05,
819
+ "loss": 0.546476411819458,
820
+ "memory(GiB)": 75.82,
821
+ "step": 370,
822
+ "token_acc": 0.856482219741668,
823
+ "train_speed(iter/s)": 0.309304
824
+ },
825
+ {
826
+ "epoch": 0.25238716190636434,
827
+ "grad_norm": 3.09375,
828
+ "learning_rate": 8.507374438531607e-05,
829
+ "loss": 0.6368942260742188,
830
+ "memory(GiB)": 75.82,
831
+ "step": 375,
832
+ "token_acc": 0.8362845604224914,
833
+ "train_speed(iter/s)": 0.311038
834
+ },
835
+ {
836
+ "epoch": 0.2557523240651159,
837
+ "grad_norm": 2.921875,
838
+ "learning_rate": 8.469485410510545e-05,
839
+ "loss": 0.6205560684204101,
840
+ "memory(GiB)": 75.82,
841
+ "step": 380,
842
+ "token_acc": 0.8416890480453596,
843
+ "train_speed(iter/s)": 0.312832
844
+ },
845
+ {
846
+ "epoch": 0.25911748622386743,
847
+ "grad_norm": 3.1875,
848
+ "learning_rate": 8.43120818934367e-05,
849
+ "loss": 0.5687759399414063,
850
+ "memory(GiB)": 75.82,
851
+ "step": 385,
852
+ "token_acc": 0.8516474854169951,
853
+ "train_speed(iter/s)": 0.314546
854
+ },
855
+ {
856
+ "epoch": 0.2624826483826189,
857
+ "grad_norm": 3.375,
858
+ "learning_rate": 8.392547057785661e-05,
859
+ "loss": 0.6561696529388428,
860
+ "memory(GiB)": 75.82,
861
+ "step": 390,
862
+ "token_acc": 0.8435968137254902,
863
+ "train_speed(iter/s)": 0.316291
864
+ },
865
+ {
866
+ "epoch": 0.26584781054137047,
867
+ "grad_norm": 2.84375,
868
+ "learning_rate": 8.353506341546104e-05,
869
+ "loss": 0.6340418815612793,
870
+ "memory(GiB)": 75.82,
871
+ "step": 395,
872
+ "token_acc": 0.8422232182877634,
873
+ "train_speed(iter/s)": 0.317983
874
+ },
875
+ {
876
+ "epoch": 0.269212972700122,
877
+ "grad_norm": 2.921875,
878
+ "learning_rate": 8.314090408805482e-05,
879
+ "loss": 0.5887197017669678,
880
+ "memory(GiB)": 75.82,
881
+ "step": 400,
882
+ "token_acc": 0.8538324420677362,
883
+ "train_speed(iter/s)": 0.319589
884
+ },
885
+ {
886
+ "epoch": 0.269212972700122,
887
+ "eval_loss": 0.5566386580467224,
888
+ "eval_runtime": 6.2654,
889
+ "eval_samples_per_second": 38.306,
890
+ "eval_steps_per_second": 38.306,
891
+ "eval_token_acc": 0.8535412262156448,
892
+ "step": 400
893
+ },
894
+ {
895
+ "epoch": 0.2725781348588735,
896
+ "grad_norm": 3.828125,
897
+ "learning_rate": 8.274303669726426e-05,
898
+ "loss": 0.5180852890014649,
899
+ "memory(GiB)": 75.82,
900
+ "step": 405,
901
+ "token_acc": 0.8568248957953948,
902
+ "train_speed(iter/s)": 0.303898
903
+ },
904
+ {
905
+ "epoch": 0.27594329701762504,
906
+ "grad_norm": 3.171875,
907
+ "learning_rate": 8.234150575960288e-05,
908
+ "loss": 0.6065554618835449,
909
+ "memory(GiB)": 75.82,
910
+ "step": 410,
911
+ "token_acc": 0.8468217054263566,
912
+ "train_speed(iter/s)": 0.305556
913
+ },
914
+ {
915
+ "epoch": 0.2793084591763766,
916
+ "grad_norm": 2.078125,
917
+ "learning_rate": 8.19363562014904e-05,
918
+ "loss": 0.563250207901001,
919
+ "memory(GiB)": 75.82,
920
+ "step": 415,
921
+ "token_acc": 0.8505627962085308,
922
+ "train_speed(iter/s)": 0.307215
923
+ },
924
+ {
925
+ "epoch": 0.2826736213351281,
926
+ "grad_norm": 3.203125,
927
+ "learning_rate": 8.152763335422613e-05,
928
+ "loss": 0.5627524375915527,
929
+ "memory(GiB)": 75.82,
930
+ "step": 420,
931
+ "token_acc": 0.8593019632284201,
932
+ "train_speed(iter/s)": 0.308773
933
+ },
934
+ {
935
+ "epoch": 0.2860387834938796,
936
+ "grad_norm": 2.390625,
937
+ "learning_rate": 8.111538294891684e-05,
938
+ "loss": 0.5277237892150879,
939
+ "memory(GiB)": 75.82,
940
+ "step": 425,
941
+ "token_acc": 0.8635786802030457,
942
+ "train_speed(iter/s)": 0.310337
943
+ },
944
+ {
945
+ "epoch": 0.2894039456526311,
946
+ "grad_norm": 2.796875,
947
+ "learning_rate": 8.06996511113601e-05,
948
+ "loss": 0.610354232788086,
949
+ "memory(GiB)": 75.82,
950
+ "step": 430,
951
+ "token_acc": 0.842520795150148,
952
+ "train_speed(iter/s)": 0.311764
953
+ },
954
+ {
955
+ "epoch": 0.29276910781138266,
956
+ "grad_norm": 4.3125,
957
+ "learning_rate": 8.028048435688333e-05,
958
+ "loss": 0.5220999717712402,
959
+ "memory(GiB)": 75.82,
960
+ "step": 435,
961
+ "token_acc": 0.866747609652451,
962
+ "train_speed(iter/s)": 0.31318
963
+ },
964
+ {
965
+ "epoch": 0.2961342699701342,
966
+ "grad_norm": 2.890625,
967
+ "learning_rate": 7.985792958513931e-05,
968
+ "loss": 0.6387944698333741,
969
+ "memory(GiB)": 75.82,
970
+ "step": 440,
971
+ "token_acc": 0.8430452550325412,
972
+ "train_speed(iter/s)": 0.314765
973
+ },
974
+ {
975
+ "epoch": 0.2994994321288857,
976
+ "grad_norm": 2.265625,
977
+ "learning_rate": 7.943203407485864e-05,
978
+ "loss": 0.44078569412231444,
979
+ "memory(GiB)": 75.82,
980
+ "step": 445,
981
+ "token_acc": 0.8829911533967618,
982
+ "train_speed(iter/s)": 0.316117
983
+ },
984
+ {
985
+ "epoch": 0.30286459428763723,
986
+ "grad_norm": 2.34375,
987
+ "learning_rate": 7.900284547855991e-05,
988
+ "loss": 0.5065964698791504,
989
+ "memory(GiB)": 75.82,
990
+ "step": 450,
991
+ "token_acc": 0.8605196982397317,
992
+ "train_speed(iter/s)": 0.317607
993
+ },
994
+ {
995
+ "epoch": 0.30286459428763723,
996
+ "eval_loss": 0.5302485823631287,
997
+ "eval_runtime": 6.3503,
998
+ "eval_samples_per_second": 37.794,
999
+ "eval_steps_per_second": 37.794,
1000
+ "eval_token_acc": 0.8599894291754757,
1001
+ "step": 450
1002
+ },
1003
+ {
1004
+ "epoch": 0.3062297564463888,
1005
+ "grad_norm": 2.0625,
1006
+ "learning_rate": 7.857041181721787e-05,
1007
+ "loss": 0.4503211975097656,
1008
+ "memory(GiB)": 75.82,
1009
+ "step": 455,
1010
+ "token_acc": 0.8636938646426312,
1011
+ "train_speed(iter/s)": 0.303058
1012
+ },
1013
+ {
1014
+ "epoch": 0.30959491860514027,
1015
+ "grad_norm": 3.359375,
1016
+ "learning_rate": 7.813478147489052e-05,
1017
+ "loss": 0.5654148578643798,
1018
+ "memory(GiB)": 75.82,
1019
+ "step": 460,
1020
+ "token_acc": 0.8542982030111704,
1021
+ "train_speed(iter/s)": 0.304484
1022
+ },
1023
+ {
1024
+ "epoch": 0.3129600807638918,
1025
+ "grad_norm": 2.546875,
1026
+ "learning_rate": 7.769600319330552e-05,
1027
+ "loss": 0.47755861282348633,
1028
+ "memory(GiB)": 75.82,
1029
+ "step": 465,
1030
+ "token_acc": 0.8762720077531901,
1031
+ "train_speed(iter/s)": 0.305941
1032
+ },
1033
+ {
1034
+ "epoch": 0.31632524292264336,
1035
+ "grad_norm": 2.609375,
1036
+ "learning_rate": 7.725412606640658e-05,
1037
+ "loss": 0.5518892288208008,
1038
+ "memory(GiB)": 75.82,
1039
+ "step": 470,
1040
+ "token_acc": 0.8566623959000641,
1041
+ "train_speed(iter/s)": 0.307401
1042
+ },
1043
+ {
1044
+ "epoch": 0.31969040508139485,
1045
+ "grad_norm": 3.265625,
1046
+ "learning_rate": 7.680919953486048e-05,
1047
+ "loss": 0.5913249492645264,
1048
+ "memory(GiB)": 75.82,
1049
+ "step": 475,
1050
+ "token_acc": 0.8497417957687823,
1051
+ "train_speed(iter/s)": 0.308848
1052
+ },
1053
+ {
1054
+ "epoch": 0.3230555672401464,
1055
+ "grad_norm": 2.65625,
1056
+ "learning_rate": 7.636127338052512e-05,
1057
+ "loss": 0.5384829044342041,
1058
+ "memory(GiB)": 75.82,
1059
+ "step": 480,
1060
+ "token_acc": 0.8643092105263158,
1061
+ "train_speed(iter/s)": 0.31026
1062
+ },
1063
+ {
1064
+ "epoch": 0.32642072939889794,
1065
+ "grad_norm": 3.265625,
1066
+ "learning_rate": 7.591039772087977e-05,
1067
+ "loss": 0.5349913120269776,
1068
+ "memory(GiB)": 75.82,
1069
+ "step": 485,
1070
+ "token_acc": 0.8635131063573366,
1071
+ "train_speed(iter/s)": 0.311583
1072
+ },
1073
+ {
1074
+ "epoch": 0.3297858915576494,
1075
+ "grad_norm": 3.046875,
1076
+ "learning_rate": 7.545662300341736e-05,
1077
+ "loss": 0.48796830177307127,
1078
+ "memory(GiB)": 75.82,
1079
+ "step": 490,
1080
+ "token_acc": 0.8717186726102031,
1081
+ "train_speed(iter/s)": 0.312855
1082
+ },
1083
+ {
1084
+ "epoch": 0.33315105371640097,
1085
+ "grad_norm": 2.421875,
1086
+ "learning_rate": 7.500000000000001e-05,
1087
+ "loss": 0.5078158378601074,
1088
+ "memory(GiB)": 75.82,
1089
+ "step": 495,
1090
+ "token_acc": 0.8613126649076517,
1091
+ "train_speed(iter/s)": 0.314225
1092
+ },
1093
+ {
1094
+ "epoch": 0.33651621587515246,
1095
+ "grad_norm": 2.328125,
1096
+ "learning_rate": 7.454057980117841e-05,
1097
+ "loss": 0.484033203125,
1098
+ "memory(GiB)": 75.82,
1099
+ "step": 500,
1100
+ "token_acc": 0.8767056530214425,
1101
+ "train_speed(iter/s)": 0.315565
1102
+ },
1103
+ {
1104
+ "epoch": 0.33651621587515246,
1105
+ "eval_loss": 0.5204777121543884,
1106
+ "eval_runtime": 6.2841,
1107
+ "eval_samples_per_second": 38.192,
1108
+ "eval_steps_per_second": 38.192,
1109
+ "eval_token_acc": 0.8624207188160676,
1110
+ "step": 500
1111
+ },
1112
+ {
1113
+ "epoch": 0.339881378033904,
1114
+ "grad_norm": 3.25,
1115
+ "learning_rate": 7.407841381047532e-05,
1116
+ "loss": 0.5047823905944824,
1117
+ "memory(GiB)": 75.82,
1118
+ "step": 505,
1119
+ "token_acc": 0.8646571869925139,
1120
+ "train_speed(iter/s)": 0.303291
1121
+ },
1122
+ {
1123
+ "epoch": 0.34324654019265555,
1124
+ "grad_norm": 2.5,
1125
+ "learning_rate": 7.361355373863414e-05,
1126
+ "loss": 0.5279562950134278,
1127
+ "memory(GiB)": 75.82,
1128
+ "step": 510,
1129
+ "token_acc": 0.8671359436867576,
1130
+ "train_speed(iter/s)": 0.304627
1131
+ },
1132
+ {
1133
+ "epoch": 0.34661170235140704,
1134
+ "grad_norm": 2.28125,
1135
+ "learning_rate": 7.314605159783314e-05,
1136
+ "loss": 0.5070261001586914,
1137
+ "memory(GiB)": 75.82,
1138
+ "step": 515,
1139
+ "token_acc": 0.8705286483064261,
1140
+ "train_speed(iter/s)": 0.305896
1141
+ },
1142
+ {
1143
+ "epoch": 0.3499768645101586,
1144
+ "grad_norm": 2.671875,
1145
+ "learning_rate": 7.267595969586589e-05,
1146
+ "loss": 0.49044408798217776,
1147
+ "memory(GiB)": 75.82,
1148
+ "step": 520,
1149
+ "token_acc": 0.8736138290932811,
1150
+ "train_speed(iter/s)": 0.307204
1151
+ },
1152
+ {
1153
+ "epoch": 0.3533420266689101,
1154
+ "grad_norm": 2.46875,
1155
+ "learning_rate": 7.220333063028872e-05,
1156
+ "loss": 0.5966588497161865,
1157
+ "memory(GiB)": 75.82,
1158
+ "step": 525,
1159
+ "token_acc": 0.8524286815728604,
1160
+ "train_speed(iter/s)": 0.30845
1161
+ },
1162
+ {
1163
+ "epoch": 0.3567071888276616,
1164
+ "grad_norm": 2.796875,
1165
+ "learning_rate": 7.172821728253562e-05,
1166
+ "loss": 0.5701375007629395,
1167
+ "memory(GiB)": 75.82,
1168
+ "step": 530,
1169
+ "token_acc": 0.8540377863233573,
1170
+ "train_speed(iter/s)": 0.309712
1171
+ },
1172
+ {
1173
+ "epoch": 0.36007235098641316,
1174
+ "grad_norm": 2.796875,
1175
+ "learning_rate": 7.12506728120015e-05,
1176
+ "loss": 0.4613838195800781,
1177
+ "memory(GiB)": 75.82,
1178
+ "step": 535,
1179
+ "token_acc": 0.8718804641551423,
1180
+ "train_speed(iter/s)": 0.310968
1181
+ },
1182
+ {
1183
+ "epoch": 0.3634375131451647,
1184
+ "grad_norm": 2.59375,
1185
+ "learning_rate": 7.077075065009433e-05,
1186
+ "loss": 0.5259300708770752,
1187
+ "memory(GiB)": 75.82,
1188
+ "step": 540,
1189
+ "token_acc": 0.8661874904419636,
1190
+ "train_speed(iter/s)": 0.312118
1191
+ },
1192
+ {
1193
+ "epoch": 0.3668026753039162,
1194
+ "grad_norm": 2.671875,
1195
+ "learning_rate": 7.02885044942567e-05,
1196
+ "loss": 0.5487593173980713,
1197
+ "memory(GiB)": 75.82,
1198
+ "step": 545,
1199
+ "token_acc": 0.8593700787401575,
1200
+ "train_speed(iter/s)": 0.313405
1201
+ },
1202
+ {
1203
+ "epoch": 0.37016783746266774,
1204
+ "grad_norm": 2.40625,
1205
+ "learning_rate": 6.980398830195785e-05,
1206
+ "loss": 0.4660326957702637,
1207
+ "memory(GiB)": 75.82,
1208
+ "step": 550,
1209
+ "token_acc": 0.8734921592279855,
1210
+ "train_speed(iter/s)": 0.31456
1211
+ },
1212
+ {
1213
+ "epoch": 0.37016783746266774,
1214
+ "eval_loss": 0.49012547731399536,
1215
+ "eval_runtime": 6.3431,
1216
+ "eval_samples_per_second": 37.837,
1217
+ "eval_steps_per_second": 37.837,
1218
+ "eval_token_acc": 0.8673890063424947,
1219
+ "step": 550
1220
+ },
1221
+ {
1222
+ "epoch": 0.3735329996214193,
1223
+ "grad_norm": 2.5625,
1224
+ "learning_rate": 6.931725628465643e-05,
1225
+ "loss": 0.5410624027252198,
1226
+ "memory(GiB)": 75.82,
1227
+ "step": 555,
1228
+ "token_acc": 0.8664497667672768,
1229
+ "train_speed(iter/s)": 0.302648
1230
+ },
1231
+ {
1232
+ "epoch": 0.37689816178017077,
1233
+ "grad_norm": 2.328125,
1234
+ "learning_rate": 6.882836290173493e-05,
1235
+ "loss": 0.5354323387145996,
1236
+ "memory(GiB)": 75.82,
1237
+ "step": 560,
1238
+ "token_acc": 0.8606255012028869,
1239
+ "train_speed(iter/s)": 0.303869
1240
+ },
1241
+ {
1242
+ "epoch": 0.3802633239389223,
1243
+ "grad_norm": 2.390625,
1244
+ "learning_rate": 6.833736285440632e-05,
1245
+ "loss": 0.4386926174163818,
1246
+ "memory(GiB)": 75.82,
1247
+ "step": 565,
1248
+ "token_acc": 0.8871541196475499,
1249
+ "train_speed(iter/s)": 0.30496
1250
+ },
1251
+ {
1252
+ "epoch": 0.3836284860976738,
1253
+ "grad_norm": 1.6796875,
1254
+ "learning_rate": 6.784431107959359e-05,
1255
+ "loss": 0.5115750789642334,
1256
+ "memory(GiB)": 75.82,
1257
+ "step": 570,
1258
+ "token_acc": 0.8649976962064199,
1259
+ "train_speed(iter/s)": 0.306182
1260
+ },
1261
+ {
1262
+ "epoch": 0.38699364825642535,
1263
+ "grad_norm": 2.4375,
1264
+ "learning_rate": 6.734926274378312e-05,
1265
+ "loss": 0.48287324905395507,
1266
+ "memory(GiB)": 75.82,
1267
+ "step": 575,
1268
+ "token_acc": 0.8724030754130542,
1269
+ "train_speed(iter/s)": 0.307381
1270
+ },
1271
+ {
1272
+ "epoch": 0.3903588104151769,
1273
+ "grad_norm": 2.703125,
1274
+ "learning_rate": 6.685227323685209e-05,
1275
+ "loss": 0.5082109451293946,
1276
+ "memory(GiB)": 75.82,
1277
+ "step": 580,
1278
+ "token_acc": 0.8686852331606217,
1279
+ "train_speed(iter/s)": 0.30846
1280
+ },
1281
+ {
1282
+ "epoch": 0.3937239725739284,
1283
+ "grad_norm": 2.296875,
1284
+ "learning_rate": 6.635339816587109e-05,
1285
+ "loss": 0.46943073272705077,
1286
+ "memory(GiB)": 75.82,
1287
+ "step": 585,
1288
+ "token_acc": 0.8746086412022542,
1289
+ "train_speed(iter/s)": 0.309547
1290
+ },
1291
+ {
1292
+ "epoch": 0.3970891347326799,
1293
+ "grad_norm": 2.265625,
1294
+ "learning_rate": 6.585269334888234e-05,
1295
+ "loss": 0.4492472171783447,
1296
+ "memory(GiB)": 75.82,
1297
+ "step": 590,
1298
+ "token_acc": 0.8817360438851243,
1299
+ "train_speed(iter/s)": 0.31066
1300
+ },
1301
+ {
1302
+ "epoch": 0.40045429689143147,
1303
+ "grad_norm": 2.703125,
1304
+ "learning_rate": 6.535021480865439e-05,
1305
+ "loss": 0.4906127452850342,
1306
+ "memory(GiB)": 75.82,
1307
+ "step": 595,
1308
+ "token_acc": 0.8699199748940845,
1309
+ "train_speed(iter/s)": 0.311715
1310
+ },
1311
+ {
1312
+ "epoch": 0.40381945905018296,
1313
+ "grad_norm": 1.9375,
1314
+ "learning_rate": 6.484601876641375e-05,
1315
+ "loss": 0.4776750564575195,
1316
+ "memory(GiB)": 75.82,
1317
+ "step": 600,
1318
+ "token_acc": 0.8830860534124629,
1319
+ "train_speed(iter/s)": 0.312795
1320
+ },
1321
+ {
1322
+ "epoch": 0.40381945905018296,
1323
+ "eval_loss": 0.4834407567977905,
1324
+ "eval_runtime": 6.359,
1325
+ "eval_samples_per_second": 37.742,
1326
+ "eval_steps_per_second": 37.742,
1327
+ "eval_token_acc": 0.8705602536997886,
1328
+ "step": 600
1329
+ },
1330
+ {
1331
+ "epoch": 0.4071846212089345,
1332
+ "grad_norm": 2.09375,
1333
+ "learning_rate": 6.434016163555452e-05,
1334
+ "loss": 0.42650303840637205,
1335
+ "memory(GiB)": 75.82,
1336
+ "step": 605,
1337
+ "token_acc": 0.873349786500568,
1338
+ "train_speed(iter/s)": 0.302036
1339
+ },
1340
+ {
1341
+ "epoch": 0.41054978336768605,
1342
+ "grad_norm": 2.375,
1343
+ "learning_rate": 6.383270001532635e-05,
1344
+ "loss": 0.47733469009399415,
1345
+ "memory(GiB)": 75.82,
1346
+ "step": 610,
1347
+ "token_acc": 0.8673865361903155,
1348
+ "train_speed(iter/s)": 0.303179
1349
+ },
1350
+ {
1351
+ "epoch": 0.41391494552643754,
1352
+ "grad_norm": 2.3125,
1353
+ "learning_rate": 6.332369068450174e-05,
1354
+ "loss": 0.4712835788726807,
1355
+ "memory(GiB)": 75.82,
1356
+ "step": 615,
1357
+ "token_acc": 0.8805620608899297,
1358
+ "train_speed(iter/s)": 0.304281
1359
+ },
1360
+ {
1361
+ "epoch": 0.4172801076851891,
1362
+ "grad_norm": 2.625,
1363
+ "learning_rate": 6.281319059502313e-05,
1364
+ "loss": 0.45713419914245607,
1365
+ "memory(GiB)": 75.82,
1366
+ "step": 620,
1367
+ "token_acc": 0.877295995182174,
1368
+ "train_speed(iter/s)": 0.305295
1369
+ },
1370
+ {
1371
+ "epoch": 0.42064526984394063,
1372
+ "grad_norm": 2.453125,
1373
+ "learning_rate": 6.230125686563068e-05,
1374
+ "loss": 0.3812277317047119,
1375
+ "memory(GiB)": 75.82,
1376
+ "step": 625,
1377
+ "token_acc": 0.8977045908183633,
1378
+ "train_speed(iter/s)": 0.306432
1379
+ },
1380
+ {
1381
+ "epoch": 0.4240104320026921,
1382
+ "grad_norm": 2.125,
1383
+ "learning_rate": 6.178794677547137e-05,
1384
+ "loss": 0.48100833892822265,
1385
+ "memory(GiB)": 75.82,
1386
+ "step": 630,
1387
+ "token_acc": 0.8763653633053665,
1388
+ "train_speed(iter/s)": 0.307582
1389
+ },
1390
+ {
1391
+ "epoch": 0.42737559416144366,
1392
+ "grad_norm": 2.8125,
1393
+ "learning_rate": 6.127331775769023e-05,
1394
+ "loss": 0.42731170654296874,
1395
+ "memory(GiB)": 75.82,
1396
+ "step": 635,
1397
+ "token_acc": 0.8863417762103238,
1398
+ "train_speed(iter/s)": 0.308599
1399
+ },
1400
+ {
1401
+ "epoch": 0.4307407563201952,
1402
+ "grad_norm": 2.09375,
1403
+ "learning_rate": 6.0757427393004195e-05,
1404
+ "loss": 0.3901322603225708,
1405
+ "memory(GiB)": 75.82,
1406
+ "step": 640,
1407
+ "token_acc": 0.8963465035543065,
1408
+ "train_speed(iter/s)": 0.309577
1409
+ },
1410
+ {
1411
+ "epoch": 0.4341059184789467,
1412
+ "grad_norm": 2.28125,
1413
+ "learning_rate": 6.024033340325954e-05,
1414
+ "loss": 0.41823792457580566,
1415
+ "memory(GiB)": 75.82,
1416
+ "step": 645,
1417
+ "token_acc": 0.8839628681177977,
1418
+ "train_speed(iter/s)": 0.310568
1419
+ },
1420
+ {
1421
+ "epoch": 0.43747108063769824,
1422
+ "grad_norm": 2.625,
1423
+ "learning_rate": 5.9722093644973546e-05,
1424
+ "loss": 0.45659918785095216,
1425
+ "memory(GiB)": 75.82,
1426
+ "step": 650,
1427
+ "token_acc": 0.8752002563281,
1428
+ "train_speed(iter/s)": 0.311578
1429
+ },
1430
+ {
1431
+ "epoch": 0.43747108063769824,
1432
+ "eval_loss": 0.4680798649787903,
1433
+ "eval_runtime": 6.359,
1434
+ "eval_samples_per_second": 37.742,
1435
+ "eval_steps_per_second": 37.742,
1436
+ "eval_token_acc": 0.8731501057082452,
1437
+ "step": 650
1438
+ },
1439
+ {
1440
+ "epoch": 0.44083624279644973,
1441
+ "grad_norm": 2.34375,
1442
+ "learning_rate": 5.920276610286102e-05,
1443
+ "loss": 0.45874710083007814,
1444
+ "memory(GiB)": 75.82,
1445
+ "step": 655,
1446
+ "token_acc": 0.874843798812871,
1447
+ "train_speed(iter/s)": 0.303426
1448
+ },
1449
+ {
1450
+ "epoch": 0.4442014049552013,
1451
+ "grad_norm": 1.890625,
1452
+ "learning_rate": 5.868240888334653e-05,
1453
+ "loss": 0.40706768035888674,
1454
+ "memory(GiB)": 75.82,
1455
+ "step": 660,
1456
+ "token_acc": 0.8987694831829368,
1457
+ "train_speed(iter/s)": 0.304386
1458
+ },
1459
+ {
1460
+ "epoch": 0.4475665671139528,
1461
+ "grad_norm": 2.3125,
1462
+ "learning_rate": 5.816108020806297e-05,
1463
+ "loss": 0.4790656566619873,
1464
+ "memory(GiB)": 75.82,
1465
+ "step": 665,
1466
+ "token_acc": 0.8695376820772641,
1467
+ "train_speed(iter/s)": 0.30531
1468
+ },
1469
+ {
1470
+ "epoch": 0.4509317292727043,
1471
+ "grad_norm": 2.625,
1472
+ "learning_rate": 5.763883840733736e-05,
1473
+ "loss": 0.4840695858001709,
1474
+ "memory(GiB)": 75.82,
1475
+ "step": 670,
1476
+ "token_acc": 0.8794635643884311,
1477
+ "train_speed(iter/s)": 0.306292
1478
+ },
1479
+ {
1480
+ "epoch": 0.45429689143145585,
1481
+ "grad_norm": 2.703125,
1482
+ "learning_rate": 5.7115741913664264e-05,
1483
+ "loss": 0.4588040351867676,
1484
+ "memory(GiB)": 75.82,
1485
+ "step": 675,
1486
+ "token_acc": 0.8740804106073568,
1487
+ "train_speed(iter/s)": 0.307251
1488
+ },
1489
+ {
1490
+ "epoch": 0.4576620535902074,
1491
+ "grad_norm": 2.3125,
1492
+ "learning_rate": 5.6591849255168015e-05,
1493
+ "loss": 0.39728033542633057,
1494
+ "memory(GiB)": 75.82,
1495
+ "step": 680,
1496
+ "token_acc": 0.893990116371752,
1497
+ "train_speed(iter/s)": 0.308265
1498
+ },
1499
+ {
1500
+ "epoch": 0.4610272157489589,
1501
+ "grad_norm": 2.1875,
1502
+ "learning_rate": 5.60672190490541e-05,
1503
+ "loss": 0.422639799118042,
1504
+ "memory(GiB)": 75.82,
1505
+ "step": 685,
1506
+ "token_acc": 0.8848245180425112,
1507
+ "train_speed(iter/s)": 0.309233
1508
+ },
1509
+ {
1510
+ "epoch": 0.46439237790771043,
1511
+ "grad_norm": 2.265625,
1512
+ "learning_rate": 5.5541909995050554e-05,
1513
+ "loss": 0.39331207275390623,
1514
+ "memory(GiB)": 75.82,
1515
+ "step": 690,
1516
+ "token_acc": 0.8947537301459971,
1517
+ "train_speed(iter/s)": 0.310207
1518
+ },
1519
+ {
1520
+ "epoch": 0.467757540066462,
1521
+ "grad_norm": 2.21875,
1522
+ "learning_rate": 5.501598086884025e-05,
1523
+ "loss": 0.43639063835144043,
1524
+ "memory(GiB)": 75.82,
1525
+ "step": 695,
1526
+ "token_acc": 0.8884950048340315,
1527
+ "train_speed(iter/s)": 0.311107
1528
+ },
1529
+ {
1530
+ "epoch": 0.47112270222521346,
1531
+ "grad_norm": 2.375,
1532
+ "learning_rate": 5.448949051548459e-05,
1533
+ "loss": 0.413299560546875,
1534
+ "memory(GiB)": 75.82,
1535
+ "step": 700,
1536
+ "token_acc": 0.8879505353641984,
1537
+ "train_speed(iter/s)": 0.312013
1538
+ },
1539
+ {
1540
+ "epoch": 0.47112270222521346,
1541
+ "eval_loss": 0.4406000077724457,
1542
+ "eval_runtime": 6.3948,
1543
+ "eval_samples_per_second": 37.531,
1544
+ "eval_steps_per_second": 37.531,
1545
+ "eval_token_acc": 0.8806025369978858,
1546
+ "step": 700
1547
+ },
1548
+ {
1549
+ "epoch": 0.474487864383965,
1550
+ "grad_norm": 2.421875,
1551
+ "learning_rate": 5.396249784283942e-05,
1552
+ "loss": 0.43725104331970216,
1553
+ "memory(GiB)": 75.82,
1554
+ "step": 705,
1555
+ "token_acc": 0.8810175054704595,
1556
+ "train_speed(iter/s)": 0.304357
1557
+ },
1558
+ {
1559
+ "epoch": 0.47785302654271655,
1560
+ "grad_norm": 2.34375,
1561
+ "learning_rate": 5.343506181496405e-05,
1562
+ "loss": 0.41141476631164553,
1563
+ "memory(GiB)": 75.82,
1564
+ "step": 710,
1565
+ "token_acc": 0.8912708204811844,
1566
+ "train_speed(iter/s)": 0.305281
1567
+ },
1568
+ {
1569
+ "epoch": 0.48121818870146804,
1570
+ "grad_norm": 2.609375,
1571
+ "learning_rate": 5.290724144552379e-05,
1572
+ "loss": 0.5024977684020996,
1573
+ "memory(GiB)": 75.82,
1574
+ "step": 715,
1575
+ "token_acc": 0.8692831144168381,
1576
+ "train_speed(iter/s)": 0.306216
1577
+ },
1578
+ {
1579
+ "epoch": 0.4845833508602196,
1580
+ "grad_norm": 1.8828125,
1581
+ "learning_rate": 5.2379095791187124e-05,
1582
+ "loss": 0.37138142585754397,
1583
+ "memory(GiB)": 75.82,
1584
+ "step": 720,
1585
+ "token_acc": 0.8965241069998399,
1586
+ "train_speed(iter/s)": 0.307078
1587
+ },
1588
+ {
1589
+ "epoch": 0.4879485130189711,
1590
+ "grad_norm": 2.484375,
1591
+ "learning_rate": 5.185068394501791e-05,
1592
+ "loss": 0.46549081802368164,
1593
+ "memory(GiB)": 75.82,
1594
+ "step": 725,
1595
+ "token_acc": 0.8741355463347165,
1596
+ "train_speed(iter/s)": 0.308012
1597
+ },
1598
+ {
1599
+ "epoch": 0.4913136751777226,
1600
+ "grad_norm": 2.828125,
1601
+ "learning_rate": 5.132206502986368e-05,
1602
+ "loss": 0.5339263916015625,
1603
+ "memory(GiB)": 75.82,
1604
+ "step": 730,
1605
+ "token_acc": 0.8623881049916553,
1606
+ "train_speed(iter/s)": 0.308868
1607
+ },
1608
+ {
1609
+ "epoch": 0.49467883733647416,
1610
+ "grad_norm": 2.5625,
1611
+ "learning_rate": 5.0793298191740404e-05,
1612
+ "loss": 0.4308777809143066,
1613
+ "memory(GiB)": 75.82,
1614
+ "step": 735,
1615
+ "token_acc": 0.8834385624089364,
1616
+ "train_speed(iter/s)": 0.309717
1617
+ },
1618
+ {
1619
+ "epoch": 0.49804399949522565,
1620
+ "grad_norm": 3.0,
1621
+ "learning_rate": 5.026444259321489e-05,
1622
+ "loss": 0.3827210903167725,
1623
+ "memory(GiB)": 75.82,
1624
+ "step": 740,
1625
+ "token_acc": 0.8980582524271845,
1626
+ "train_speed(iter/s)": 0.310525
1627
+ },
1628
+ {
1629
+ "epoch": 0.5014091616539772,
1630
+ "grad_norm": 2.1875,
1631
+ "learning_rate": 4.973555740678511e-05,
1632
+ "loss": 0.4466721534729004,
1633
+ "memory(GiB)": 75.82,
1634
+ "step": 745,
1635
+ "token_acc": 0.8842266462480858,
1636
+ "train_speed(iter/s)": 0.311445
1637
+ },
1638
+ {
1639
+ "epoch": 0.5047743238127287,
1640
+ "grad_norm": 2.765625,
1641
+ "learning_rate": 4.92067018082596e-05,
1642
+ "loss": 0.5502868175506592,
1643
+ "memory(GiB)": 75.82,
1644
+ "step": 750,
1645
+ "token_acc": 0.8586033117350612,
1646
+ "train_speed(iter/s)": 0.312324
1647
+ },
1648
+ {
1649
+ "epoch": 0.5047743238127287,
1650
+ "eval_loss": 0.42037147283554077,
1651
+ "eval_runtime": 6.3843,
1652
+ "eval_samples_per_second": 37.592,
1653
+ "eval_steps_per_second": 37.592,
1654
+ "eval_token_acc": 0.8845665961945032,
1655
+ "step": 750
1656
+ },
1657
+ {
1658
+ "epoch": 0.5081394859714803,
1659
+ "grad_norm": 2.125,
1660
+ "learning_rate": 4.8677934970136335e-05,
1661
+ "loss": 0.5509189128875732,
1662
+ "memory(GiB)": 75.82,
1663
+ "step": 755,
1664
+ "token_acc": 0.8773990147783252,
1665
+ "train_speed(iter/s)": 0.304846
1666
+ },
1667
+ {
1668
+ "epoch": 0.5115046481302318,
1669
+ "grad_norm": 1.8203125,
1670
+ "learning_rate": 4.8149316054982095e-05,
1671
+ "loss": 0.392488431930542,
1672
+ "memory(GiB)": 75.82,
1673
+ "step": 760,
1674
+ "token_acc": 0.8881675052751177,
1675
+ "train_speed(iter/s)": 0.305699
1676
+ },
1677
+ {
1678
+ "epoch": 0.5148698102889833,
1679
+ "grad_norm": 2.125,
1680
+ "learning_rate": 4.762090420881289e-05,
1681
+ "loss": 0.34769492149353026,
1682
+ "memory(GiB)": 75.82,
1683
+ "step": 765,
1684
+ "token_acc": 0.904643578195372,
1685
+ "train_speed(iter/s)": 0.306491
1686
+ },
1687
+ {
1688
+ "epoch": 0.5182349724477349,
1689
+ "grad_norm": 1.9453125,
1690
+ "learning_rate": 4.709275855447621e-05,
1691
+ "loss": 0.34389894008636473,
1692
+ "memory(GiB)": 75.82,
1693
+ "step": 770,
1694
+ "token_acc": 0.9075886411038023,
1695
+ "train_speed(iter/s)": 0.307358
1696
+ },
1697
+ {
1698
+ "epoch": 0.5216001346064864,
1699
+ "grad_norm": 2.234375,
1700
+ "learning_rate": 4.6564938185035956e-05,
1701
+ "loss": 0.3195344924926758,
1702
+ "memory(GiB)": 75.82,
1703
+ "step": 775,
1704
+ "token_acc": 0.9082976621666118,
1705
+ "train_speed(iter/s)": 0.308159
1706
+ },
1707
+ {
1708
+ "epoch": 0.5249652967652378,
1709
+ "grad_norm": 2.21875,
1710
+ "learning_rate": 4.603750215716057e-05,
1711
+ "loss": 0.38263275623321535,
1712
+ "memory(GiB)": 75.82,
1713
+ "step": 780,
1714
+ "token_acc": 0.8930994539136191,
1715
+ "train_speed(iter/s)": 0.30896
1716
+ },
1717
+ {
1718
+ "epoch": 0.5283304589239894,
1719
+ "grad_norm": 2.4375,
1720
+ "learning_rate": 4.551050948451542e-05,
1721
+ "loss": 0.4862419605255127,
1722
+ "memory(GiB)": 75.82,
1723
+ "step": 785,
1724
+ "token_acc": 0.8823529411764706,
1725
+ "train_speed(iter/s)": 0.309802
1726
+ },
1727
+ {
1728
+ "epoch": 0.5316956210827409,
1729
+ "grad_norm": 2.359375,
1730
+ "learning_rate": 4.498401913115975e-05,
1731
+ "loss": 0.46417646408081054,
1732
+ "memory(GiB)": 75.82,
1733
+ "step": 790,
1734
+ "token_acc": 0.8798179059180576,
1735
+ "train_speed(iter/s)": 0.310657
1736
+ },
1737
+ {
1738
+ "epoch": 0.5350607832414924,
1739
+ "grad_norm": 2.03125,
1740
+ "learning_rate": 4.445809000494946e-05,
1741
+ "loss": 0.5157633304595948,
1742
+ "memory(GiB)": 75.82,
1743
+ "step": 795,
1744
+ "token_acc": 0.8697006636868482,
1745
+ "train_speed(iter/s)": 0.311416
1746
+ },
1747
+ {
1748
+ "epoch": 0.538425945400244,
1749
+ "grad_norm": 2.28125,
1750
+ "learning_rate": 4.393278095094591e-05,
1751
+ "loss": 0.36940808296203614,
1752
+ "memory(GiB)": 75.82,
1753
+ "step": 800,
1754
+ "token_acc": 0.8981329839502129,
1755
+ "train_speed(iter/s)": 0.312237
1756
+ },
1757
+ {
1758
+ "epoch": 0.538425945400244,
1759
+ "eval_loss": 0.3978128135204315,
1760
+ "eval_runtime": 6.3728,
1761
+ "eval_samples_per_second": 37.66,
1762
+ "eval_steps_per_second": 37.66,
1763
+ "eval_token_acc": 0.8903276955602537,
1764
+ "step": 800
1765
+ },
1766
+ {
1767
+ "epoch": 0.5417911075589955,
1768
+ "grad_norm": 1.9453125,
1769
+ "learning_rate": 4.340815074483199e-05,
1770
+ "loss": 0.34389219284057615,
1771
+ "memory(GiB)": 75.82,
1772
+ "step": 805,
1773
+ "token_acc": 0.8938571824626718,
1774
+ "train_speed(iter/s)": 0.305594
1775
+ },
1776
+ {
1777
+ "epoch": 0.545156269717747,
1778
+ "grad_norm": 2.109375,
1779
+ "learning_rate": 4.288425808633575e-05,
1780
+ "loss": 0.31910243034362795,
1781
+ "memory(GiB)": 75.82,
1782
+ "step": 810,
1783
+ "token_acc": 0.9097661188369153,
1784
+ "train_speed(iter/s)": 0.306364
1785
+ },
1786
+ {
1787
+ "epoch": 0.5485214318764986,
1788
+ "grad_norm": 2.109375,
1789
+ "learning_rate": 4.236116159266265e-05,
1790
+ "loss": 0.3916430950164795,
1791
+ "memory(GiB)": 75.82,
1792
+ "step": 815,
1793
+ "token_acc": 0.8918385922330098,
1794
+ "train_speed(iter/s)": 0.307181
1795
+ },
1796
+ {
1797
+ "epoch": 0.5518865940352501,
1798
+ "grad_norm": 2.265625,
1799
+ "learning_rate": 4.1838919791937034e-05,
1800
+ "loss": 0.38680903911590575,
1801
+ "memory(GiB)": 75.82,
1802
+ "step": 820,
1803
+ "token_acc": 0.8982239382239382,
1804
+ "train_speed(iter/s)": 0.307962
1805
+ },
1806
+ {
1807
+ "epoch": 0.5552517561940016,
1808
+ "grad_norm": 3.09375,
1809
+ "learning_rate": 4.131759111665349e-05,
1810
+ "loss": 0.39566311836242674,
1811
+ "memory(GiB)": 75.82,
1812
+ "step": 825,
1813
+ "token_acc": 0.8838128359152703,
1814
+ "train_speed(iter/s)": 0.308723
1815
+ },
1816
+ {
1817
+ "epoch": 0.5586169183527532,
1818
+ "grad_norm": 2.609375,
1819
+ "learning_rate": 4.0797233897138985e-05,
1820
+ "loss": 0.3857170820236206,
1821
+ "memory(GiB)": 75.82,
1822
+ "step": 830,
1823
+ "token_acc": 0.8954257979114576,
1824
+ "train_speed(iter/s)": 0.309512
1825
+ },
1826
+ {
1827
+ "epoch": 0.5619820805115047,
1828
+ "grad_norm": 2.109375,
1829
+ "learning_rate": 4.027790635502646e-05,
1830
+ "loss": 0.4497522354125977,
1831
+ "memory(GiB)": 75.82,
1832
+ "step": 835,
1833
+ "token_acc": 0.8830426939266386,
1834
+ "train_speed(iter/s)": 0.310275
1835
+ },
1836
+ {
1837
+ "epoch": 0.5653472426702562,
1838
+ "grad_norm": 2.484375,
1839
+ "learning_rate": 3.9759666596740476e-05,
1840
+ "loss": 0.36929664611816404,
1841
+ "memory(GiB)": 75.82,
1842
+ "step": 840,
1843
+ "token_acc": 0.9050632911392406,
1844
+ "train_speed(iter/s)": 0.31104
1845
+ },
1846
+ {
1847
+ "epoch": 0.5687124048290076,
1848
+ "grad_norm": 2.828125,
1849
+ "learning_rate": 3.924257260699583e-05,
1850
+ "loss": 0.4202712535858154,
1851
+ "memory(GiB)": 75.82,
1852
+ "step": 845,
1853
+ "token_acc": 0.8854133418448771,
1854
+ "train_speed(iter/s)": 0.311816
1855
+ },
1856
+ {
1857
+ "epoch": 0.5720775669877592,
1858
+ "grad_norm": 1.875,
1859
+ "learning_rate": 3.8726682242309794e-05,
1860
+ "loss": 0.3440741777420044,
1861
+ "memory(GiB)": 75.82,
1862
+ "step": 850,
1863
+ "token_acc": 0.9036319612590799,
1864
+ "train_speed(iter/s)": 0.312528
1865
+ },
1866
+ {
1867
+ "epoch": 0.5720775669877592,
1868
+ "eval_loss": 0.38969123363494873,
1869
+ "eval_runtime": 6.3806,
1870
+ "eval_samples_per_second": 37.614,
1871
+ "eval_steps_per_second": 37.614,
1872
+ "eval_token_acc": 0.8914376321353066,
1873
+ "step": 850
1874
+ },
1875
+ {
1876
+ "epoch": 0.5754427291465107,
1877
+ "grad_norm": 2.25,
1878
+ "learning_rate": 3.821205322452863e-05,
1879
+ "loss": 0.3591428756713867,
1880
+ "memory(GiB)": 75.82,
1881
+ "step": 855,
1882
+ "token_acc": 0.8932054420676646,
1883
+ "train_speed(iter/s)": 0.30595
1884
+ },
1885
+ {
1886
+ "epoch": 0.5788078913052622,
1887
+ "grad_norm": 2.109375,
1888
+ "learning_rate": 3.769874313436933e-05,
1889
+ "loss": 0.4184281349182129,
1890
+ "memory(GiB)": 75.82,
1891
+ "step": 860,
1892
+ "token_acc": 0.8862865449846551,
1893
+ "train_speed(iter/s)": 0.306767
1894
+ },
1895
+ {
1896
+ "epoch": 0.5821730534640138,
1897
+ "grad_norm": 1.890625,
1898
+ "learning_rate": 3.718680940497687e-05,
1899
+ "loss": 0.4002545833587646,
1900
+ "memory(GiB)": 75.82,
1901
+ "step": 865,
1902
+ "token_acc": 0.896395693555937,
1903
+ "train_speed(iter/s)": 0.307516
1904
+ },
1905
+ {
1906
+ "epoch": 0.5855382156227653,
1907
+ "grad_norm": 2.71875,
1908
+ "learning_rate": 3.6676309315498256e-05,
1909
+ "loss": 0.43825640678405764,
1910
+ "memory(GiB)": 75.82,
1911
+ "step": 870,
1912
+ "token_acc": 0.8854260764829871,
1913
+ "train_speed(iter/s)": 0.308318
1914
+ },
1915
+ {
1916
+ "epoch": 0.5889033777815168,
1917
+ "grad_norm": 2.109375,
1918
+ "learning_rate": 3.616729998467365e-05,
1919
+ "loss": 0.4464766025543213,
1920
+ "memory(GiB)": 75.82,
1921
+ "step": 875,
1922
+ "token_acc": 0.8829486224869695,
1923
+ "train_speed(iter/s)": 0.309057
1924
+ },
1925
+ {
1926
+ "epoch": 0.5922685399402684,
1927
+ "grad_norm": 2.328125,
1928
+ "learning_rate": 3.5659838364445505e-05,
1929
+ "loss": 0.3885576486587524,
1930
+ "memory(GiB)": 75.82,
1931
+ "step": 880,
1932
+ "token_acc": 0.8979990239141045,
1933
+ "train_speed(iter/s)": 0.309821
1934
+ },
1935
+ {
1936
+ "epoch": 0.5956337020990199,
1937
+ "grad_norm": 2.359375,
1938
+ "learning_rate": 3.515398123358627e-05,
1939
+ "loss": 0.29079430103302,
1940
+ "memory(GiB)": 75.82,
1941
+ "step": 885,
1942
+ "token_acc": 0.9171314741035856,
1943
+ "train_speed(iter/s)": 0.310587
1944
+ },
1945
+ {
1946
+ "epoch": 0.5989988642577714,
1947
+ "grad_norm": 1.8984375,
1948
+ "learning_rate": 3.464978519134561e-05,
1949
+ "loss": 0.35250732898712156,
1950
+ "memory(GiB)": 75.82,
1951
+ "step": 890,
1952
+ "token_acc": 0.9011910753229324,
1953
+ "train_speed(iter/s)": 0.311324
1954
+ },
1955
+ {
1956
+ "epoch": 0.602364026416523,
1957
+ "grad_norm": 2.796875,
1958
+ "learning_rate": 3.414730665111766e-05,
1959
+ "loss": 0.5763841152191163,
1960
+ "memory(GiB)": 75.82,
1961
+ "step": 895,
1962
+ "token_acc": 0.850735294117647,
1963
+ "train_speed(iter/s)": 0.312078
1964
+ },
1965
+ {
1966
+ "epoch": 0.6057291885752745,
1967
+ "grad_norm": 1.8125,
1968
+ "learning_rate": 3.364660183412892e-05,
1969
+ "loss": 0.3474919319152832,
1970
+ "memory(GiB)": 75.82,
1971
+ "step": 900,
1972
+ "token_acc": 0.9043635170603674,
1973
+ "train_speed(iter/s)": 0.312842
1974
+ },
1975
+ {
1976
+ "epoch": 0.6057291885752745,
1977
+ "eval_loss": 0.3748236298561096,
1978
+ "eval_runtime": 6.318,
1979
+ "eval_samples_per_second": 37.987,
1980
+ "eval_steps_per_second": 37.987,
1981
+ "eval_token_acc": 0.896247357293869,
1982
+ "step": 900
1983
+ },
1984
+ {
1985
+ "epoch": 0.609094350734026,
1986
+ "grad_norm": 2.453125,
1987
+ "learning_rate": 3.314772676314791e-05,
1988
+ "loss": 0.3573255777359009,
1989
+ "memory(GiB)": 75.82,
1990
+ "step": 905,
1991
+ "token_acc": 0.8959614555995142,
1992
+ "train_speed(iter/s)": 0.306732
1993
+ },
1994
+ {
1995
+ "epoch": 0.6124595128927776,
1996
+ "grad_norm": 2.671875,
1997
+ "learning_rate": 3.2650737256216886e-05,
1998
+ "loss": 0.35031719207763673,
1999
+ "memory(GiB)": 75.82,
2000
+ "step": 910,
2001
+ "token_acc": 0.901930971567584,
2002
+ "train_speed(iter/s)": 0.307389
2003
+ },
2004
+ {
2005
+ "epoch": 0.615824675051529,
2006
+ "grad_norm": 2.359375,
2007
+ "learning_rate": 3.215568892040641e-05,
2008
+ "loss": 0.42179441452026367,
2009
+ "memory(GiB)": 75.82,
2010
+ "step": 915,
2011
+ "token_acc": 0.8914620966496835,
2012
+ "train_speed(iter/s)": 0.308116
2013
+ },
2014
+ {
2015
+ "epoch": 0.6191898372102805,
2016
+ "grad_norm": 1.9375,
2017
+ "learning_rate": 3.16626371455937e-05,
2018
+ "loss": 0.3349519968032837,
2019
+ "memory(GiB)": 75.82,
2020
+ "step": 920,
2021
+ "token_acc": 0.9102564102564102,
2022
+ "train_speed(iter/s)": 0.3088
2023
+ },
2024
+ {
2025
+ "epoch": 0.6225549993690321,
2026
+ "grad_norm": 2.265625,
2027
+ "learning_rate": 3.1171637098265064e-05,
2028
+ "loss": 0.3286914587020874,
2029
+ "memory(GiB)": 75.82,
2030
+ "step": 925,
2031
+ "token_acc": 0.9120385232744783,
2032
+ "train_speed(iter/s)": 0.309527
2033
+ },
2034
+ {
2035
+ "epoch": 0.6259201615277836,
2036
+ "grad_norm": 2.078125,
2037
+ "learning_rate": 3.0682743715343564e-05,
2038
+ "loss": 0.32795827388763427,
2039
+ "memory(GiB)": 75.82,
2040
+ "step": 930,
2041
+ "token_acc": 0.9038667278007031,
2042
+ "train_speed(iter/s)": 0.3102
2043
+ },
2044
+ {
2045
+ "epoch": 0.6292853236865351,
2046
+ "grad_norm": 1.7890625,
2047
+ "learning_rate": 3.019601169804216e-05,
2048
+ "loss": 0.35293123722076414,
2049
+ "memory(GiB)": 75.82,
2050
+ "step": 935,
2051
+ "token_acc": 0.9023519870235198,
2052
+ "train_speed(iter/s)": 0.310922
2053
+ },
2054
+ {
2055
+ "epoch": 0.6326504858452867,
2056
+ "grad_norm": 1.9609375,
2057
+ "learning_rate": 2.9711495505743313e-05,
2058
+ "loss": 0.2731185436248779,
2059
+ "memory(GiB)": 75.82,
2060
+ "step": 940,
2061
+ "token_acc": 0.923992673992674,
2062
+ "train_speed(iter/s)": 0.311559
2063
+ },
2064
+ {
2065
+ "epoch": 0.6360156480040382,
2066
+ "grad_norm": 2.25,
2067
+ "learning_rate": 2.9229249349905684e-05,
2068
+ "loss": 0.46314468383789065,
2069
+ "memory(GiB)": 75.82,
2070
+ "step": 945,
2071
+ "token_acc": 0.8777718407694363,
2072
+ "train_speed(iter/s)": 0.312255
2073
+ },
2074
+ {
2075
+ "epoch": 0.6393808101627897,
2076
+ "grad_norm": 2.109375,
2077
+ "learning_rate": 2.8749327187998515e-05,
2078
+ "loss": 0.3386242151260376,
2079
+ "memory(GiB)": 75.82,
2080
+ "step": 950,
2081
+ "token_acc": 0.906275336468299,
2082
+ "train_speed(iter/s)": 0.312929
2083
+ },
2084
+ {
2085
+ "epoch": 0.6393808101627897,
2086
+ "eval_loss": 0.36514538526535034,
2087
+ "eval_runtime": 6.288,
2088
+ "eval_samples_per_second": 38.168,
2089
+ "eval_steps_per_second": 38.168,
2090
+ "eval_token_acc": 0.9002642706131079,
2091
+ "step": 950
2092
+ },
2093
+ {
2094
+ "epoch": 0.6427459723215413,
2095
+ "grad_norm": 2.0625,
2096
+ "learning_rate": 2.827178271746441e-05,
2097
+ "loss": 0.3628067970275879,
2098
+ "memory(GiB)": 75.82,
2099
+ "step": 955,
2100
+ "token_acc": 0.9006407689227073,
2101
+ "train_speed(iter/s)": 0.307148
2102
+ },
2103
+ {
2104
+ "epoch": 0.6461111344802928,
2105
+ "grad_norm": 2.171875,
2106
+ "learning_rate": 2.7796669369711294e-05,
2107
+ "loss": 0.35329625606536863,
2108
+ "memory(GiB)": 75.82,
2109
+ "step": 960,
2110
+ "token_acc": 0.9036697247706422,
2111
+ "train_speed(iter/s)": 0.307809
2112
+ },
2113
+ {
2114
+ "epoch": 0.6494762966390443,
2115
+ "grad_norm": 2.03125,
2116
+ "learning_rate": 2.7324040304134123e-05,
2117
+ "loss": 0.3087867259979248,
2118
+ "memory(GiB)": 75.82,
2119
+ "step": 965,
2120
+ "token_acc": 0.915299187800431,
2121
+ "train_speed(iter/s)": 0.30851
2122
+ },
2123
+ {
2124
+ "epoch": 0.6528414587977959,
2125
+ "grad_norm": 2.25,
2126
+ "learning_rate": 2.6853948402166878e-05,
2127
+ "loss": 0.3155955791473389,
2128
+ "memory(GiB)": 75.82,
2129
+ "step": 970,
2130
+ "token_acc": 0.912190414924413,
2131
+ "train_speed(iter/s)": 0.309237
2132
+ },
2133
+ {
2134
+ "epoch": 0.6562066209565474,
2135
+ "grad_norm": 2.078125,
2136
+ "learning_rate": 2.638644626136587e-05,
2137
+ "loss": 0.31089558601379397,
2138
+ "memory(GiB)": 75.82,
2139
+ "step": 975,
2140
+ "token_acc": 0.9141963109354414,
2141
+ "train_speed(iter/s)": 0.30992
2142
+ },
2143
+ {
2144
+ "epoch": 0.6595717831152988,
2145
+ "grad_norm": 2.390625,
2146
+ "learning_rate": 2.5921586189524694e-05,
2147
+ "loss": 0.38663172721862793,
2148
+ "memory(GiB)": 75.82,
2149
+ "step": 980,
2150
+ "token_acc": 0.9041765169424744,
2151
+ "train_speed(iter/s)": 0.310581
2152
+ },
2153
+ {
2154
+ "epoch": 0.6629369452740504,
2155
+ "grad_norm": 2.140625,
2156
+ "learning_rate": 2.5459420198821605e-05,
2157
+ "loss": 0.34139630794525144,
2158
+ "memory(GiB)": 75.82,
2159
+ "step": 985,
2160
+ "token_acc": 0.908329455560726,
2161
+ "train_speed(iter/s)": 0.311278
2162
+ },
2163
+ {
2164
+ "epoch": 0.6663021074328019,
2165
+ "grad_norm": 1.953125,
2166
+ "learning_rate": 2.500000000000001e-05,
2167
+ "loss": 0.30913376808166504,
2168
+ "memory(GiB)": 75.82,
2169
+ "step": 990,
2170
+ "token_acc": 0.9187433922368222,
2171
+ "train_speed(iter/s)": 0.311944
2172
+ },
2173
+ {
2174
+ "epoch": 0.6696672695915534,
2175
+ "grad_norm": 2.53125,
2176
+ "learning_rate": 2.454337699658267e-05,
2177
+ "loss": 0.3810436248779297,
2178
+ "memory(GiB)": 75.82,
2179
+ "step": 995,
2180
+ "token_acc": 0.8920236336779911,
2181
+ "train_speed(iter/s)": 0.312598
2182
+ },
2183
+ {
2184
+ "epoch": 0.6730324317503049,
2185
+ "grad_norm": 2.546875,
2186
+ "learning_rate": 2.4089602279120222e-05,
2187
+ "loss": 0.3346914768218994,
2188
+ "memory(GiB)": 75.82,
2189
+ "step": 1000,
2190
+ "token_acc": 0.9072181670721817,
2191
+ "train_speed(iter/s)": 0.313245
2192
+ },
2193
+ {
2194
+ "epoch": 0.6730324317503049,
2195
+ "eval_loss": 0.35263723134994507,
2196
+ "eval_runtime": 6.3269,
2197
+ "eval_samples_per_second": 37.933,
2198
+ "eval_steps_per_second": 37.933,
2199
+ "eval_token_acc": 0.9024841437632135,
2200
+ "step": 1000
2201
+ },
2202
+ {
2203
+ "epoch": 0.6763975939090565,
2204
+ "grad_norm": 2.53125,
2205
+ "learning_rate": 2.363872661947488e-05,
2206
+ "loss": 0.3666444063186646,
2207
+ "memory(GiB)": 75.82,
2208
+ "step": 1005,
2209
+ "token_acc": 0.901596274033677,
2210
+ "train_speed(iter/s)": 0.307343
2211
+ },
2212
+ {
2213
+ "epoch": 0.679762756067808,
2214
+ "grad_norm": 1.8828125,
2215
+ "learning_rate": 2.319080046513954e-05,
2216
+ "loss": 0.2809803247451782,
2217
+ "memory(GiB)": 75.82,
2218
+ "step": 1010,
2219
+ "token_acc": 0.9265139116202946,
2220
+ "train_speed(iter/s)": 0.308004
2221
+ },
2222
+ {
2223
+ "epoch": 0.6831279182265595,
2224
+ "grad_norm": 1.734375,
2225
+ "learning_rate": 2.274587393359342e-05,
2226
+ "loss": 0.3413016557693481,
2227
+ "memory(GiB)": 75.82,
2228
+ "step": 1015,
2229
+ "token_acc": 0.9119178921568627,
2230
+ "train_speed(iter/s)": 0.30864
2231
+ },
2232
+ {
2233
+ "epoch": 0.6864930803853111,
2234
+ "grad_norm": 2.625,
2235
+ "learning_rate": 2.2303996806694488e-05,
2236
+ "loss": 0.378632116317749,
2237
+ "memory(GiB)": 75.82,
2238
+ "step": 1020,
2239
+ "token_acc": 0.9010562286424355,
2240
+ "train_speed(iter/s)": 0.309282
2241
+ },
2242
+ {
2243
+ "epoch": 0.6898582425440626,
2244
+ "grad_norm": 1.8359375,
2245
+ "learning_rate": 2.1865218525109495e-05,
2246
+ "loss": 0.32521207332611085,
2247
+ "memory(GiB)": 75.82,
2248
+ "step": 1025,
2249
+ "token_acc": 0.9113677264547091,
2250
+ "train_speed(iter/s)": 0.309938
2251
+ },
2252
+ {
2253
+ "epoch": 0.6932234047028141,
2254
+ "grad_norm": 2.125,
2255
+ "learning_rate": 2.1429588182782144e-05,
2256
+ "loss": 0.3491218090057373,
2257
+ "memory(GiB)": 75.82,
2258
+ "step": 1030,
2259
+ "token_acc": 0.9001129578828465,
2260
+ "train_speed(iter/s)": 0.310579
2261
+ },
2262
+ {
2263
+ "epoch": 0.6965885668615657,
2264
+ "grad_norm": 1.9921875,
2265
+ "learning_rate": 2.09971545214401e-05,
2266
+ "loss": 0.32745966911315916,
2267
+ "memory(GiB)": 75.82,
2268
+ "step": 1035,
2269
+ "token_acc": 0.9061001818482394,
2270
+ "train_speed(iter/s)": 0.311215
2271
+ },
2272
+ {
2273
+ "epoch": 0.6999537290203172,
2274
+ "grad_norm": 2.40625,
2275
+ "learning_rate": 2.0567965925141363e-05,
2276
+ "loss": 0.4002220153808594,
2277
+ "memory(GiB)": 75.82,
2278
+ "step": 1040,
2279
+ "token_acc": 0.8949858088930936,
2280
+ "train_speed(iter/s)": 0.311883
2281
+ },
2282
+ {
2283
+ "epoch": 0.7033188911790687,
2284
+ "grad_norm": 2.234375,
2285
+ "learning_rate": 2.0142070414860704e-05,
2286
+ "loss": 0.33180482387542726,
2287
+ "memory(GiB)": 75.82,
2288
+ "step": 1045,
2289
+ "token_acc": 0.9061082552162081,
2290
+ "train_speed(iter/s)": 0.31252
2291
+ },
2292
+ {
2293
+ "epoch": 0.7066840533378203,
2294
+ "grad_norm": 2.375,
2295
+ "learning_rate": 1.9719515643116674e-05,
2296
+ "loss": 0.2780953884124756,
2297
+ "memory(GiB)": 75.82,
2298
+ "step": 1050,
2299
+ "token_acc": 0.9193521731945133,
2300
+ "train_speed(iter/s)": 0.313162
2301
+ },
2302
+ {
2303
+ "epoch": 0.7066840533378203,
2304
+ "eval_loss": 0.3458307981491089,
2305
+ "eval_runtime": 6.3244,
2306
+ "eval_samples_per_second": 37.948,
2307
+ "eval_steps_per_second": 37.948,
2308
+ "eval_token_acc": 0.9045983086680761,
2309
+ "step": 1050
2310
+ },
2311
+ {
2312
+ "epoch": 0.7100492154965717,
2313
+ "grad_norm": 1.8359375,
2314
+ "learning_rate": 1.9300348888639914e-05,
2315
+ "loss": 0.2843871355056763,
2316
+ "memory(GiB)": 75.82,
2317
+ "step": 1055,
2318
+ "token_acc": 0.908830434955629,
2319
+ "train_speed(iter/s)": 0.307659
2320
+ },
2321
+ {
2322
+ "epoch": 0.7134143776553232,
2323
+ "grad_norm": 2.265625,
2324
+ "learning_rate": 1.888461705108318e-05,
2325
+ "loss": 0.31880433559417726,
2326
+ "memory(GiB)": 75.82,
2327
+ "step": 1060,
2328
+ "token_acc": 0.9162755488266465,
2329
+ "train_speed(iter/s)": 0.308265
2330
+ },
2331
+ {
2332
+ "epoch": 0.7167795398140748,
2333
+ "grad_norm": 2.53125,
2334
+ "learning_rate": 1.847236664577389e-05,
2335
+ "loss": 0.3458749055862427,
2336
+ "memory(GiB)": 75.82,
2337
+ "step": 1065,
2338
+ "token_acc": 0.905449976441024,
2339
+ "train_speed(iter/s)": 0.308846
2340
+ },
2341
+ {
2342
+ "epoch": 0.7201447019728263,
2343
+ "grad_norm": 2.296875,
2344
+ "learning_rate": 1.8063643798509593e-05,
2345
+ "loss": 0.3110387325286865,
2346
+ "memory(GiB)": 75.82,
2347
+ "step": 1070,
2348
+ "token_acc": 0.9163458691145988,
2349
+ "train_speed(iter/s)": 0.309494
2350
+ },
2351
+ {
2352
+ "epoch": 0.7235098641315778,
2353
+ "grad_norm": 2.484375,
2354
+ "learning_rate": 1.7658494240397126e-05,
2355
+ "loss": 0.3040132522583008,
2356
+ "memory(GiB)": 75.82,
2357
+ "step": 1075,
2358
+ "token_acc": 0.9117370892018779,
2359
+ "train_speed(iter/s)": 0.310049
2360
+ },
2361
+ {
2362
+ "epoch": 0.7268750262903294,
2363
+ "grad_norm": 2.125,
2364
+ "learning_rate": 1.725696330273575e-05,
2365
+ "loss": 0.28740246295928956,
2366
+ "memory(GiB)": 75.82,
2367
+ "step": 1080,
2368
+ "token_acc": 0.9206708975521306,
2369
+ "train_speed(iter/s)": 0.310661
2370
+ },
2371
+ {
2372
+ "epoch": 0.7302401884490809,
2373
+ "grad_norm": 2.171875,
2374
+ "learning_rate": 1.68590959119452e-05,
2375
+ "loss": 0.3025235176086426,
2376
+ "memory(GiB)": 75.82,
2377
+ "step": 1085,
2378
+ "token_acc": 0.912094861660079,
2379
+ "train_speed(iter/s)": 0.311236
2380
+ },
2381
+ {
2382
+ "epoch": 0.7336053506078324,
2383
+ "grad_norm": 1.9375,
2384
+ "learning_rate": 1.646493658453896e-05,
2385
+ "loss": 0.3215456485748291,
2386
+ "memory(GiB)": 75.82,
2387
+ "step": 1090,
2388
+ "token_acc": 0.914180252230083,
2389
+ "train_speed(iter/s)": 0.31184
2390
+ },
2391
+ {
2392
+ "epoch": 0.736970512766584,
2393
+ "grad_norm": 2.609375,
2394
+ "learning_rate": 1.60745294221434e-05,
2395
+ "loss": 0.35790162086486815,
2396
+ "memory(GiB)": 75.82,
2397
+ "step": 1095,
2398
+ "token_acc": 0.8988936693300553,
2399
+ "train_speed(iter/s)": 0.31246
2400
+ },
2401
+ {
2402
+ "epoch": 0.7403356749253355,
2403
+ "grad_norm": 2.078125,
2404
+ "learning_rate": 1.5687918106563326e-05,
2405
+ "loss": 0.3286574840545654,
2406
+ "memory(GiB)": 75.82,
2407
+ "step": 1100,
2408
+ "token_acc": 0.9104522765088489,
2409
+ "train_speed(iter/s)": 0.313052
2410
+ },
2411
+ {
2412
+ "epoch": 0.7403356749253355,
2413
+ "eval_loss": 0.34071242809295654,
2414
+ "eval_runtime": 6.4053,
2415
+ "eval_samples_per_second": 37.469,
2416
+ "eval_steps_per_second": 37.469,
2417
+ "eval_token_acc": 0.9052325581395348,
2418
+ "step": 1100
2419
+ },
2420
+ {
2421
+ "epoch": 0.743700837084087,
2422
+ "grad_norm": 2.5625,
2423
+ "learning_rate": 1.5305145894894547e-05,
2424
+ "loss": 0.3178743600845337,
2425
+ "memory(GiB)": 75.82,
2426
+ "step": 1105,
2427
+ "token_acc": 0.9053732762719924,
2428
+ "train_speed(iter/s)": 0.30749
2429
+ },
2430
+ {
2431
+ "epoch": 0.7470659992428386,
2432
+ "grad_norm": 1.9453125,
2433
+ "learning_rate": 1.4926255614683932e-05,
2434
+ "loss": 0.28967604637145994,
2435
+ "memory(GiB)": 75.82,
2436
+ "step": 1110,
2437
+ "token_acc": 0.9175757575757576,
2438
+ "train_speed(iter/s)": 0.308051
2439
+ },
2440
+ {
2441
+ "epoch": 0.75043116140159,
2442
+ "grad_norm": 1.921875,
2443
+ "learning_rate": 1.4551289659137496e-05,
2444
+ "loss": 0.3481321096420288,
2445
+ "memory(GiB)": 75.82,
2446
+ "step": 1115,
2447
+ "token_acc": 0.9100428367444074,
2448
+ "train_speed(iter/s)": 0.308681
2449
+ },
2450
+ {
2451
+ "epoch": 0.7537963235603415,
2452
+ "grad_norm": 2.34375,
2453
+ "learning_rate": 1.4180289982377137e-05,
2454
+ "loss": 0.3199401617050171,
2455
+ "memory(GiB)": 75.82,
2456
+ "step": 1120,
2457
+ "token_acc": 0.9106353591160221,
2458
+ "train_speed(iter/s)": 0.309223
2459
+ },
2460
+ {
2461
+ "epoch": 0.7571614857190931,
2462
+ "grad_norm": 2.421875,
2463
+ "learning_rate": 1.3813298094746491e-05,
2464
+ "loss": 0.2976421356201172,
2465
+ "memory(GiB)": 75.82,
2466
+ "step": 1125,
2467
+ "token_acc": 0.9175590435675517,
2468
+ "train_speed(iter/s)": 0.309803
2469
+ },
2470
+ {
2471
+ "epoch": 0.7605266478778446,
2472
+ "grad_norm": 3.046875,
2473
+ "learning_rate": 1.345035505816642e-05,
2474
+ "loss": 0.2948709726333618,
2475
+ "memory(GiB)": 75.82,
2476
+ "step": 1130,
2477
+ "token_acc": 0.9145778364116095,
2478
+ "train_speed(iter/s)": 0.310376
2479
+ },
2480
+ {
2481
+ "epoch": 0.7638918100365961,
2482
+ "grad_norm": 2.21875,
2483
+ "learning_rate": 1.3091501481540674e-05,
2484
+ "loss": 0.3075523853302002,
2485
+ "memory(GiB)": 75.82,
2486
+ "step": 1135,
2487
+ "token_acc": 0.9111648285239462,
2488
+ "train_speed(iter/s)": 0.310961
2489
+ },
2490
+ {
2491
+ "epoch": 0.7672569721953476,
2492
+ "grad_norm": 2.390625,
2493
+ "learning_rate": 1.2736777516212266e-05,
2494
+ "loss": 0.2970130205154419,
2495
+ "memory(GiB)": 75.82,
2496
+ "step": 1140,
2497
+ "token_acc": 0.914568783498457,
2498
+ "train_speed(iter/s)": 0.311547
2499
+ },
2500
+ {
2501
+ "epoch": 0.7706221343540992,
2502
+ "grad_norm": 2.171875,
2503
+ "learning_rate": 1.238622285147103e-05,
2504
+ "loss": 0.29731974601745603,
2505
+ "memory(GiB)": 75.82,
2506
+ "step": 1145,
2507
+ "token_acc": 0.9114565731666103,
2508
+ "train_speed(iter/s)": 0.312152
2509
+ },
2510
+ {
2511
+ "epoch": 0.7739872965128507,
2512
+ "grad_norm": 2.8125,
2513
+ "learning_rate": 1.2039876710112847e-05,
2514
+ "loss": 0.3596015930175781,
2515
+ "memory(GiB)": 75.82,
2516
+ "step": 1150,
2517
+ "token_acc": 0.9056197074672825,
2518
+ "train_speed(iter/s)": 0.312745
2519
+ },
2520
+ {
2521
+ "epoch": 0.7739872965128507,
2522
+ "eval_loss": 0.33650800585746765,
2523
+ "eval_runtime": 6.306,
2524
+ "eval_samples_per_second": 38.059,
2525
+ "eval_steps_per_second": 38.059,
2526
+ "eval_token_acc": 0.9056025369978858,
2527
+ "step": 1150
2528
+ },
2529
+ {
2530
+ "epoch": 0.7773524586716022,
2531
+ "grad_norm": 1.8203125,
2532
+ "learning_rate": 1.1697777844051105e-05,
2533
+ "loss": 0.280198860168457,
2534
+ "memory(GiB)": 75.82,
2535
+ "step": 1155,
2536
+ "token_acc": 0.9082849646179534,
2537
+ "train_speed(iter/s)": 0.306862
2538
+ },
2539
+ {
2540
+ "epoch": 0.7807176208303538,
2541
+ "grad_norm": 3.359375,
2542
+ "learning_rate": 1.1359964529980849e-05,
2543
+ "loss": 0.31822154521942136,
2544
+ "memory(GiB)": 75.82,
2545
+ "step": 1160,
2546
+ "token_acc": 0.9165397502284496,
2547
+ "train_speed(iter/s)": 0.307449
2548
+ },
2549
+ {
2550
+ "epoch": 0.7840827829891053,
2551
+ "grad_norm": 2.046875,
2552
+ "learning_rate": 1.1026474565096068e-05,
2553
+ "loss": 0.3568312883377075,
2554
+ "memory(GiB)": 75.82,
2555
+ "step": 1165,
2556
+ "token_acc": 0.9036163277976098,
2557
+ "train_speed(iter/s)": 0.30803
2558
+ },
2559
+ {
2560
+ "epoch": 0.7874479451478568,
2561
+ "grad_norm": 2.984375,
2562
+ "learning_rate": 1.0697345262860636e-05,
2563
+ "loss": 0.3734995603561401,
2564
+ "memory(GiB)": 75.82,
2565
+ "step": 1170,
2566
+ "token_acc": 0.9021773935318604,
2567
+ "train_speed(iter/s)": 0.308568
2568
+ },
2569
+ {
2570
+ "epoch": 0.7908131073066084,
2571
+ "grad_norm": 2.5,
2572
+ "learning_rate": 1.037261344883343e-05,
2573
+ "loss": 0.3525618314743042,
2574
+ "memory(GiB)": 75.82,
2575
+ "step": 1175,
2576
+ "token_acc": 0.9067342073897497,
2577
+ "train_speed(iter/s)": 0.309122
2578
+ },
2579
+ {
2580
+ "epoch": 0.7941782694653599,
2581
+ "grad_norm": 1.5859375,
2582
+ "learning_rate": 1.0052315456547934e-05,
2583
+ "loss": 0.2623563289642334,
2584
+ "memory(GiB)": 75.82,
2585
+ "step": 1180,
2586
+ "token_acc": 0.9251740139211136,
2587
+ "train_speed(iter/s)": 0.309652
2588
+ },
2589
+ {
2590
+ "epoch": 0.7975434316241113,
2591
+ "grad_norm": 2.015625,
2592
+ "learning_rate": 9.73648712344707e-06,
2593
+ "loss": 0.27451634407043457,
2594
+ "memory(GiB)": 75.82,
2595
+ "step": 1185,
2596
+ "token_acc": 0.9259454705364996,
2597
+ "train_speed(iter/s)": 0.310245
2598
+ },
2599
+ {
2600
+ "epoch": 0.8009085937828629,
2601
+ "grad_norm": 2.1875,
2602
+ "learning_rate": 9.425163786873292e-06,
2603
+ "loss": 0.3021913290023804,
2604
+ "memory(GiB)": 75.82,
2605
+ "step": 1190,
2606
+ "token_acc": 0.9174669867947179,
2607
+ "train_speed(iter/s)": 0.310785
2608
+ },
2609
+ {
2610
+ "epoch": 0.8042737559416144,
2611
+ "grad_norm": 2.578125,
2612
+ "learning_rate": 9.118380280114857e-06,
2613
+ "loss": 0.3496464014053345,
2614
+ "memory(GiB)": 75.82,
2615
+ "step": 1195,
2616
+ "token_acc": 0.9070032573289902,
2617
+ "train_speed(iter/s)": 0.311393
2618
+ },
2619
+ {
2620
+ "epoch": 0.8076389181003659,
2621
+ "grad_norm": 1.9765625,
2622
+ "learning_rate": 8.816170928508365e-06,
2623
+ "loss": 0.25614950656890867,
2624
+ "memory(GiB)": 75.82,
2625
+ "step": 1200,
2626
+ "token_acc": 0.9283973187081048,
2627
+ "train_speed(iter/s)": 0.311948
2628
+ },
2629
+ {
2630
+ "epoch": 0.8076389181003659,
2631
+ "eval_loss": 0.3316649794578552,
2632
+ "eval_runtime": 6.3957,
2633
+ "eval_samples_per_second": 37.525,
2634
+ "eval_steps_per_second": 37.525,
2635
+ "eval_token_acc": 0.9070824524312896,
2636
+ "step": 1200
2637
+ },
2638
+ {
2639
+ "epoch": 0.8110040802591175,
2640
+ "grad_norm": 2.1875,
2641
+ "learning_rate": 8.5185695455982e-06,
2642
+ "loss": 0.3539767026901245,
2643
+ "memory(GiB)": 75.82,
2644
+ "step": 1205,
2645
+ "token_acc": 0.9054973715310323,
2646
+ "train_speed(iter/s)": 0.306783
2647
+ },
2648
+ {
2649
+ "epoch": 0.814369242417869,
2650
+ "grad_norm": 1.5859375,
2651
+ "learning_rate": 8.225609429353187e-06,
2652
+ "loss": 0.24862987995147706,
2653
+ "memory(GiB)": 76.54,
2654
+ "step": 1210,
2655
+ "token_acc": 0.9274929223026109,
2656
+ "train_speed(iter/s)": 0.307289
2657
+ },
2658
+ {
2659
+ "epoch": 0.8177344045766205,
2660
+ "grad_norm": 2.875,
2661
+ "learning_rate": 7.937323358440935e-06,
2662
+ "loss": 0.30047638416290284,
2663
+ "memory(GiB)": 76.54,
2664
+ "step": 1215,
2665
+ "token_acc": 0.9201830198271479,
2666
+ "train_speed(iter/s)": 0.307808
2667
+ },
2668
+ {
2669
+ "epoch": 0.8210995667353721,
2670
+ "grad_norm": 2.21875,
2671
+ "learning_rate": 7.653743588560386e-06,
2672
+ "loss": 0.34635608196258544,
2673
+ "memory(GiB)": 77.35,
2674
+ "step": 1220,
2675
+ "token_acc": 0.90292348580221,
2676
+ "train_speed(iter/s)": 0.308334
2677
+ },
2678
+ {
2679
+ "epoch": 0.8244647288941236,
2680
+ "grad_norm": 2.15625,
2681
+ "learning_rate": 7.374901848832683e-06,
2682
+ "loss": 0.2774034976959229,
2683
+ "memory(GiB)": 77.35,
2684
+ "step": 1225,
2685
+ "token_acc": 0.9153208206023571,
2686
+ "train_speed(iter/s)": 0.30888
2687
+ },
2688
+ {
2689
+ "epoch": 0.8278298910528751,
2690
+ "grad_norm": 2.765625,
2691
+ "learning_rate": 7.100829338251147e-06,
2692
+ "loss": 0.3617110729217529,
2693
+ "memory(GiB)": 77.35,
2694
+ "step": 1230,
2695
+ "token_acc": 0.901881936625382,
2696
+ "train_speed(iter/s)": 0.309428
2697
+ },
2698
+ {
2699
+ "epoch": 0.8311950532116267,
2700
+ "grad_norm": 2.515625,
2701
+ "learning_rate": 6.831556722190452e-06,
2702
+ "loss": 0.3457359790802002,
2703
+ "memory(GiB)": 77.35,
2704
+ "step": 1235,
2705
+ "token_acc": 0.9088393543428133,
2706
+ "train_speed(iter/s)": 0.309927
2707
+ },
2708
+ {
2709
+ "epoch": 0.8345602153703782,
2710
+ "grad_norm": 2.140625,
2711
+ "learning_rate": 6.567114128975571e-06,
2712
+ "loss": 0.2862051486968994,
2713
+ "memory(GiB)": 77.35,
2714
+ "step": 1240,
2715
+ "token_acc": 0.9152869313615174,
2716
+ "train_speed(iter/s)": 0.310429
2717
+ },
2718
+ {
2719
+ "epoch": 0.8379253775291297,
2720
+ "grad_norm": 2.09375,
2721
+ "learning_rate": 6.3075311465107535e-06,
2722
+ "loss": 0.32160158157348634,
2723
+ "memory(GiB)": 77.35,
2724
+ "step": 1245,
2725
+ "token_acc": 0.9136561235197121,
2726
+ "train_speed(iter/s)": 0.310932
2727
+ },
2728
+ {
2729
+ "epoch": 0.8412905396878813,
2730
+ "grad_norm": 2.09375,
2731
+ "learning_rate": 6.052836818969026e-06,
2732
+ "loss": 0.3995193958282471,
2733
+ "memory(GiB)": 78.27,
2734
+ "step": 1250,
2735
+ "token_acc": 0.8993502188038721,
2736
+ "train_speed(iter/s)": 0.311402
2737
+ },
2738
+ {
2739
+ "epoch": 0.8412905396878813,
2740
+ "eval_loss": 0.33036008477211,
2741
+ "eval_runtime": 6.3924,
2742
+ "eval_samples_per_second": 37.544,
2743
+ "eval_steps_per_second": 37.544,
2744
+ "eval_token_acc": 0.9071881606765327,
2745
+ "step": 1250
2746
+ },
2747
+ {
2748
+ "epoch": 0.8446557018466327,
2749
+ "grad_norm": 1.8046875,
2750
+ "learning_rate": 5.803059643542491e-06,
2751
+ "loss": 0.3073962926864624,
2752
+ "memory(GiB)": 78.27,
2753
+ "step": 1255,
2754
+ "token_acc": 0.9106831510540497,
2755
+ "train_speed(iter/s)": 0.30711
2756
+ },
2757
+ {
2758
+ "epoch": 0.8480208640053842,
2759
+ "grad_norm": 1.9140625,
2760
+ "learning_rate": 5.558227567253832e-06,
2761
+ "loss": 0.2895121812820435,
2762
+ "memory(GiB)": 78.27,
2763
+ "step": 1260,
2764
+ "token_acc": 0.9245490196078432,
2765
+ "train_speed(iter/s)": 0.307631
2766
+ },
2767
+ {
2768
+ "epoch": 0.8513860261641358,
2769
+ "grad_norm": 2.484375,
2770
+ "learning_rate": 5.318367983829392e-06,
2771
+ "loss": 0.354207706451416,
2772
+ "memory(GiB)": 78.27,
2773
+ "step": 1265,
2774
+ "token_acc": 0.9037365421152628,
2775
+ "train_speed(iter/s)": 0.308129
2776
+ },
2777
+ {
2778
+ "epoch": 0.8547511883228873,
2779
+ "grad_norm": 2.734375,
2780
+ "learning_rate": 5.083507730634152e-06,
2781
+ "loss": 0.3204244375228882,
2782
+ "memory(GiB)": 78.27,
2783
+ "step": 1270,
2784
+ "token_acc": 0.9147783251231527,
2785
+ "train_speed(iter/s)": 0.308617
2786
+ },
2787
+ {
2788
+ "epoch": 0.8581163504816388,
2789
+ "grad_norm": 2.328125,
2790
+ "learning_rate": 4.853673085668947e-06,
2791
+ "loss": 0.29280381202697753,
2792
+ "memory(GiB)": 78.27,
2793
+ "step": 1275,
2794
+ "token_acc": 0.9152905198776758,
2795
+ "train_speed(iter/s)": 0.309122
2796
+ },
2797
+ {
2798
+ "epoch": 0.8614815126403904,
2799
+ "grad_norm": 1.7578125,
2800
+ "learning_rate": 4.6288897646302785e-06,
2801
+ "loss": 0.29522113800048827,
2802
+ "memory(GiB)": 78.27,
2803
+ "step": 1280,
2804
+ "token_acc": 0.9161471321695761,
2805
+ "train_speed(iter/s)": 0.309632
2806
+ },
2807
+ {
2808
+ "epoch": 0.8648466747991419,
2809
+ "grad_norm": 2.265625,
2810
+ "learning_rate": 4.4091829180330505e-06,
2811
+ "loss": 0.36226553916931153,
2812
+ "memory(GiB)": 78.27,
2813
+ "step": 1285,
2814
+ "token_acc": 0.9011563440519563,
2815
+ "train_speed(iter/s)": 0.310141
2816
+ },
2817
+ {
2818
+ "epoch": 0.8682118369578934,
2819
+ "grad_norm": 2.5,
2820
+ "learning_rate": 4.19457712839652e-06,
2821
+ "loss": 0.31820580959320066,
2822
+ "memory(GiB)": 78.27,
2823
+ "step": 1290,
2824
+ "token_acc": 0.9157977883096367,
2825
+ "train_speed(iter/s)": 0.310634
2826
+ },
2827
+ {
2828
+ "epoch": 0.8715769991166449,
2829
+ "grad_norm": 1.875,
2830
+ "learning_rate": 3.9850964074938375e-06,
2831
+ "loss": 0.2922437906265259,
2832
+ "memory(GiB)": 78.27,
2833
+ "step": 1295,
2834
+ "token_acc": 0.9246329526916802,
2835
+ "train_speed(iter/s)": 0.311131
2836
+ },
2837
+ {
2838
+ "epoch": 0.8749421612753965,
2839
+ "grad_norm": 2.421875,
2840
+ "learning_rate": 3.780764193665398e-06,
2841
+ "loss": 0.2996021509170532,
2842
+ "memory(GiB)": 78.27,
2843
+ "step": 1300,
2844
+ "token_acc": 0.912474373127267,
2845
+ "train_speed(iter/s)": 0.311627
2846
+ },
2847
+ {
2848
+ "epoch": 0.8749421612753965,
2849
+ "eval_loss": 0.3295805752277374,
2850
+ "eval_runtime": 6.303,
2851
+ "eval_samples_per_second": 38.077,
2852
+ "eval_steps_per_second": 38.077,
2853
+ "eval_token_acc": 0.9073467230443975,
2854
+ "step": 1300
2855
+ },
2856
+ {
2857
+ "epoch": 0.878307323434148,
2858
+ "grad_norm": 2.25,
2859
+ "learning_rate": 3.581603349196372e-06,
2860
+ "loss": 0.31300716400146483,
2861
+ "memory(GiB)": 78.27,
2862
+ "step": 1305,
2863
+ "token_acc": 0.9083045669166369,
2864
+ "train_speed(iter/s)": 0.307342
2865
+ },
2866
+ {
2867
+ "epoch": 0.8816724855928995,
2868
+ "grad_norm": 1.8671875,
2869
+ "learning_rate": 3.3876361577587113e-06,
2870
+ "loss": 0.2798715114593506,
2871
+ "memory(GiB)": 78.27,
2872
+ "step": 1310,
2873
+ "token_acc": 0.9237044145873321,
2874
+ "train_speed(iter/s)": 0.307815
2875
+ },
2876
+ {
2877
+ "epoch": 0.8850376477516511,
2878
+ "grad_norm": 2.921875,
2879
+ "learning_rate": 3.1988843219178777e-06,
2880
+ "loss": 0.3821584701538086,
2881
+ "memory(GiB)": 78.27,
2882
+ "step": 1315,
2883
+ "token_acc": 0.9002973861324151,
2884
+ "train_speed(iter/s)": 0.3083
2885
+ },
2886
+ {
2887
+ "epoch": 0.8884028099104025,
2888
+ "grad_norm": 2.296875,
2889
+ "learning_rate": 3.0153689607045845e-06,
2890
+ "loss": 0.28262839317321775,
2891
+ "memory(GiB)": 78.27,
2892
+ "step": 1320,
2893
+ "token_acc": 0.921765601217656,
2894
+ "train_speed(iter/s)": 0.308778
2895
+ },
2896
+ {
2897
+ "epoch": 0.891767972069154,
2898
+ "grad_norm": 2.421875,
2899
+ "learning_rate": 2.8371106072518195e-06,
2900
+ "loss": 0.3282342433929443,
2901
+ "memory(GiB)": 78.27,
2902
+ "step": 1325,
2903
+ "token_acc": 0.9107642467972317,
2904
+ "train_speed(iter/s)": 0.309285
2905
+ },
2906
+ {
2907
+ "epoch": 0.8951331342279056,
2908
+ "grad_norm": 2.390625,
2909
+ "learning_rate": 2.664129206497479e-06,
2910
+ "loss": 0.3914219617843628,
2911
+ "memory(GiB)": 78.27,
2912
+ "step": 1330,
2913
+ "token_acc": 0.8952534427190155,
2914
+ "train_speed(iter/s)": 0.309779
2915
+ },
2916
+ {
2917
+ "epoch": 0.8984982963866571,
2918
+ "grad_norm": 1.71875,
2919
+ "learning_rate": 2.496444112952734e-06,
2920
+ "loss": 0.315179443359375,
2921
+ "memory(GiB)": 78.27,
2922
+ "step": 1335,
2923
+ "token_acc": 0.9142040038131554,
2924
+ "train_speed(iter/s)": 0.310272
2925
+ },
2926
+ {
2927
+ "epoch": 0.9018634585454086,
2928
+ "grad_norm": 2.265625,
2929
+ "learning_rate": 2.334074088536492e-06,
2930
+ "loss": 0.36266069412231444,
2931
+ "memory(GiB)": 78.27,
2932
+ "step": 1340,
2933
+ "token_acc": 0.9021089077746302,
2934
+ "train_speed(iter/s)": 0.310752
2935
+ },
2936
+ {
2937
+ "epoch": 0.9052286207041602,
2938
+ "grad_norm": 2.5625,
2939
+ "learning_rate": 2.1770373004762035e-06,
2940
+ "loss": 0.4528657913208008,
2941
+ "memory(GiB)": 78.27,
2942
+ "step": 1345,
2943
+ "token_acc": 0.8829328404189772,
2944
+ "train_speed(iter/s)": 0.31123
2945
+ },
2946
+ {
2947
+ "epoch": 0.9085937828629117,
2948
+ "grad_norm": 2.390625,
2949
+ "learning_rate": 2.0253513192751373e-06,
2950
+ "loss": 0.3175913095474243,
2951
+ "memory(GiB)": 78.27,
2952
+ "step": 1350,
2953
+ "token_acc": 0.9110584518167456,
2954
+ "train_speed(iter/s)": 0.311745
2955
+ },
2956
+ {
2957
+ "epoch": 0.9085937828629117,
2958
+ "eval_loss": 0.32911160588264465,
2959
+ "eval_runtime": 6.3063,
2960
+ "eval_samples_per_second": 38.057,
2961
+ "eval_steps_per_second": 38.057,
2962
+ "eval_token_acc": 0.9075052854122622,
2963
+ "step": 1350
2964
+ },
2965
+ {
2966
+ "epoch": 0.9119589450216632,
2967
+ "grad_norm": 2.875,
2968
+ "learning_rate": 1.879033116746476e-06,
2969
+ "loss": 0.33309078216552734,
2970
+ "memory(GiB)": 78.27,
2971
+ "step": 1355,
2972
+ "token_acc": 0.9085878012402857,
2973
+ "train_speed(iter/s)": 0.30713
2974
+ },
2975
+ {
2976
+ "epoch": 0.9153241071804148,
2977
+ "grad_norm": 1.8046875,
2978
+ "learning_rate": 1.738099064114368e-06,
2979
+ "loss": 0.27440056800842283,
2980
+ "memory(GiB)": 78.27,
2981
+ "step": 1360,
2982
+ "token_acc": 0.9196319018404908,
2983
+ "train_speed(iter/s)": 0.307612
2984
+ },
2985
+ {
2986
+ "epoch": 0.9186892693391663,
2987
+ "grad_norm": 2.1875,
2988
+ "learning_rate": 1.6025649301821876e-06,
2989
+ "loss": 0.3164578199386597,
2990
+ "memory(GiB)": 78.27,
2991
+ "step": 1365,
2992
+ "token_acc": 0.912046908315565,
2993
+ "train_speed(iter/s)": 0.308062
2994
+ },
2995
+ {
2996
+ "epoch": 0.9220544314979178,
2997
+ "grad_norm": 2.359375,
2998
+ "learning_rate": 1.4724458795681962e-06,
2999
+ "loss": 0.35011940002441405,
3000
+ "memory(GiB)": 78.27,
3001
+ "step": 1370,
3002
+ "token_acc": 0.9021322288694585,
3003
+ "train_speed(iter/s)": 0.30853
3004
+ },
3005
+ {
3006
+ "epoch": 0.9254195936566694,
3007
+ "grad_norm": 1.78125,
3008
+ "learning_rate": 1.3477564710088098e-06,
3009
+ "loss": 0.26722261905670164,
3010
+ "memory(GiB)": 78.27,
3011
+ "step": 1375,
3012
+ "token_acc": 0.929877564000636,
3013
+ "train_speed(iter/s)": 0.309022
3014
+ },
3015
+ {
3016
+ "epoch": 0.9287847558154209,
3017
+ "grad_norm": 2.09375,
3018
+ "learning_rate": 1.2285106557296477e-06,
3019
+ "loss": 0.2892286777496338,
3020
+ "memory(GiB)": 78.27,
3021
+ "step": 1380,
3022
+ "token_acc": 0.9129415442325727,
3023
+ "train_speed(iter/s)": 0.309492
3024
+ },
3025
+ {
3026
+ "epoch": 0.9321499179741723,
3027
+ "grad_norm": 6.71875,
3028
+ "learning_rate": 1.1147217758845751e-06,
3029
+ "loss": 0.32126703262329104,
3030
+ "memory(GiB)": 78.27,
3031
+ "step": 1385,
3032
+ "token_acc": 0.9072528883183568,
3033
+ "train_speed(iter/s)": 0.309961
3034
+ },
3035
+ {
3036
+ "epoch": 0.935515080132924,
3037
+ "grad_norm": 2.46875,
3038
+ "learning_rate": 1.0064025630628582e-06,
3039
+ "loss": 0.29770004749298096,
3040
+ "memory(GiB)": 78.27,
3041
+ "step": 1390,
3042
+ "token_acc": 0.9165990588998865,
3043
+ "train_speed(iter/s)": 0.310404
3044
+ },
3045
+ {
3046
+ "epoch": 0.9388802422916754,
3047
+ "grad_norm": 2.0,
3048
+ "learning_rate": 9.035651368646648e-07,
3049
+ "loss": 0.32735161781311034,
3050
+ "memory(GiB)": 78.27,
3051
+ "step": 1395,
3052
+ "token_acc": 0.910242711036483,
3053
+ "train_speed(iter/s)": 0.310858
3054
+ },
3055
+ {
3056
+ "epoch": 0.9422454044504269,
3057
+ "grad_norm": 1.7578125,
3058
+ "learning_rate": 8.062210035450379e-07,
3059
+ "loss": 0.3620461463928223,
3060
+ "memory(GiB)": 78.27,
3061
+ "step": 1400,
3062
+ "token_acc": 0.9046025104602511,
3063
+ "train_speed(iter/s)": 0.311345
3064
+ },
3065
+ {
3066
+ "epoch": 0.9422454044504269,
3067
+ "eval_loss": 0.3291800916194916,
3068
+ "eval_runtime": 6.3197,
3069
+ "eval_samples_per_second": 37.976,
3070
+ "eval_steps_per_second": 37.976,
3071
+ "eval_token_acc": 0.90776955602537,
3072
+ "step": 1400
3073
+ },
3074
+ {
3075
+ "epoch": 0.9456105666091785,
3076
+ "grad_norm": 1.7734375,
3077
+ "learning_rate": 7.143810547264762e-07,
3078
+ "loss": 0.3065107583999634,
3079
+ "memory(GiB)": 78.27,
3080
+ "step": 1405,
3081
+ "token_acc": 0.9103534858174114,
3082
+ "train_speed(iter/s)": 0.307325
3083
+ },
3084
+ {
3085
+ "epoch": 0.94897572876793,
3086
+ "grad_norm": 1.59375,
3087
+ "learning_rate": 6.280555661802856e-07,
3088
+ "loss": 0.260418701171875,
3089
+ "memory(GiB)": 78.27,
3090
+ "step": 1410,
3091
+ "token_acc": 0.9281273692191054,
3092
+ "train_speed(iter/s)": 0.307789
3093
+ },
3094
+ {
3095
+ "epoch": 0.9523408909266815,
3096
+ "grad_norm": 1.53125,
3097
+ "learning_rate": 5.472541966768551e-07,
3098
+ "loss": 0.23598823547363282,
3099
+ "memory(GiB)": 78.27,
3100
+ "step": 1415,
3101
+ "token_acc": 0.9302064991195774,
3102
+ "train_speed(iter/s)": 0.308273
3103
+ },
3104
+ {
3105
+ "epoch": 0.9557060530854331,
3106
+ "grad_norm": 1.8125,
3107
+ "learning_rate": 4.7198598690496585e-07,
3108
+ "loss": 0.2811413288116455,
3109
+ "memory(GiB)": 78.27,
3110
+ "step": 1420,
3111
+ "token_acc": 0.9181454836131095,
3112
+ "train_speed(iter/s)": 0.308702
3113
+ },
3114
+ {
3115
+ "epoch": 0.9590712152441846,
3116
+ "grad_norm": 2.28125,
3117
+ "learning_rate": 4.02259358460233e-07,
3118
+ "loss": 0.23685033321380616,
3119
+ "memory(GiB)": 78.27,
3120
+ "step": 1425,
3121
+ "token_acc": 0.9318723201524536,
3122
+ "train_speed(iter/s)": 0.30913
3123
+ },
3124
+ {
3125
+ "epoch": 0.9624363774029361,
3126
+ "grad_norm": 2.6875,
3127
+ "learning_rate": 3.380821129028489e-07,
3128
+ "loss": 0.28534321784973143,
3129
+ "memory(GiB)": 78.27,
3130
+ "step": 1430,
3131
+ "token_acc": 0.9202698558724318,
3132
+ "train_speed(iter/s)": 0.309604
3133
+ },
3134
+ {
3135
+ "epoch": 0.9658015395616877,
3136
+ "grad_norm": 1.765625,
3137
+ "learning_rate": 2.794614308846644e-07,
3138
+ "loss": 0.30334537029266356,
3139
+ "memory(GiB)": 78.27,
3140
+ "step": 1435,
3141
+ "token_acc": 0.9193307439498059,
3142
+ "train_speed(iter/s)": 0.310037
3143
+ },
3144
+ {
3145
+ "epoch": 0.9691667017204392,
3146
+ "grad_norm": 2.796875,
3147
+ "learning_rate": 2.2640387134577058e-07,
3148
+ "loss": 0.30445032119750975,
3149
+ "memory(GiB)": 78.27,
3150
+ "step": 1440,
3151
+ "token_acc": 0.9128602730490477,
3152
+ "train_speed(iter/s)": 0.310511
3153
+ },
3154
+ {
3155
+ "epoch": 0.9725318638791907,
3156
+ "grad_norm": 2.265625,
3157
+ "learning_rate": 1.789153707806357e-07,
3158
+ "loss": 0.3070082187652588,
3159
+ "memory(GiB)": 78.27,
3160
+ "step": 1445,
3161
+ "token_acc": 0.9169588779088301,
3162
+ "train_speed(iter/s)": 0.310939
3163
+ },
3164
+ {
3165
+ "epoch": 0.9758970260379422,
3166
+ "grad_norm": 2.1875,
3167
+ "learning_rate": 1.3700124257388092e-07,
3168
+ "loss": 0.2871255874633789,
3169
+ "memory(GiB)": 78.27,
3170
+ "step": 1450,
3171
+ "token_acc": 0.9249602543720191,
3172
+ "train_speed(iter/s)": 0.311389
3173
+ },
3174
+ {
3175
+ "epoch": 0.9758970260379422,
3176
+ "eval_loss": 0.32921192049980164,
3177
+ "eval_runtime": 6.3471,
3178
+ "eval_samples_per_second": 37.813,
3179
+ "eval_steps_per_second": 37.813,
3180
+ "eval_token_acc": 0.9072938689217759,
3181
+ "step": 1450
3182
+ },
3183
+ {
3184
+ "epoch": 0.9792621881966938,
3185
+ "grad_norm": 2.1875,
3186
+ "learning_rate": 1.0066617640578368e-07,
3187
+ "loss": 0.32105896472930906,
3188
+ "memory(GiB)": 78.27,
3189
+ "step": 1455,
3190
+ "token_acc": 0.9072366364488903,
3191
+ "train_speed(iter/s)": 0.307431
3192
+ },
3193
+ {
3194
+ "epoch": 0.9826273503554452,
3195
+ "grad_norm": 2.625,
3196
+ "learning_rate": 6.991423772753636e-08,
3197
+ "loss": 0.3220836639404297,
3198
+ "memory(GiB)": 78.27,
3199
+ "step": 1460,
3200
+ "token_acc": 0.9141094834232845,
3201
+ "train_speed(iter/s)": 0.307867
3202
+ },
3203
+ {
3204
+ "epoch": 0.9859925125141967,
3205
+ "grad_norm": 2.609375,
3206
+ "learning_rate": 4.474886730641004e-08,
3207
+ "loss": 0.330595874786377,
3208
+ "memory(GiB)": 78.27,
3209
+ "step": 1465,
3210
+ "token_acc": 0.9087763447625039,
3211
+ "train_speed(iter/s)": 0.308304
3212
+ },
3213
+ {
3214
+ "epoch": 0.9893576746729483,
3215
+ "grad_norm": 1.8515625,
3216
+ "learning_rate": 2.5172880840745873e-08,
3217
+ "loss": 0.3304997444152832,
3218
+ "memory(GiB)": 78.27,
3219
+ "step": 1470,
3220
+ "token_acc": 0.9131480090157776,
3221
+ "train_speed(iter/s)": 0.308715
3222
+ },
3223
+ {
3224
+ "epoch": 0.9927228368316998,
3225
+ "grad_norm": 2.5625,
3226
+ "learning_rate": 1.1188468644907079e-08,
3227
+ "loss": 0.29615800380706786,
3228
+ "memory(GiB)": 78.27,
3229
+ "step": 1475,
3230
+ "token_acc": 0.919311727363849,
3231
+ "train_speed(iter/s)": 0.309186
3232
+ },
3233
+ {
3234
+ "epoch": 0.9960879989904513,
3235
+ "grad_norm": 2.515625,
3236
+ "learning_rate": 2.797195404247166e-09,
3237
+ "loss": 0.37999179363250735,
3238
+ "memory(GiB)": 78.27,
3239
+ "step": 1480,
3240
+ "token_acc": 0.8994573890839451,
3241
+ "train_speed(iter/s)": 0.309628
3242
+ },
3243
+ {
3244
+ "epoch": 0.9994531611492029,
3245
+ "grad_norm": 2.453125,
3246
+ "learning_rate": 0.0,
3247
+ "loss": 0.29352550506591796,
3248
+ "memory(GiB)": 78.27,
3249
+ "step": 1485,
3250
+ "token_acc": 0.9119153858866303,
3251
+ "train_speed(iter/s)": 0.310084
3252
+ },
3253
+ {
3254
+ "epoch": 0.9994531611492029,
3255
+ "eval_loss": 0.3291037976741791,
3256
+ "eval_runtime": 6.3842,
3257
+ "eval_samples_per_second": 37.593,
3258
+ "eval_steps_per_second": 37.593,
3259
+ "eval_token_acc": 0.9075581395348837,
3260
+ "step": 1485
3261
+ }
3262
+ ],
3263
+ "logging_steps": 5,
3264
+ "max_steps": 1485,
3265
+ "num_input_tokens_seen": 0,
3266
+ "num_train_epochs": 1,
3267
+ "save_steps": 50,
3268
+ "stateful_callbacks": {
3269
+ "TrainerControl": {
3270
+ "args": {
3271
+ "should_epoch_stop": false,
3272
+ "should_evaluate": false,
3273
+ "should_log": false,
3274
+ "should_save": true,
3275
+ "should_training_stop": true
3276
+ },
3277
+ "attributes": {}
3278
+ }
3279
+ },
3280
+ "total_flos": 6.771720449253366e+17,
3281
+ "train_batch_size": 1,
3282
+ "trial_name": null,
3283
+ "trial_params": null
3284
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3d63b6a45496c3ac22140d27c3bb9577ef2919dd9f68b4d741432bb1dbe1287
3
+ size 6289
vocab.json ADDED
The diff for this file is too large to render. See raw diff