Training in progress, step 10
Browse files- adapter_config.json +7 -7
- adapter_model.safetensors +1 -1
- args.json +30 -31
- logging.jsonl +21 -169
- special_tokens_map.json +7 -1
- training_args.bin +2 -2
adapter_config.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"alpha_pattern": {},
|
| 3 |
"auto_mapping": null,
|
| 4 |
-
"base_model_name_or_path": "/root/
|
| 5 |
"bias": "none",
|
| 6 |
"eva_config": null,
|
| 7 |
"exclude_modules": null,
|
|
@@ -14,7 +14,7 @@
|
|
| 14 |
"loftq_config": {},
|
| 15 |
"lora_alpha": 256,
|
| 16 |
"lora_bias": false,
|
| 17 |
-
"lora_dropout": 0.
|
| 18 |
"megatron_config": null,
|
| 19 |
"megatron_core": "megatron.core",
|
| 20 |
"modules_to_save": [],
|
|
@@ -23,13 +23,13 @@
|
|
| 23 |
"rank_pattern": {},
|
| 24 |
"revision": null,
|
| 25 |
"target_modules": [
|
| 26 |
-
"v_proj",
|
| 27 |
-
"q_proj",
|
| 28 |
"k_proj",
|
| 29 |
-
"
|
| 30 |
-
"up_proj",
|
| 31 |
"o_proj",
|
| 32 |
-
"
|
|
|
|
|
|
|
|
|
|
| 33 |
],
|
| 34 |
"task_type": "CAUSAL_LM",
|
| 35 |
"use_dora": false,
|
|
|
|
| 1 |
{
|
| 2 |
"alpha_pattern": {},
|
| 3 |
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
|
| 5 |
"bias": "none",
|
| 6 |
"eva_config": null,
|
| 7 |
"exclude_modules": null,
|
|
|
|
| 14 |
"loftq_config": {},
|
| 15 |
"lora_alpha": 256,
|
| 16 |
"lora_bias": false,
|
| 17 |
+
"lora_dropout": 0.1,
|
| 18 |
"megatron_config": null,
|
| 19 |
"megatron_core": "megatron.core",
|
| 20 |
"modules_to_save": [],
|
|
|
|
| 23 |
"rank_pattern": {},
|
| 24 |
"revision": null,
|
| 25 |
"target_modules": [
|
|
|
|
|
|
|
| 26 |
"k_proj",
|
| 27 |
+
"down_proj",
|
|
|
|
| 28 |
"o_proj",
|
| 29 |
+
"v_proj",
|
| 30 |
+
"gate_proj",
|
| 31 |
+
"q_proj",
|
| 32 |
+
"up_proj"
|
| 33 |
],
|
| 34 |
"task_type": "CAUSAL_LM",
|
| 35 |
"use_dora": false,
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 13254157312
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6584293598035a8eddf8812fb57e22a49169777cd99a0c1549462db262f1b8eb
|
| 3 |
size 13254157312
|
args.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"model": "/root/
|
| 3 |
"model_type": "llama3_2",
|
| 4 |
"model_revision": null,
|
| 5 |
"task_type": "causal_lm",
|
|
@@ -11,29 +11,25 @@
|
|
| 11 |
"local_repo_path": null,
|
| 12 |
"template": "llama3_2",
|
| 13 |
"system": "",
|
| 14 |
-
"max_length":
|
| 15 |
"truncation_strategy": "left",
|
| 16 |
"max_pixels": null,
|
| 17 |
"tools_prompt": "react_en",
|
| 18 |
-
"norm_bbox": null,
|
| 19 |
-
"padding_side": "right",
|
| 20 |
"loss_scale": "default",
|
| 21 |
"sequence_parallel_size": 1,
|
| 22 |
"use_chat_template": true,
|
| 23 |
"template_backend": "swift",
|
| 24 |
"dataset": [
|
| 25 |
-
"
|
| 26 |
],
|
| 27 |
"val_dataset": [],
|
| 28 |
"split_dataset_ratio": 0.03,
|
| 29 |
"data_seed": 42,
|
| 30 |
"dataset_num_proc": 1,
|
| 31 |
"streaming": false,
|
| 32 |
-
"
|
| 33 |
"download_mode": "reuse_dataset_if_exists",
|
| 34 |
-
"columns": {},
|
| 35 |
"strict": false,
|
| 36 |
-
"remove_unused_columns": true,
|
| 37 |
"model_name": [
|
| 38 |
null,
|
| 39 |
null
|
|
@@ -59,8 +55,7 @@
|
|
| 59 |
"stream": false,
|
| 60 |
"stop_words": [],
|
| 61 |
"logprobs": false,
|
| 62 |
-
"
|
| 63 |
-
"ckpt_dir": null,
|
| 64 |
"load_dataset_config": null,
|
| 65 |
"lora_modules": [],
|
| 66 |
"tuner_backend": "peft",
|
|
@@ -68,14 +63,14 @@
|
|
| 68 |
"adapters": [],
|
| 69 |
"seed": 42,
|
| 70 |
"model_kwargs": {},
|
| 71 |
-
"load_args":
|
| 72 |
"load_data_args": false,
|
| 73 |
"use_hf": true,
|
| 74 |
"hub_token": null,
|
| 75 |
"custom_register_path": [],
|
| 76 |
"ignore_args_error": false,
|
| 77 |
"use_swift_lora": false,
|
| 78 |
-
"output_dir": "/root/dataDisk/output/
|
| 79 |
"overwrite_output_dir": false,
|
| 80 |
"do_train": false,
|
| 81 |
"do_eval": false,
|
|
@@ -86,7 +81,7 @@
|
|
| 86 |
"per_device_eval_batch_size": 1,
|
| 87 |
"per_gpu_train_batch_size": null,
|
| 88 |
"per_gpu_eval_batch_size": null,
|
| 89 |
-
"gradient_accumulation_steps":
|
| 90 |
"eval_accumulation_steps": null,
|
| 91 |
"eval_delay": 0,
|
| 92 |
"torch_empty_cache_steps": null,
|
|
@@ -105,7 +100,7 @@
|
|
| 105 |
"log_level": "passive",
|
| 106 |
"log_level_replica": "warning",
|
| 107 |
"log_on_each_node": true,
|
| 108 |
-
"logging_dir": "/root/dataDisk/output/
|
| 109 |
"logging_strategy": "steps",
|
| 110 |
"logging_first_step": true,
|
| 111 |
"logging_steps": 1,
|
|
@@ -141,6 +136,7 @@
|
|
| 141 |
"past_index": -1,
|
| 142 |
"run_name": null,
|
| 143 |
"disable_tqdm": null,
|
|
|
|
| 144 |
"label_names": null,
|
| 145 |
"load_best_model_at_end": false,
|
| 146 |
"metric_for_best_model": "loss",
|
|
@@ -168,7 +164,7 @@
|
|
| 168 |
"zero_optimization": {
|
| 169 |
"stage": 3,
|
| 170 |
"offload_optimizer": {
|
| 171 |
-
"device": "
|
| 172 |
"pin_memory": true
|
| 173 |
},
|
| 174 |
"offload_param": {
|
|
@@ -264,7 +260,7 @@
|
|
| 264 |
"modules_to_save": [],
|
| 265 |
"lora_rank": 512,
|
| 266 |
"lora_alpha": 256,
|
| 267 |
-
"lora_dropout": 0.
|
| 268 |
"lora_bias": "none",
|
| 269 |
"lora_dtype": null,
|
| 270 |
"lorap_lr_ratio": null,
|
|
@@ -326,29 +322,32 @@
|
|
| 326 |
"metric_warmup_step": 0,
|
| 327 |
"fsdp_num": 1,
|
| 328 |
"acc_steps": 1,
|
| 329 |
-
"swanlab_token": null,
|
| 330 |
-
"swanlab_project": null,
|
| 331 |
-
"swanlab_workspace": null,
|
| 332 |
-
"swanlab_exp_name": null,
|
| 333 |
-
"swanlab_mode": "cloud",
|
| 334 |
"add_version": true,
|
| 335 |
"resume_only_model": false,
|
| 336 |
"check_model": true,
|
| 337 |
-
"
|
| 338 |
"packing": false,
|
| 339 |
"lazy_tokenize": false,
|
| 340 |
-
"
|
| 341 |
-
"loss_type": null,
|
| 342 |
"optimizer": null,
|
| 343 |
"metric": null,
|
| 344 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
"rank": 0,
|
| 346 |
-
"global_world_size": 8,
|
| 347 |
"local_world_size": 8,
|
| 348 |
-
"model_suffix": "
|
| 349 |
-
"model_info": "ModelInfo(model_type='llama3_2', model_dir='/root/
|
| 350 |
-
"model_meta": "ModelMeta(model_type='llama3_2', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.2-1B', hf_model_id='meta-llama/Llama-3.2-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B', hf_model_id='meta-llama/Llama-3.2-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-1B-Instruct', hf_model_id='meta-llama/Llama-3.2-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B-Instruct', hf_model_id='meta-llama/Llama-3.2-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.3-70B-Instruct', hf_model_id='meta-llama/Llama-3.3-70B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', hf_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='llama3_2', get_function=<function get_model_tokenizer_with_flash_attn at
|
| 351 |
-
"model_dir": "/root/
|
| 352 |
"hub": "<class 'swift.hub.hub.HFHub'>",
|
| 353 |
-
"training_args": "
|
| 354 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"model": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
|
| 3 |
"model_type": "llama3_2",
|
| 4 |
"model_revision": null,
|
| 5 |
"task_type": "causal_lm",
|
|
|
|
| 11 |
"local_repo_path": null,
|
| 12 |
"template": "llama3_2",
|
| 13 |
"system": "",
|
| 14 |
+
"max_length": 14000,
|
| 15 |
"truncation_strategy": "left",
|
| 16 |
"max_pixels": null,
|
| 17 |
"tools_prompt": "react_en",
|
|
|
|
|
|
|
| 18 |
"loss_scale": "default",
|
| 19 |
"sequence_parallel_size": 1,
|
| 20 |
"use_chat_template": true,
|
| 21 |
"template_backend": "swift",
|
| 22 |
"dataset": [
|
| 23 |
+
"dpo_data.jsonl"
|
| 24 |
],
|
| 25 |
"val_dataset": [],
|
| 26 |
"split_dataset_ratio": 0.03,
|
| 27 |
"data_seed": 42,
|
| 28 |
"dataset_num_proc": 1,
|
| 29 |
"streaming": false,
|
| 30 |
+
"load_from_cache_file": false,
|
| 31 |
"download_mode": "reuse_dataset_if_exists",
|
|
|
|
| 32 |
"strict": false,
|
|
|
|
| 33 |
"model_name": [
|
| 34 |
null,
|
| 35 |
null
|
|
|
|
| 55 |
"stream": false,
|
| 56 |
"stop_words": [],
|
| 57 |
"logprobs": false,
|
| 58 |
+
"ckpt_dir": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
|
|
|
|
| 59 |
"load_dataset_config": null,
|
| 60 |
"lora_modules": [],
|
| 61 |
"tuner_backend": "peft",
|
|
|
|
| 63 |
"adapters": [],
|
| 64 |
"seed": 42,
|
| 65 |
"model_kwargs": {},
|
| 66 |
+
"load_args": true,
|
| 67 |
"load_data_args": false,
|
| 68 |
"use_hf": true,
|
| 69 |
"hub_token": null,
|
| 70 |
"custom_register_path": [],
|
| 71 |
"ignore_args_error": false,
|
| 72 |
"use_swift_lora": false,
|
| 73 |
+
"output_dir": "/root/dataDisk/output/v68-20250313-073537",
|
| 74 |
"overwrite_output_dir": false,
|
| 75 |
"do_train": false,
|
| 76 |
"do_eval": false,
|
|
|
|
| 81 |
"per_device_eval_batch_size": 1,
|
| 82 |
"per_gpu_train_batch_size": null,
|
| 83 |
"per_gpu_eval_batch_size": null,
|
| 84 |
+
"gradient_accumulation_steps": 1,
|
| 85 |
"eval_accumulation_steps": null,
|
| 86 |
"eval_delay": 0,
|
| 87 |
"torch_empty_cache_steps": null,
|
|
|
|
| 100 |
"log_level": "passive",
|
| 101 |
"log_level_replica": "warning",
|
| 102 |
"log_on_each_node": true,
|
| 103 |
+
"logging_dir": "/root/dataDisk/output/v68-20250313-073537/runs",
|
| 104 |
"logging_strategy": "steps",
|
| 105 |
"logging_first_step": true,
|
| 106 |
"logging_steps": 1,
|
|
|
|
| 136 |
"past_index": -1,
|
| 137 |
"run_name": null,
|
| 138 |
"disable_tqdm": null,
|
| 139 |
+
"remove_unused_columns": false,
|
| 140 |
"label_names": null,
|
| 141 |
"load_best_model_at_end": false,
|
| 142 |
"metric_for_best_model": "loss",
|
|
|
|
| 164 |
"zero_optimization": {
|
| 165 |
"stage": 3,
|
| 166 |
"offload_optimizer": {
|
| 167 |
+
"device": "none",
|
| 168 |
"pin_memory": true
|
| 169 |
},
|
| 170 |
"offload_param": {
|
|
|
|
| 260 |
"modules_to_save": [],
|
| 261 |
"lora_rank": 512,
|
| 262 |
"lora_alpha": 256,
|
| 263 |
+
"lora_dropout": 0.1,
|
| 264 |
"lora_bias": "none",
|
| 265 |
"lora_dtype": null,
|
| 266 |
"lorap_lr_ratio": null,
|
|
|
|
| 322 |
"metric_warmup_step": 0,
|
| 323 |
"fsdp_num": 1,
|
| 324 |
"acc_steps": 1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
"add_version": true,
|
| 326 |
"resume_only_model": false,
|
| 327 |
"check_model": true,
|
| 328 |
+
"loss_type": null,
|
| 329 |
"packing": false,
|
| 330 |
"lazy_tokenize": false,
|
| 331 |
+
"acc_strategy": "token",
|
|
|
|
| 332 |
"optimizer": null,
|
| 333 |
"metric": null,
|
| 334 |
+
"rlhf_type": "orpo",
|
| 335 |
+
"ref_model": null,
|
| 336 |
+
"ref_model_type": null,
|
| 337 |
+
"ref_model_revision": null,
|
| 338 |
+
"beta": 0.1,
|
| 339 |
+
"label_smoothing": 0,
|
| 340 |
+
"rpo_alpha": 1.0,
|
| 341 |
+
"cpo_alpha": 1.0,
|
| 342 |
+
"simpo_gamma": 1,
|
| 343 |
+
"desirable_weight": 1.0,
|
| 344 |
+
"undesirable_weight": 1.0,
|
| 345 |
"rank": 0,
|
|
|
|
| 346 |
"local_world_size": 8,
|
| 347 |
+
"model_suffix": "checkpoint-160-merged",
|
| 348 |
+
"model_info": "ModelInfo(model_type='llama3_2', model_dir='/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type=None)",
|
| 349 |
+
"model_meta": "ModelMeta(model_type='llama3_2', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.2-1B', hf_model_id='meta-llama/Llama-3.2-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B', hf_model_id='meta-llama/Llama-3.2-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-1B-Instruct', hf_model_id='meta-llama/Llama-3.2-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B-Instruct', hf_model_id='meta-llama/Llama-3.2-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.3-70B-Instruct', hf_model_id='meta-llama/Llama-3.3-70B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', hf_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='llama3_2', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f03a47424d0>, model_arch='llama', architectures=['LlamaForCausalLM'], is_multimodal=False, additional_saved_files=[], torch_dtype=None, ignore_patterns=[], requires=['transformers>=4.45'], tags=[])",
|
| 350 |
+
"model_dir": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
|
| 351 |
"hub": "<class 'swift.hub.hub.HFHub'>",
|
| 352 |
+
"training_args": "ORPOConfig(output_dir='/root/dataDisk/output/v68-20250313-073537', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/root/dataDisk/output/v68-20250313-073537/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=10, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=10, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/root/dataDisk/output/v68-20250313-073537', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id='TheAgenticAI/LLAMA-3.3-70B-Reasoning', hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=True, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': True}, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, max_length=14000, max_prompt_length=None, max_completion_length=None, beta=0.1, disable_dropout=True, label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', generate_during_eval=False, is_encoder_decoder=False, model_init_kwargs=None, dataset_num_proc=1, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora')"
|
| 353 |
}
|
logging.jsonl
CHANGED
|
@@ -1,169 +1,21 @@
|
|
| 1 |
-
{"loss":
|
| 2 |
-
{"loss":
|
| 3 |
-
{"loss":
|
| 4 |
-
{"loss": 0.
|
| 5 |
-
{"loss": 0.
|
| 6 |
-
{"loss": 0.
|
| 7 |
-
{"loss": 0.
|
| 8 |
-
{"loss": 0.
|
| 9 |
-
{"loss": 0.
|
| 10 |
-
{"loss": 0.
|
| 11 |
-
{"eval_loss": 0.
|
| 12 |
-
{"loss": 0.
|
| 13 |
-
{"loss": 0.
|
| 14 |
-
{"loss": 0.
|
| 15 |
-
{"loss": 0.
|
| 16 |
-
{"loss": 0.
|
| 17 |
-
{"loss": 0.
|
| 18 |
-
{"loss": 0.
|
| 19 |
-
{"loss": 0.
|
| 20 |
-
{"loss": 0.
|
| 21 |
-
{"loss": 0.
|
| 22 |
-
{"eval_loss": 0.6061545, "eval_token_acc": 0.80982449, "eval_runtime": 233.1289, "eval_samples_per_second": 1.982, "eval_steps_per_second": 0.249, "epoch": 0.08560728, "global_step/max_steps": "20/233", "percentage": "8.58%", "elapsed_time": "45m 10s", "remaining_time": "8h 1m 8s"}
|
| 23 |
-
{"loss": 0.64495265, "token_acc": 0.81662134, "grad_norm": 0.11575121, "learning_rate": 8.75e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007161, "epoch": 0.08988764, "global_step/max_steps": "21/233", "percentage": "9.01%", "elapsed_time": "48m 26s", "remaining_time": "8h 9m 4s"}
|
| 24 |
-
{"loss": 0.56515968, "token_acc": 0.80879168, "grad_norm": 0.14889614, "learning_rate": 9.17e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007331, "epoch": 0.094168, "global_step/max_steps": "22/233", "percentage": "9.44%", "elapsed_time": "49m 35s", "remaining_time": "7h 55m 34s"}
|
| 25 |
-
{"loss": 0.61245996, "token_acc": 0.81094324, "grad_norm": 0.50602204, "learning_rate": 9.58e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007474, "epoch": 0.09844837, "global_step/max_steps": "23/233", "percentage": "9.87%", "elapsed_time": "50m 51s", "remaining_time": "7h 44m 18s"}
|
| 26 |
-
{"loss": 0.62092638, "token_acc": 0.82057519, "grad_norm": 0.1378572, "learning_rate": 1e-05, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007544, "epoch": 0.10272873, "global_step/max_steps": "24/233", "percentage": "10.30%", "elapsed_time": "52m 35s", "remaining_time": "7h 37m 58s"}
|
| 27 |
-
{"loss": 0.5545224, "token_acc": 0.84725024, "grad_norm": 0.13007531, "learning_rate": 9.95e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007633, "epoch": 0.1070091, "global_step/max_steps": "25/233", "percentage": "10.73%", "elapsed_time": "54m 9s", "remaining_time": "7h 30m 33s"}
|
| 28 |
-
{"loss": 0.57198393, "token_acc": 0.84888653, "grad_norm": 0.42071208, "learning_rate": 9.9e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007755, "epoch": 0.11128946, "global_step/max_steps": "26/233", "percentage": "11.16%", "elapsed_time": "55m 26s", "remaining_time": "7h 21m 25s"}
|
| 29 |
-
{"loss": 0.59884644, "token_acc": 0.81060166, "grad_norm": 0.1219664, "learning_rate": 9.86e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007849, "epoch": 0.11556982, "global_step/max_steps": "27/233", "percentage": "11.59%", "elapsed_time": "56m 53s", "remaining_time": "7h 14m 7s"}
|
| 30 |
-
{"loss": 0.60813272, "token_acc": 0.8097131, "grad_norm": 0.14412673, "learning_rate": 9.81e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007949, "epoch": 0.11985019, "global_step/max_steps": "28/233", "percentage": "12.02%", "elapsed_time": "58m 16s", "remaining_time": "7h 6m 38s"}
|
| 31 |
-
{"loss": 0.58964062, "token_acc": 0.82500748, "grad_norm": 0.18986589, "learning_rate": 9.76e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007997, "epoch": 0.12413055, "global_step/max_steps": "29/233", "percentage": "12.45%", "elapsed_time": "1h 0m 0s", "remaining_time": "7h 2m 7s"}
|
| 32 |
-
{"loss": 0.59038109, "token_acc": 0.82640086, "grad_norm": 0.14620499, "learning_rate": 9.71e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00813, "epoch": 0.12841091, "global_step/max_steps": "30/233", "percentage": "12.88%", "elapsed_time": "1h 1m 4s", "remaining_time": "6h 53m 14s"}
|
| 33 |
-
{"eval_loss": 0.57404786, "eval_token_acc": 0.81633597, "eval_runtime": 233.9161, "eval_samples_per_second": 1.975, "eval_steps_per_second": 0.248, "epoch": 0.12841091, "global_step/max_steps": "30/233", "percentage": "12.88%", "elapsed_time": "1h 4m 58s", "remaining_time": "7h 19m 37s"}
|
| 34 |
-
{"loss": 0.61220074, "token_acc": 0.82002309, "grad_norm": 0.12467663, "learning_rate": 9.67e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007603, "epoch": 0.13269128, "global_step/max_steps": "31/233", "percentage": "13.30%", "elapsed_time": "1h 7m 31s", "remaining_time": "7h 20m 0s"}
|
| 35 |
-
{"loss": 0.57690763, "token_acc": 0.82229942, "grad_norm": 0.12858853, "learning_rate": 9.62e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007548, "epoch": 0.13697164, "global_step/max_steps": "32/233", "percentage": "13.73%", "elapsed_time": "1h 10m 13s", "remaining_time": "7h 21m 8s"}
|
| 36 |
-
{"loss": 0.57390118, "token_acc": 0.82656848, "grad_norm": 0.12490374, "learning_rate": 9.57e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007576, "epoch": 0.14125201, "global_step/max_steps": "33/233", "percentage": "14.16%", "elapsed_time": "1h 12m 9s", "remaining_time": "7h 17m 21s"}
|
| 37 |
-
{"loss": 0.54597116, "token_acc": 0.81826962, "grad_norm": 0.14487278, "learning_rate": 9.52e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00766, "epoch": 0.14553237, "global_step/max_steps": "34/233", "percentage": "14.59%", "elapsed_time": "1h 13m 32s", "remaining_time": "7h 10m 26s"}
|
| 38 |
-
{"loss": 0.60774434, "token_acc": 0.83905093, "grad_norm": 0.14599162, "learning_rate": 9.47e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007699, "epoch": 0.14981273, "global_step/max_steps": "35/233", "percentage": "15.02%", "elapsed_time": "1h 15m 20s", "remaining_time": "7h 6m 12s"}
|
| 39 |
-
{"loss": 0.58154261, "token_acc": 0.8071638, "grad_norm": 0.15756345, "learning_rate": 9.43e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.0076, "epoch": 0.1540931, "global_step/max_steps": "36/233", "percentage": "15.45%", "elapsed_time": "1h 18m 31s", "remaining_time": "7h 9m 40s"}
|
| 40 |
-
{"loss": 0.54503143, "token_acc": 0.82044124, "grad_norm": 0.12564433, "learning_rate": 9.38e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00764, "epoch": 0.15837346, "global_step/max_steps": "37/233", "percentage": "15.88%", "elapsed_time": "1h 20m 16s", "remaining_time": "7h 5m 16s"}
|
| 41 |
-
{"loss": 0.54243863, "token_acc": 0.8351083, "grad_norm": 0.13263634, "learning_rate": 9.33e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007717, "epoch": 0.16265383, "global_step/max_steps": "38/233", "percentage": "16.31%", "elapsed_time": "1h 21m 38s", "remaining_time": "6h 58m 54s"}
|
| 42 |
-
{"loss": 0.51341844, "token_acc": 0.84136878, "grad_norm": 0.15488632, "learning_rate": 9.28e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.0078, "epoch": 0.16693419, "global_step/max_steps": "39/233", "percentage": "16.74%", "elapsed_time": "1h 22m 53s", "remaining_time": "6h 52m 22s"}
|
| 43 |
-
{"loss": 0.5711, "token_acc": 0.81400517, "grad_norm": 0.14193577, "learning_rate": 9.23e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007837, "epoch": 0.17121455, "global_step/max_steps": "40/233", "percentage": "17.17%", "elapsed_time": "1h 24m 37s", "remaining_time": "6h 48m 20s"}
|
| 44 |
-
{"eval_loss": 0.55634356, "eval_token_acc": 0.82101112, "eval_runtime": 233.8969, "eval_samples_per_second": 1.975, "eval_steps_per_second": 0.248, "epoch": 0.17121455, "global_step/max_steps": "40/233", "percentage": "17.17%", "elapsed_time": "1h 28m 31s", "remaining_time": "7h 7m 9s"}
|
| 45 |
-
{"loss": 0.55852044, "token_acc": 0.82246999, "grad_norm": 0.13955821, "learning_rate": 9.19e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007417, "epoch": 0.17549492, "global_step/max_steps": "41/233", "percentage": "17.60%", "elapsed_time": "1h 31m 42s", "remaining_time": "7h 9m 26s"}
|
| 46 |
-
{"loss": 0.5492382, "token_acc": 0.81834246, "grad_norm": 0.18386385, "learning_rate": 9.14e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00745, "epoch": 0.17977528, "global_step/max_steps": "42/233", "percentage": "18.03%", "elapsed_time": "1h 33m 31s", "remaining_time": "7h 5m 20s"}
|
| 47 |
-
{"loss": 0.53807271, "token_acc": 0.83674288, "grad_norm": 0.17359699, "learning_rate": 9.09e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007485, "epoch": 0.18405564, "global_step/max_steps": "43/233", "percentage": "18.45%", "elapsed_time": "1h 35m 18s", "remaining_time": "7h 1m 9s"}
|
| 48 |
-
{"loss": 0.54787058, "token_acc": 0.83625366, "grad_norm": 0.10988069, "learning_rate": 9.04e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007475, "epoch": 0.18833601, "global_step/max_steps": "44/233", "percentage": "18.88%", "elapsed_time": "1h 37m 40s", "remaining_time": "6h 59m 33s"}
|
| 49 |
-
{"loss": 0.5589273, "token_acc": 0.84398263, "grad_norm": 0.11602305, "learning_rate": 9e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007511, "epoch": 0.19261637, "global_step/max_steps": "45/233", "percentage": "19.31%", "elapsed_time": "1h 39m 24s", "remaining_time": "6h 55m 20s"}
|
| 50 |
-
{"loss": 0.53926349, "token_acc": 0.83050162, "grad_norm": 0.12073734, "learning_rate": 8.95e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007563, "epoch": 0.19689674, "global_step/max_steps": "46/233", "percentage": "19.74%", "elapsed_time": "1h 40m 56s", "remaining_time": "6h 50m 21s"}
|
| 51 |
-
{"loss": 0.5202747, "token_acc": 0.83425064, "grad_norm": 0.11006246, "learning_rate": 8.9e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007615, "epoch": 0.2011771, "global_step/max_steps": "47/233", "percentage": "20.17%", "elapsed_time": "1h 42m 26s", "remaining_time": "6h 45m 24s"}
|
| 52 |
-
{"loss": 0.54124808, "token_acc": 0.85483645, "grad_norm": 0.11072037, "learning_rate": 8.85e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007602, "epoch": 0.20545746, "global_step/max_steps": "48/233", "percentage": "20.60%", "elapsed_time": "1h 44m 48s", "remaining_time": "6h 43m 56s"}
|
| 53 |
-
{"loss": 0.52816677, "token_acc": 0.82739567, "grad_norm": 0.11390109, "learning_rate": 8.8e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007721, "epoch": 0.20973783, "global_step/max_steps": "49/233", "percentage": "21.03%", "elapsed_time": "1h 45m 20s", "remaining_time": "6h 35m 34s"}
|
| 54 |
-
{"loss": 0.50078875, "token_acc": 0.8451592, "grad_norm": 0.10181949, "learning_rate": 8.76e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00775, "epoch": 0.21401819, "global_step/max_steps": "50/233", "percentage": "21.46%", "elapsed_time": "1h 47m 5s", "remaining_time": "6h 31m 57s"}
|
| 55 |
-
{"eval_loss": 0.54290175, "eval_token_acc": 0.82446886, "eval_runtime": 233.5934, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.21401819, "global_step/max_steps": "50/233", "percentage": "21.46%", "elapsed_time": "1h 50m 59s", "remaining_time": "6h 46m 12s"}
|
| 56 |
-
{"loss": 0.53490806, "token_acc": 0.82852007, "grad_norm": 0.26775351, "learning_rate": 8.71e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00748, "epoch": 0.21829856, "global_step/max_steps": "51/233", "percentage": "21.89%", "elapsed_time": "1h 53m 11s", "remaining_time": "6h 43m 57s"}
|
| 57 |
-
{"loss": 0.54204899, "token_acc": 0.83737674, "grad_norm": 0.18461479, "learning_rate": 8.66e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00755, "epoch": 0.22257892, "global_step/max_steps": "52/233", "percentage": "22.32%", "elapsed_time": "1h 54m 21s", "remaining_time": "6h 38m 4s"}
|
| 58 |
-
{"loss": 0.57908702, "token_acc": 0.84690497, "grad_norm": 0.11942858, "learning_rate": 8.61e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007558, "epoch": 0.22685928, "global_step/max_steps": "53/233", "percentage": "22.75%", "elapsed_time": "1h 56m 26s", "remaining_time": "6h 35m 27s"}
|
| 59 |
-
{"loss": 0.52622569, "token_acc": 0.82521804, "grad_norm": 0.13052414, "learning_rate": 8.56e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007578, "epoch": 0.23113965, "global_step/max_steps": "54/233", "percentage": "23.18%", "elapsed_time": "1h 58m 20s", "remaining_time": "6h 32m 16s"}
|
| 60 |
-
{"loss": 0.56600893, "token_acc": 0.83141038, "grad_norm": 0.11560788, "learning_rate": 8.52e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007591, "epoch": 0.23542001, "global_step/max_steps": "55/233", "percentage": "23.61%", "elapsed_time": "2h 0m 19s", "remaining_time": "6h 29m 23s"}
|
| 61 |
-
{"loss": 0.51500416, "token_acc": 0.82068742, "grad_norm": 0.17905267, "learning_rate": 8.47e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007629, "epoch": 0.23970037, "global_step/max_steps": "56/233", "percentage": "24.03%", "elapsed_time": "2h 1m 53s", "remaining_time": "6h 25m 17s"}
|
| 62 |
-
{"loss": 0.52793306, "token_acc": 0.86082048, "grad_norm": 0.12558642, "learning_rate": 8.42e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007673, "epoch": 0.24398074, "global_step/max_steps": "57/233", "percentage": "24.46%", "elapsed_time": "2h 3m 23s", "remaining_time": "6h 20m 58s"}
|
| 63 |
-
{"loss": 0.51401901, "token_acc": 0.82928567, "grad_norm": 0.1479127, "learning_rate": 8.37e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007701, "epoch": 0.2482611, "global_step/max_steps": "58/233", "percentage": "24.89%", "elapsed_time": "2h 5m 5s", "remaining_time": "6h 17m 27s"}
|
| 64 |
-
{"loss": 0.54972792, "token_acc": 0.82862575, "grad_norm": 0.13245791, "learning_rate": 8.33e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007743, "epoch": 0.25254147, "global_step/max_steps": "59/233", "percentage": "25.32%", "elapsed_time": "2h 6m 34s", "remaining_time": "6h 13m 15s"}
|
| 65 |
-
{"loss": 0.50546074, "token_acc": 0.84307301, "grad_norm": 0.12975469, "learning_rate": 8.28e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007704, "epoch": 0.25682183, "global_step/max_steps": "60/233", "percentage": "25.75%", "elapsed_time": "2h 9m 22s", "remaining_time": "6h 13m 0s"}
|
| 66 |
-
{"eval_loss": 0.5343886, "eval_token_acc": 0.82678127, "eval_runtime": 233.6975, "eval_samples_per_second": 1.977, "eval_steps_per_second": 0.248, "epoch": 0.25682183, "global_step/max_steps": "60/233", "percentage": "25.75%", "elapsed_time": "2h 13m 15s", "remaining_time": "6h 24m 14s"}
|
| 67 |
-
{"loss": 0.534343, "token_acc": 0.82905063, "grad_norm": 0.13970451, "learning_rate": 8.23e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007459, "epoch": 0.26110219, "global_step/max_steps": "61/233", "percentage": "26.18%", "elapsed_time": "2h 15m 52s", "remaining_time": "6h 23m 6s"}
|
| 68 |
-
{"loss": 0.51031673, "token_acc": 0.83080435, "grad_norm": 0.11731356, "learning_rate": 8.18e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007501, "epoch": 0.26538256, "global_step/max_steps": "62/233", "percentage": "26.61%", "elapsed_time": "2h 17m 19s", "remaining_time": "6h 18m 44s"}
|
| 69 |
-
{"loss": 0.56087667, "token_acc": 0.85323266, "grad_norm": 0.12921853, "learning_rate": 8.13e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007493, "epoch": 0.26966292, "global_step/max_steps": "63/233", "percentage": "27.04%", "elapsed_time": "2h 19m 41s", "remaining_time": "6h 16m 56s"}
|
| 70 |
-
{"loss": 0.5064438, "token_acc": 0.86674917, "grad_norm": 0.11894882, "learning_rate": 8.09e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007516, "epoch": 0.27394329, "global_step/max_steps": "64/233", "percentage": "27.47%", "elapsed_time": "2h 21m 28s", "remaining_time": "6h 13m 35s"}
|
| 71 |
-
{"loss": 0.49739748, "token_acc": 0.84879067, "grad_norm": 0.26122409, "learning_rate": 8.04e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007601, "epoch": 0.27822365, "global_step/max_steps": "65/233", "percentage": "27.90%", "elapsed_time": "2h 22m 5s", "remaining_time": "6h 7m 14s"}
|
| 72 |
-
{"loss": 0.54737341, "token_acc": 0.82416686, "grad_norm": 0.10891951, "learning_rate": 7.99e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007629, "epoch": 0.28250401, "global_step/max_steps": "66/233", "percentage": "28.33%", "elapsed_time": "2h 23m 44s", "remaining_time": "6h 3m 43s"}
|
| 73 |
-
{"loss": 0.56025583, "token_acc": 0.790977, "grad_norm": 0.1199242, "learning_rate": 7.94e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007675, "epoch": 0.28678438, "global_step/max_steps": "67/233", "percentage": "28.76%", "elapsed_time": "2h 25m 4s", "remaining_time": "5h 59m 25s"}
|
| 74 |
-
{"loss": 0.51193327, "token_acc": 0.84465331, "grad_norm": 0.1607635, "learning_rate": 7.89e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007679, "epoch": 0.29106474, "global_step/max_steps": "68/233", "percentage": "29.18%", "elapsed_time": "2h 27m 9s", "remaining_time": "5h 57m 5s"}
|
| 75 |
-
{"loss": 0.52289128, "token_acc": 0.83648469, "grad_norm": 0.12519571, "learning_rate": 7.85e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007698, "epoch": 0.2953451, "global_step/max_steps": "69/233", "percentage": "29.61%", "elapsed_time": "2h 28m 57s", "remaining_time": "5h 54m 2s"}
|
| 76 |
-
{"loss": 0.50253069, "token_acc": 0.83147491, "grad_norm": 0.10590418, "learning_rate": 7.8e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007715, "epoch": 0.29962547, "global_step/max_steps": "70/233", "percentage": "30.04%", "elapsed_time": "2h 30m 47s", "remaining_time": "5h 51m 7s"}
|
| 77 |
-
{"eval_loss": 0.52436066, "eval_token_acc": 0.82876859, "eval_runtime": 233.6281, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.29962547, "global_step/max_steps": "70/233", "percentage": "30.04%", "elapsed_time": "2h 34m 40s", "remaining_time": "6h 0m 11s"}
|
| 78 |
-
{"loss": 0.48546743, "token_acc": 0.83361594, "grad_norm": 0.10634065, "learning_rate": 7.75e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007503, "epoch": 0.30390583, "global_step/max_steps": "71/233", "percentage": "30.47%", "elapsed_time": "2h 37m 16s", "remaining_time": "5h 58m 51s"}
|
| 79 |
-
{"loss": 0.52685374, "token_acc": 0.82080856, "grad_norm": 0.12289236, "learning_rate": 7.7e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007523, "epoch": 0.3081862, "global_step/max_steps": "72/233", "percentage": "30.90%", "elapsed_time": "2h 39m 5s", "remaining_time": "5h 55m 44s"}
|
| 80 |
-
{"loss": 0.50480282, "token_acc": 0.83330189, "grad_norm": 0.1234926, "learning_rate": 7.66e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007574, "epoch": 0.31246656, "global_step/max_steps": "73/233", "percentage": "31.33%", "elapsed_time": "2h 40m 12s", "remaining_time": "5h 51m 7s"}
|
| 81 |
-
{"loss": 0.52709985, "token_acc": 0.83077331, "grad_norm": 0.20998599, "learning_rate": 7.61e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00758, "epoch": 0.31674692, "global_step/max_steps": "74/233", "percentage": "31.76%", "elapsed_time": "2h 42m 16s", "remaining_time": "5h 48m 39s"}
|
| 82 |
-
{"loss": 0.51562619, "token_acc": 0.85752608, "grad_norm": 0.11690234, "learning_rate": 7.56e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007622, "epoch": 0.32102729, "global_step/max_steps": "75/233", "percentage": "32.19%", "elapsed_time": "2h 43m 34s", "remaining_time": "5h 44m 34s"}
|
| 83 |
-
{"loss": 0.53282851, "token_acc": 0.83144705, "grad_norm": 0.1492236, "learning_rate": 7.51e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007626, "epoch": 0.32530765, "global_step/max_steps": "76/233", "percentage": "32.62%", "elapsed_time": "2h 45m 39s", "remaining_time": "5h 42m 13s"}
|
| 84 |
-
{"loss": 0.53681839, "token_acc": 0.8160215, "grad_norm": 0.12850326, "learning_rate": 7.46e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007615, "epoch": 0.32958801, "global_step/max_steps": "77/233", "percentage": "33.05%", "elapsed_time": "2h 48m 5s", "remaining_time": "5h 40m 32s"}
|
| 85 |
-
{"loss": 0.53259099, "token_acc": 0.81180987, "grad_norm": 0.12752953, "learning_rate": 7.42e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007602, "epoch": 0.33386838, "global_step/max_steps": "78/233", "percentage": "33.48%", "elapsed_time": "2h 50m 34s", "remaining_time": "5h 38m 58s"}
|
| 86 |
-
{"loss": 0.53562546, "token_acc": 0.82184946, "grad_norm": 0.15158969, "learning_rate": 7.37e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007612, "epoch": 0.33814874, "global_step/max_steps": "79/233", "percentage": "33.91%", "elapsed_time": "2h 52m 32s", "remaining_time": "5h 36m 21s"}
|
| 87 |
-
{"loss": 0.51222068, "token_acc": 0.84699546, "grad_norm": 0.1310516, "learning_rate": 7.32e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007659, "epoch": 0.34242911, "global_step/max_steps": "80/233", "percentage": "34.33%", "elapsed_time": "2h 53m 39s", "remaining_time": "5h 32m 7s"}
|
| 88 |
-
{"eval_loss": 0.51669514, "eval_token_acc": 0.83052194, "eval_runtime": 234.1299, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.248, "epoch": 0.34242911, "global_step/max_steps": "80/233", "percentage": "34.33%", "elapsed_time": "2h 57m 33s", "remaining_time": "5h 39m 35s"}
|
| 89 |
-
{"loss": 0.53691947, "token_acc": 0.83763546, "grad_norm": 0.11690947, "learning_rate": 7.27e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007443, "epoch": 0.34670947, "global_step/max_steps": "81/233", "percentage": "34.76%", "elapsed_time": "3h 0m 57s", "remaining_time": "5h 39m 34s"}
|
| 90 |
-
{"loss": 0.50225633, "token_acc": 0.85966191, "grad_norm": 0.11442987, "learning_rate": 7.22e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007462, "epoch": 0.35098983, "global_step/max_steps": "82/233", "percentage": "35.19%", "elapsed_time": "3h 2m 42s", "remaining_time": "5h 36m 27s"}
|
| 91 |
-
{"loss": 0.4828037, "token_acc": 0.82932011, "grad_norm": 0.13160026, "learning_rate": 7.18e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007516, "epoch": 0.3552702, "global_step/max_steps": "83/233", "percentage": "35.62%", "elapsed_time": "3h 3m 37s", "remaining_time": "5h 31m 51s"}
|
| 92 |
-
{"loss": 0.51871783, "token_acc": 0.84281236, "grad_norm": 0.1129145, "learning_rate": 7.13e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007533, "epoch": 0.35955056, "global_step/max_steps": "84/233", "percentage": "36.05%", "elapsed_time": "3h 5m 25s", "remaining_time": "5h 28m 55s"}
|
| 93 |
-
{"loss": 0.51214552, "token_acc": 0.82529789, "grad_norm": 0.1373262, "learning_rate": 7.08e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007551, "epoch": 0.36383093, "global_step/max_steps": "85/233", "percentage": "36.48%", "elapsed_time": "3h 7m 11s", "remaining_time": "5h 25m 55s"}
|
| 94 |
-
{"loss": 0.49339923, "token_acc": 0.82989313, "grad_norm": 0.11337092, "learning_rate": 7.03e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007587, "epoch": 0.36811129, "global_step/max_steps": "86/233", "percentage": "36.91%", "elapsed_time": "3h 8m 29s", "remaining_time": "5h 22m 11s"}
|
| 95 |
-
{"loss": 0.48843583, "token_acc": 0.84886903, "grad_norm": 0.15754931, "learning_rate": 6.99e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007588, "epoch": 0.37239165, "global_step/max_steps": "87/233", "percentage": "37.34%", "elapsed_time": "3h 10m 39s", "remaining_time": "5h 19m 56s"}
|
| 96 |
-
{"loss": 0.53001589, "token_acc": 0.83002833, "grad_norm": 0.15598585, "learning_rate": 6.94e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007635, "epoch": 0.37667202, "global_step/max_steps": "88/233", "percentage": "37.77%", "elapsed_time": "3h 11m 39s", "remaining_time": "5h 15m 48s"}
|
| 97 |
-
{"loss": 0.50424647, "token_acc": 0.83222175, "grad_norm": 0.12133142, "learning_rate": 6.89e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007628, "epoch": 0.38095238, "global_step/max_steps": "89/233", "percentage": "38.20%", "elapsed_time": "3h 14m 2s", "remaining_time": "5h 13m 57s"}
|
| 98 |
-
{"loss": 0.49213964, "token_acc": 0.82991934, "grad_norm": 0.10147729, "learning_rate": 6.84e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007664, "epoch": 0.38523274, "global_step/max_steps": "90/233", "percentage": "38.63%", "elapsed_time": "3h 15m 16s", "remaining_time": "5h 10m 16s"}
|
| 99 |
-
{"eval_loss": 0.51017594, "eval_token_acc": 0.83137481, "eval_runtime": 234.0416, "eval_samples_per_second": 1.974, "eval_steps_per_second": 0.248, "epoch": 0.38523274, "global_step/max_steps": "90/233", "percentage": "38.63%", "elapsed_time": "3h 19m 10s", "remaining_time": "5h 16m 28s"}
|
| 100 |
-
{"loss": 0.49563336, "token_acc": 0.83621232, "grad_norm": 0.11565997, "learning_rate": 6.79e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007508, "epoch": 0.38951311, "global_step/max_steps": "91/233", "percentage": "39.06%", "elapsed_time": "3h 21m 34s", "remaining_time": "5h 14m 33s"}
|
| 101 |
-
{"loss": 0.52433681, "token_acc": 0.82337625, "grad_norm": 0.12434755, "learning_rate": 6.75e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007493, "epoch": 0.39379347, "global_step/max_steps": "92/233", "percentage": "39.48%", "elapsed_time": "3h 24m 12s", "remaining_time": "5h 12m 57s"}
|
| 102 |
-
{"loss": 0.5221467, "token_acc": 0.8442623, "grad_norm": 0.11333634, "learning_rate": 6.7e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007489, "epoch": 0.39807384, "global_step/max_steps": "93/233", "percentage": "39.91%", "elapsed_time": "3h 26m 32s", "remaining_time": "5h 10m 55s"}
|
| 103 |
-
{"loss": 0.50553328, "token_acc": 0.8583166, "grad_norm": 0.13680269, "learning_rate": 6.65e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00748, "epoch": 0.4023542, "global_step/max_steps": "94/233", "percentage": "40.34%", "elapsed_time": "3h 29m 0s", "remaining_time": "5h 9m 3s"}
|
| 104 |
-
{"loss": 0.49891001, "token_acc": 0.855139, "grad_norm": 0.08764607, "learning_rate": 6.6e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007474, "epoch": 0.40663456, "global_step/max_steps": "95/233", "percentage": "40.77%", "elapsed_time": "3h 31m 24s", "remaining_time": "5h 7m 6s"}
|
| 105 |
-
{"loss": 0.51290613, "token_acc": 0.84636644, "grad_norm": 0.12013482, "learning_rate": 6.56e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00751, "epoch": 0.41091493, "global_step/max_steps": "96/233", "percentage": "41.20%", "elapsed_time": "3h 32m 36s", "remaining_time": "5h 3m 24s"}
|
| 106 |
-
{"loss": 0.49172401, "token_acc": 0.8548566, "grad_norm": 0.13774136, "learning_rate": 6.51e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007528, "epoch": 0.41519529, "global_step/max_steps": "97/233", "percentage": "41.63%", "elapsed_time": "3h 34m 18s", "remaining_time": "5h 0m 28s"}
|
| 107 |
-
{"loss": 0.55053842, "token_acc": 0.8340143, "grad_norm": 0.13488252, "learning_rate": 6.46e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007554, "epoch": 0.41947566, "global_step/max_steps": "98/233", "percentage": "42.06%", "elapsed_time": "3h 35m 47s", "remaining_time": "4h 57m 15s"}
|
| 108 |
-
{"loss": 0.5442782, "token_acc": 0.84363808, "grad_norm": 0.10564359, "learning_rate": 6.41e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007526, "epoch": 0.42375602, "global_step/max_steps": "99/233", "percentage": "42.49%", "elapsed_time": "3h 38m 47s", "remaining_time": "4h 56m 8s"}
|
| 109 |
-
{"loss": 0.51258826, "token_acc": 0.83983287, "grad_norm": 0.17585698, "learning_rate": 6.36e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007543, "epoch": 0.42803638, "global_step/max_steps": "100/233", "percentage": "42.92%", "elapsed_time": "3h 40m 31s", "remaining_time": "4h 53m 18s"}
|
| 110 |
-
{"eval_loss": 0.50452524, "eval_token_acc": 0.83303703, "eval_runtime": 234.2336, "eval_samples_per_second": 1.972, "eval_steps_per_second": 0.248, "epoch": 0.42803638, "global_step/max_steps": "100/233", "percentage": "42.92%", "elapsed_time": "3h 44m 26s", "remaining_time": "4h 58m 30s"}
|
| 111 |
-
{"loss": 0.50650519, "token_acc": 0.83758287, "grad_norm": 0.12161291, "learning_rate": 6.32e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007409, "epoch": 0.43231675, "global_step/max_steps": "101/233", "percentage": "43.35%", "elapsed_time": "3h 46m 45s", "remaining_time": "4h 56m 21s"}
|
| 112 |
-
{"loss": 0.46387315, "token_acc": 0.84282257, "grad_norm": 0.17145611, "learning_rate": 6.27e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007454, "epoch": 0.43659711, "global_step/max_steps": "102/233", "percentage": "43.78%", "elapsed_time": "3h 47m 37s", "remaining_time": "4h 52m 20s"}
|
| 113 |
-
{"loss": 0.51760161, "token_acc": 0.84220632, "grad_norm": 0.12423951, "learning_rate": 6.22e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007479, "epoch": 0.44087747, "global_step/max_steps": "103/233", "percentage": "44.21%", "elapsed_time": "3h 49m 5s", "remaining_time": "4h 49m 9s"}
|
| 114 |
-
{"loss": 0.48974538, "token_acc": 0.85497655, "grad_norm": 0.11668554, "learning_rate": 6.17e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007488, "epoch": 0.44515784, "global_step/max_steps": "104/233", "percentage": "44.64%", "elapsed_time": "3h 51m 2s", "remaining_time": "4h 46m 34s"}
|
| 115 |
-
{"loss": 0.46702838, "token_acc": 0.84152495, "grad_norm": 0.12153333, "learning_rate": 6.12e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00752, "epoch": 0.4494382, "global_step/max_steps": "105/233", "percentage": "45.06%", "elapsed_time": "3h 52m 17s", "remaining_time": "4h 43m 10s"}
|
| 116 |
-
{"loss": 0.47563401, "token_acc": 0.84894142, "grad_norm": 0.11127526, "learning_rate": 6.08e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007559, "epoch": 0.45371857, "global_step/max_steps": "106/233", "percentage": "45.49%", "elapsed_time": "3h 53m 17s", "remaining_time": "4h 39m 30s"}
|
| 117 |
-
{"loss": 0.56198275, "token_acc": 0.83849832, "grad_norm": 0.11438177, "learning_rate": 6.03e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.45799893, "global_step/max_steps": "107/233", "percentage": "45.92%", "elapsed_time": "3h 55m 7s", "remaining_time": "4h 36m 53s"}
|
| 118 |
-
{"loss": 0.53137517, "token_acc": 0.83432408, "grad_norm": 0.11901586, "learning_rate": 5.98e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.46227929, "global_step/max_steps": "108/233", "percentage": "46.35%", "elapsed_time": "3h 57m 20s", "remaining_time": "4h 34m 42s"}
|
| 119 |
-
{"loss": 0.49352676, "token_acc": 0.84623256, "grad_norm": 0.11566687, "learning_rate": 5.93e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007602, "epoch": 0.46655966, "global_step/max_steps": "109/233", "percentage": "46.78%", "elapsed_time": "3h 58m 33s", "remaining_time": "4h 31m 22s"}
|
| 120 |
-
{"loss": 0.45819372, "token_acc": 0.84602131, "grad_norm": 0.1290195, "learning_rate": 5.89e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007643, "epoch": 0.47084002, "global_step/max_steps": "110/233", "percentage": "47.21%", "elapsed_time": "3h 59m 26s", "remaining_time": "4h 27m 44s"}
|
| 121 |
-
{"eval_loss": 0.49978775, "eval_token_acc": 0.83414018, "eval_runtime": 233.5668, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.47084002, "global_step/max_steps": "110/233", "percentage": "47.21%", "elapsed_time": "4h 3m 20s", "remaining_time": "4h 32m 5s"}
|
| 122 |
-
{"loss": 0.49648049, "token_acc": 0.83760248, "grad_norm": 0.12439388, "learning_rate": 5.84e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007504, "epoch": 0.47512039, "global_step/max_steps": "111/233", "percentage": "47.64%", "elapsed_time": "4h 6m 7s", "remaining_time": "4h 30m 30s"}
|
| 123 |
-
{"loss": 0.54757476, "token_acc": 0.84584637, "grad_norm": 0.11718772, "learning_rate": 5.79e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007514, "epoch": 0.47940075, "global_step/max_steps": "112/233", "percentage": "48.07%", "elapsed_time": "4h 8m 0s", "remaining_time": "4h 27m 56s"}
|
| 124 |
-
{"loss": 0.4847149, "token_acc": 0.84780194, "grad_norm": 0.11900615, "learning_rate": 5.74e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00753, "epoch": 0.48368111, "global_step/max_steps": "113/233", "percentage": "48.50%", "elapsed_time": "4h 9m 40s", "remaining_time": "4h 25m 8s"}
|
| 125 |
-
{"loss": 0.54250658, "token_acc": 0.8307074, "grad_norm": 0.13745169, "learning_rate": 5.69e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007533, "epoch": 0.48796148, "global_step/max_steps": "114/233", "percentage": "48.93%", "elapsed_time": "4h 11m 48s", "remaining_time": "4h 22m 50s"}
|
| 126 |
-
{"loss": 0.82845831, "token_acc": 0.83484427, "grad_norm": 0.12465348, "learning_rate": 5.65e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007558, "epoch": 0.49224184, "global_step/max_steps": "115/233", "percentage": "49.36%", "elapsed_time": "4h 13m 10s", "remaining_time": "4h 19m 46s"}
|
| 127 |
-
{"loss": 0.50414205, "token_acc": 0.855043, "grad_norm": 0.15168351, "learning_rate": 5.6e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00753, "epoch": 0.4965222, "global_step/max_steps": "116/233", "percentage": "49.79%", "elapsed_time": "4h 16m 18s", "remaining_time": "4h 18m 30s"}
|
| 128 |
-
{"loss": 0.47190434, "token_acc": 0.83568548, "grad_norm": 0.12488054, "learning_rate": 5.55e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007556, "epoch": 0.50080257, "global_step/max_steps": "117/233", "percentage": "50.21%", "elapsed_time": "4h 17m 37s", "remaining_time": "4h 15m 25s"}
|
| 129 |
-
{"loss": 0.52818644, "token_acc": 0.84974471, "grad_norm": 0.24668963, "learning_rate": 5.5e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00759, "epoch": 0.50508293, "global_step/max_steps": "118/233", "percentage": "50.64%", "elapsed_time": "4h 18m 39s", "remaining_time": "4h 12m 5s"}
|
| 130 |
-
{"loss": 0.48247573, "token_acc": 0.8508541, "grad_norm": 0.12629485, "learning_rate": 5.45e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007587, "epoch": 0.5093633, "global_step/max_steps": "119/233", "percentage": "51.07%", "elapsed_time": "4h 20m 59s", "remaining_time": "4h 10m 1s"}
|
| 131 |
-
{"loss": 0.50020522, "token_acc": 0.84359375, "grad_norm": 0.1088229, "learning_rate": 5.41e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007567, "epoch": 0.51364366, "global_step/max_steps": "120/233", "percentage": "51.50%", "elapsed_time": "4h 23m 52s", "remaining_time": "4h 8m 28s"}
|
| 132 |
-
{"eval_loss": 0.49559635, "eval_token_acc": 0.8350121, "eval_runtime": 234.1304, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.248, "epoch": 0.51364366, "global_step/max_steps": "120/233", "percentage": "51.50%", "elapsed_time": "4h 27m 46s", "remaining_time": "4h 12m 9s"}
|
| 133 |
-
{"loss": 0.46793276, "token_acc": 0.84055563, "grad_norm": 0.16117993, "learning_rate": 5.36e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007476, "epoch": 0.51792402, "global_step/max_steps": "121/233", "percentage": "51.93%", "elapsed_time": "4h 29m 19s", "remaining_time": "4h 9m 17s"}
|
| 134 |
-
{"loss": 0.49023747, "token_acc": 0.8382855, "grad_norm": 0.13395602, "learning_rate": 5.31e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007493, "epoch": 0.52220439, "global_step/max_steps": "122/233", "percentage": "52.36%", "elapsed_time": "4h 30m 55s", "remaining_time": "4h 6m 30s"}
|
| 135 |
-
{"loss": 0.47423121, "token_acc": 0.83460689, "grad_norm": 0.14448754, "learning_rate": 5.26e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007516, "epoch": 0.52648475, "global_step/max_steps": "123/233", "percentage": "52.79%", "elapsed_time": "4h 32m 18s", "remaining_time": "4h 3m 31s"}
|
| 136 |
-
{"loss": 0.48822367, "token_acc": 0.85513403, "grad_norm": 0.21798755, "learning_rate": 5.22e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007509, "epoch": 0.53076512, "global_step/max_steps": "124/233", "percentage": "53.22%", "elapsed_time": "4h 34m 46s", "remaining_time": "4h 1m 32s"}
|
| 137 |
-
{"loss": 0.49620324, "token_acc": 0.86353582, "grad_norm": 0.1151548, "learning_rate": 5.17e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00754, "epoch": 0.53504548, "global_step/max_steps": "125/233", "percentage": "53.65%", "elapsed_time": "4h 35m 52s", "remaining_time": "3h 58m 21s"}
|
| 138 |
-
{"loss": 0.5336532, "token_acc": 0.81213332, "grad_norm": 0.1721856, "learning_rate": 5.12e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007551, "epoch": 0.53932584, "global_step/max_steps": "126/233", "percentage": "54.08%", "elapsed_time": "4h 37m 41s", "remaining_time": "3h 55m 49s"}
|
| 139 |
-
{"loss": 0.49009401, "token_acc": 0.86326662, "grad_norm": 0.11931138, "learning_rate": 5.07e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.54360621, "global_step/max_steps": "127/233", "percentage": "54.51%", "elapsed_time": "4h 39m 11s", "remaining_time": "3h 53m 1s"}
|
| 140 |
-
{"loss": 0.52549112, "token_acc": 0.83741613, "grad_norm": 0.13655488, "learning_rate": 5.02e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007581, "epoch": 0.54788657, "global_step/max_steps": "128/233", "percentage": "54.94%", "elapsed_time": "4h 40m 58s", "remaining_time": "3h 50m 29s"}
|
| 141 |
-
{"loss": 0.5040428, "token_acc": 0.85339168, "grad_norm": 0.13167702, "learning_rate": 4.98e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007592, "epoch": 0.55216693, "global_step/max_steps": "129/233", "percentage": "55.36%", "elapsed_time": "4h 42m 45s", "remaining_time": "3h 47m 57s"}
|
| 142 |
-
{"loss": 0.4640165, "token_acc": 0.84589084, "grad_norm": 0.13566221, "learning_rate": 4.93e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007614, "epoch": 0.5564473, "global_step/max_steps": "130/233", "percentage": "55.79%", "elapsed_time": "4h 44m 7s", "remaining_time": "3h 45m 6s"}
|
| 143 |
-
{"eval_loss": 0.49200445, "eval_token_acc": 0.83594251, "eval_runtime": 234.1648, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.248, "epoch": 0.5564473, "global_step/max_steps": "130/233", "percentage": "55.79%", "elapsed_time": "4h 48m 1s", "remaining_time": "3h 48m 12s"}
|
| 144 |
-
{"loss": 0.48091298, "token_acc": 0.83779916, "grad_norm": 0.12161104, "learning_rate": 4.88e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007505, "epoch": 0.56072766, "global_step/max_steps": "131/233", "percentage": "56.22%", "elapsed_time": "4h 50m 28s", "remaining_time": "3h 46m 9s"}
|
| 145 |
-
{"loss": 0.49842244, "token_acc": 0.81520841, "grad_norm": 0.12285878, "learning_rate": 4.83e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007538, "epoch": 0.56500803, "global_step/max_steps": "132/233", "percentage": "56.65%", "elapsed_time": "4h 51m 25s", "remaining_time": "3h 42m 59s"}
|
| 146 |
-
{"loss": 0.48106787, "token_acc": 0.80721846, "grad_norm": 0.13136606, "learning_rate": 4.78e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007557, "epoch": 0.56928839, "global_step/max_steps": "133/233", "percentage": "57.08%", "elapsed_time": "4h 52m 53s", "remaining_time": "3h 40m 13s"}
|
| 147 |
-
{"loss": 0.48102638, "token_acc": 0.85745879, "grad_norm": 0.13375333, "learning_rate": 4.74e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007588, "epoch": 0.57356875, "global_step/max_steps": "134/233", "percentage": "57.51%", "elapsed_time": "4h 53m 53s", "remaining_time": "3h 37m 7s"}
|
| 148 |
-
{"loss": 0.49373215, "token_acc": 0.84182651, "grad_norm": 0.11789739, "learning_rate": 4.69e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007588, "epoch": 0.57784912, "global_step/max_steps": "135/233", "percentage": "57.94%", "elapsed_time": "4h 56m 6s", "remaining_time": "3h 34m 57s"}
|
| 149 |
-
{"loss": 0.50271904, "token_acc": 0.86526884, "grad_norm": 0.11597518, "learning_rate": 4.64e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007594, "epoch": 0.58212948, "global_step/max_steps": "136/233", "percentage": "58.37%", "elapsed_time": "4h 58m 1s", "remaining_time": "3h 32m 33s"}
|
| 150 |
-
{"loss": 0.48411375, "token_acc": 0.82616099, "grad_norm": 0.14201498, "learning_rate": 4.59e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007623, "epoch": 0.58640984, "global_step/max_steps": "137/233", "percentage": "58.80%", "elapsed_time": "4h 59m 7s", "remaining_time": "3h 29m 36s"}
|
| 151 |
-
{"loss": 0.47382951, "token_acc": 0.81373225, "grad_norm": 0.11298946, "learning_rate": 4.55e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007638, "epoch": 0.59069021, "global_step/max_steps": "138/233", "percentage": "59.23%", "elapsed_time": "5h 0m 41s", "remaining_time": "3h 26m 59s"}
|
| 152 |
-
{"loss": 0.52312583, "token_acc": 0.85347195, "grad_norm": 0.10149517, "learning_rate": 4.5e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007625, "epoch": 0.59497057, "global_step/max_steps": "139/233", "percentage": "59.66%", "elapsed_time": "5h 3m 22s", "remaining_time": "3h 25m 9s"}
|
| 153 |
-
{"loss": 0.49635562, "token_acc": 0.84268554, "grad_norm": 0.28712398, "learning_rate": 4.45e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007616, "epoch": 0.59925094, "global_step/max_steps": "140/233", "percentage": "60.09%", "elapsed_time": "5h 5m 57s", "remaining_time": "3h 23m 14s"}
|
| 154 |
-
{"eval_loss": 0.4882482, "eval_token_acc": 0.83673689, "eval_runtime": 234.0956, "eval_samples_per_second": 1.974, "eval_steps_per_second": 0.248, "epoch": 0.59925094, "global_step/max_steps": "140/233", "percentage": "60.09%", "elapsed_time": "5h 9m 51s", "remaining_time": "3h 25m 49s"}
|
| 155 |
-
{"loss": 0.47461641, "token_acc": 0.84012604, "grad_norm": 0.14535584, "learning_rate": 4.4e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007522, "epoch": 0.6035313, "global_step/max_steps": "141/233", "percentage": "60.52%", "elapsed_time": "5h 11m 57s", "remaining_time": "3h 23m 33s"}
|
| 156 |
-
{"loss": 0.5000332, "token_acc": 0.83833177, "grad_norm": 0.11992717, "learning_rate": 4.35e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007548, "epoch": 0.60781166, "global_step/max_steps": "142/233", "percentage": "60.94%", "elapsed_time": "5h 13m 5s", "remaining_time": "3h 20m 38s"}
|
| 157 |
-
{"loss": 0.48448735, "token_acc": 0.84470637, "grad_norm": 0.16265908, "learning_rate": 4.31e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007574, "epoch": 0.61209203, "global_step/max_steps": "143/233", "percentage": "61.37%", "elapsed_time": "5h 14m 14s", "remaining_time": "3h 17m 46s"}
|
| 158 |
-
{"loss": 0.46533874, "token_acc": 0.85549313, "grad_norm": 0.12145889, "learning_rate": 4.26e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00755, "epoch": 0.61637239, "global_step/max_steps": "144/233", "percentage": "61.80%", "elapsed_time": "5h 17m 28s", "remaining_time": "3h 16m 12s"}
|
| 159 |
-
{"loss": 0.44867301, "token_acc": 0.86317424, "grad_norm": 0.11760305, "learning_rate": 4.21e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007559, "epoch": 0.62065276, "global_step/max_steps": "145/233", "percentage": "62.23%", "elapsed_time": "5h 19m 16s", "remaining_time": "3h 13m 46s"}
|
| 160 |
-
{"loss": 0.49094412, "token_acc": 0.83705991, "grad_norm": 0.10963392, "learning_rate": 4.16e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007559, "epoch": 0.62493312, "global_step/max_steps": "146/233", "percentage": "62.66%", "elapsed_time": "5h 21m 29s", "remaining_time": "3h 11m 34s"}
|
| 161 |
-
{"loss": 0.48288625, "token_acc": 0.84562759, "grad_norm": 0.11926857, "learning_rate": 4.11e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.62921348, "global_step/max_steps": "147/233", "percentage": "63.09%", "elapsed_time": "5h 23m 13s", "remaining_time": "3h 9m 6s"}
|
| 162 |
-
{"loss": 0.46473897, "token_acc": 0.86574336, "grad_norm": 0.12197684, "learning_rate": 4.07e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007572, "epoch": 0.63349385, "global_step/max_steps": "148/233", "percentage": "63.52%", "elapsed_time": "5h 25m 20s", "remaining_time": "3h 6m 50s"}
|
| 163 |
-
{"loss": 0.46687201, "token_acc": 0.86090455, "grad_norm": 0.12635399, "learning_rate": 4.02e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007595, "epoch": 0.63777421, "global_step/max_steps": "149/233", "percentage": "63.95%", "elapsed_time": "5h 26m 31s", "remaining_time": "3h 4m 5s"}
|
| 164 |
-
{"loss": 0.50646043, "token_acc": 0.83715979, "grad_norm": 0.11814403, "learning_rate": 3.97e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007587, "epoch": 0.64205457, "global_step/max_steps": "150/233", "percentage": "64.38%", "elapsed_time": "5h 29m 5s", "remaining_time": "3h 2m 5s"}
|
| 165 |
-
{"eval_loss": 0.4841184, "eval_token_acc": 0.83763193, "eval_runtime": 233.6185, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.64205457, "global_step/max_steps": "150/233", "percentage": "64.38%", "elapsed_time": "5h 32m 59s", "remaining_time": "3h 4m 15s"}
|
| 166 |
-
{"loss": 0.48874947, "token_acc": 0.8424199, "grad_norm": 0.12018572, "learning_rate": 3.92e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007498, "epoch": 0.64633494, "global_step/max_steps": "151/233", "percentage": "64.81%", "elapsed_time": "5h 35m 12s", "remaining_time": "3h 2m 1s"}
|
| 167 |
-
{"loss": 0.48023725, "token_acc": 0.84619697, "grad_norm": 0.1180267, "learning_rate": 3.88e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007511, "epoch": 0.6506153, "global_step/max_steps": "152/233", "percentage": "65.24%", "elapsed_time": "5h 36m 51s", "remaining_time": "2h 59m 30s"}
|
| 168 |
-
{"loss": 0.48215163, "token_acc": 0.81318267, "grad_norm": 0.12059806, "learning_rate": 3.83e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007523, "epoch": 0.65489567, "global_step/max_steps": "153/233", "percentage": "65.67%", "elapsed_time": "5h 38m 32s", "remaining_time": "2h 57m 0s"}
|
| 169 |
-
{"loss": 0.47923255, "token_acc": 0.83857755, "grad_norm": 0.12951089, "learning_rate": 3.78e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007541, "epoch": 0.65917603, "global_step/max_steps": "154/233", "percentage": "66.09%", "elapsed_time": "5h 39m 56s", "remaining_time": "2h 54m 22s"}
|
|
|
|
| 1 |
+
{"loss": 0.58325195, "grad_norm": 6.15641366, "learning_rate": 3e-07, "memory(GiB)": 68.65, "train_speed(iter/s)": 0.012459, "rewards/chosen": -0.02832031, "rewards/rejected": -0.04541016, "rewards/accuracies": 1.0, "rewards/margins": 0.01708984, "logps/rejected": -0.45507812, "logps/chosen": -0.28320312, "logits/rejected": 0.05541992, "logits/chosen": 0.484375, "nll_loss": 0.28125, "log_odds_ratio": -0.4453125, "log_odds_chosen": 0.5703125, "epoch": 0.00303951, "global_step/max_steps": "1/329", "percentage": "0.30%", "elapsed_time": "1m 13s", "remaining_time": "6h 41m 36s"}
|
| 2 |
+
{"loss": 0.57861328, "grad_norm": 4.96429781, "learning_rate": 6.1e-07, "memory(GiB)": 91.09, "train_speed(iter/s)": 0.015066, "rewards/chosen": -0.0480957, "rewards/rejected": -0.04980469, "rewards/accuracies": 1.0, "rewards/margins": 0.00170898, "logps/rejected": -0.49804688, "logps/chosen": -0.48046875, "logits/rejected": 0.734375, "logits/chosen": 0.59765625, "nll_loss": 0.48242188, "log_odds_ratio": -0.67578125, "log_odds_chosen": 0.03710938, "epoch": 0.00607903, "global_step/max_steps": "2/329", "percentage": "0.61%", "elapsed_time": "2m 5s", "remaining_time": "5h 43m 12s"}
|
| 3 |
+
{"loss": 0.58496094, "grad_norm": 4.47705827, "learning_rate": 9.1e-07, "memory(GiB)": 91.09, "train_speed(iter/s)": 0.015521, "rewards/chosen": -0.0300293, "rewards/rejected": -0.02453613, "rewards/accuracies": 0.0, "rewards/margins": -0.00549316, "logps/rejected": -0.24511719, "logps/chosen": -0.30078125, "logits/rejected": 0.38867188, "logits/chosen": 0.26367188, "nll_loss": 0.29882812, "log_odds_ratio": -0.80859375, "log_odds_chosen": -0.21972656, "epoch": 0.00911854, "global_step/max_steps": "3/329", "percentage": "0.91%", "elapsed_time": "3m 6s", "remaining_time": "5h 37m 45s"}
|
| 4 |
+
{"loss": 0.55322266, "grad_norm": 4.56437749, "learning_rate": 1.21e-06, "memory(GiB)": 93.05, "train_speed(iter/s)": 0.017586, "rewards/chosen": -0.04589844, "rewards/rejected": -0.04174805, "rewards/accuracies": 0.0, "rewards/margins": -0.00415039, "logps/rejected": -0.41796875, "logps/chosen": -0.45898438, "logits/rejected": 0.3046875, "logits/chosen": 0.37695312, "nll_loss": 0.45898438, "log_odds_ratio": -0.75390625, "log_odds_chosen": -0.11914062, "epoch": 0.01215805, "global_step/max_steps": "4/329", "percentage": "1.22%", "elapsed_time": "3m 40s", "remaining_time": "4h 58m 48s"}
|
| 5 |
+
{"loss": 0.57531738, "grad_norm": 3.87794046, "learning_rate": 1.52e-06, "memory(GiB)": 100.25, "train_speed(iter/s)": 0.019266, "rewards/chosen": -0.0456543, "rewards/rejected": -0.046875, "rewards/accuracies": 1.0, "rewards/margins": 0.0012207, "logps/rejected": -0.46875, "logps/chosen": -0.45703125, "logits/rejected": 0.7578125, "logits/chosen": 0.67578125, "nll_loss": 0.45898438, "log_odds_ratio": -0.67578125, "log_odds_chosen": 0.03125, "epoch": 0.01519757, "global_step/max_steps": "5/329", "percentage": "1.52%", "elapsed_time": "4m 12s", "remaining_time": "4h 32m 56s"}
|
| 6 |
+
{"loss": 0.55859375, "grad_norm": 4.16021983, "learning_rate": 1.82e-06, "memory(GiB)": 100.25, "train_speed(iter/s)": 0.019891, "rewards/chosen": -0.04296875, "rewards/rejected": -0.0480957, "rewards/accuracies": 1.0, "rewards/margins": 0.00512695, "logps/rejected": -0.48046875, "logps/chosen": -0.4296875, "logits/rejected": 0.7734375, "logits/chosen": 0.77734375, "nll_loss": 0.4296875, "log_odds_ratio": -0.625, "log_odds_chosen": 0.14453125, "epoch": 0.01823708, "global_step/max_steps": "6/329", "percentage": "1.82%", "elapsed_time": "4m 54s", "remaining_time": "4h 24m 32s"}
|
| 7 |
+
{"loss": 0.64770508, "grad_norm": 5.19085531, "learning_rate": 2.12e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.019273, "rewards/chosen": -0.03979492, "rewards/rejected": -0.03662109, "rewards/accuracies": 0.0, "rewards/margins": -0.00317383, "logps/rejected": -0.36523438, "logps/chosen": -0.3984375, "logits/rejected": 0.54296875, "logits/chosen": 0.515625, "nll_loss": 0.3984375, "log_odds_ratio": -0.74609375, "log_odds_chosen": -0.10351562, "epoch": 0.0212766, "global_step/max_steps": "7/329", "percentage": "2.13%", "elapsed_time": "5m 56s", "remaining_time": "4h 33m 14s"}
|
| 8 |
+
{"loss": 0.48022461, "grad_norm": 8.73835517, "learning_rate": 2.42e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.018543, "rewards/chosen": -0.03088379, "rewards/rejected": -0.04248047, "rewards/accuracies": 1.0, "rewards/margins": 0.01159668, "logps/rejected": -0.42578125, "logps/chosen": -0.30859375, "logits/rejected": 0.78125, "logits/chosen": 0.55078125, "nll_loss": 0.30859375, "log_odds_ratio": -0.515625, "log_odds_chosen": 0.390625, "epoch": 0.02431611, "global_step/max_steps": "8/329", "percentage": "2.43%", "elapsed_time": "7m 4s", "remaining_time": "4h 43m 57s"}
|
| 9 |
+
{"loss": 0.54199219, "grad_norm": 3.21968845, "learning_rate": 2.73e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.018768, "rewards/chosen": -0.04907227, "rewards/rejected": -0.0480957, "rewards/accuracies": 0.0, "rewards/margins": -0.00097656, "logps/rejected": -0.48046875, "logps/chosen": -0.49023438, "logits/rejected": 0.86328125, "logits/chosen": 0.74609375, "nll_loss": 0.4921875, "log_odds_ratio": -0.703125, "log_odds_chosen": -0.02148438, "epoch": 0.02735562, "global_step/max_steps": "9/329", "percentage": "2.74%", "elapsed_time": "7m 52s", "remaining_time": "4h 40m 8s"}
|
| 10 |
+
{"loss": 0.5090332, "grad_norm": 4.62599348, "learning_rate": 3.03e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.018667, "rewards/chosen": -0.04589844, "rewards/rejected": -0.04492188, "rewards/accuracies": 0.0, "rewards/margins": -0.00097656, "logps/rejected": -0.44921875, "logps/chosen": -0.45898438, "logits/rejected": 0.94921875, "logits/chosen": 0.8828125, "nll_loss": 0.4609375, "log_odds_ratio": -0.703125, "log_odds_chosen": -0.02539062, "epoch": 0.03039514, "global_step/max_steps": "10/329", "percentage": "3.04%", "elapsed_time": "8m 48s", "remaining_time": "4h 41m 11s"}
|
| 11 |
+
{"eval_loss": 0.5304302, "eval_runtime": 141.9055, "eval_samples_per_second": 0.571, "eval_steps_per_second": 0.078, "eval_rewards/chosen": -0.05118075, "eval_rewards/rejected": -0.04210316, "eval_rewards/accuracies": 0.09090909, "eval_rewards/margins": -0.00907759, "eval_logps/rejected": -0.42116478, "eval_logps/chosen": -0.51136363, "eval_logits/rejected": 0.64182353, "eval_logits/chosen": 0.47944781, "eval_nll_loss": 0.51136363, "eval_log_odds_ratio": -0.80397725, "eval_log_odds_chosen": -0.19655539, "epoch": 0.03039514, "global_step/max_steps": "10/329", "percentage": "3.04%", "elapsed_time": "11m 10s", "remaining_time": "5h 56m 38s"}
|
| 12 |
+
{"loss": 0.57714844, "grad_norm": 3.36059428, "learning_rate": 3.33e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.014731, "rewards/chosen": -0.05029297, "rewards/rejected": -0.04345703, "rewards/accuracies": 0.0, "rewards/margins": -0.00683594, "logps/rejected": -0.43359375, "logps/chosen": -0.50390625, "logits/rejected": 0.62890625, "logits/chosen": 0.66796875, "nll_loss": 0.50390625, "log_odds_ratio": -0.79296875, "log_odds_chosen": -0.1875, "epoch": 0.03343465, "global_step/max_steps": "11/329", "percentage": "3.34%", "elapsed_time": "12m 19s", "remaining_time": "5h 56m 30s"}
|
| 13 |
+
{"loss": 0.60522461, "grad_norm": 3.99234345, "learning_rate": 3.64e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.014797, "rewards/chosen": -0.1015625, "rewards/rejected": -0.15039062, "rewards/accuracies": 1.0, "rewards/margins": 0.04882812, "logps/rejected": -1.5078125, "logps/chosen": -1.015625, "logits/rejected": 0.53125, "logits/chosen": 0.5234375, "nll_loss": 1.015625, "log_odds_ratio": -0.40429688, "log_odds_chosen": 0.69140625, "epoch": 0.03647416, "global_step/max_steps": "12/329", "percentage": "3.65%", "elapsed_time": "13m 24s", "remaining_time": "5h 54m 3s"}
|
| 14 |
+
{"loss": 0.46606445, "grad_norm": 3.84386517, "learning_rate": 3.94e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.014892, "rewards/chosen": -0.0546875, "rewards/rejected": -0.05029297, "rewards/accuracies": 0.0, "rewards/margins": -0.00439453, "logps/rejected": -0.50390625, "logps/chosen": -0.546875, "logits/rejected": 0.49609375, "logits/chosen": 0.49414062, "nll_loss": 0.55078125, "log_odds_ratio": -0.75, "log_odds_chosen": -0.109375, "epoch": 0.03951368, "global_step/max_steps": "13/329", "percentage": "3.95%", "elapsed_time": "14m 26s", "remaining_time": "5h 50m 54s"}
|
| 15 |
+
{"loss": 0.47485352, "grad_norm": 3.8112063, "learning_rate": 4.24e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.015552, "rewards/chosen": -0.04541016, "rewards/rejected": -0.04736328, "rewards/accuracies": 1.0, "rewards/margins": 0.00195312, "logps/rejected": -0.47460938, "logps/chosen": -0.45507812, "logits/rejected": 0.87890625, "logits/chosen": 0.625, "nll_loss": 0.45507812, "log_odds_ratio": -0.671875, "log_odds_chosen": 0.05078125, "epoch": 0.04255319, "global_step/max_steps": "14/329", "percentage": "4.26%", "elapsed_time": "14m 53s", "remaining_time": "5h 35m 1s"}
|
| 16 |
+
{"loss": 0.41088867, "grad_norm": 2.89629065, "learning_rate": 4.55e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.016012, "rewards/chosen": -0.05981445, "rewards/rejected": -0.06054688, "rewards/accuracies": 1.0, "rewards/margins": 0.00073242, "logps/rejected": -0.60546875, "logps/chosen": -0.59765625, "logits/rejected": 0.546875, "logits/chosen": 0.4609375, "nll_loss": 0.59765625, "log_odds_ratio": -0.68359375, "log_odds_chosen": 0.015625, "epoch": 0.04559271, "global_step/max_steps": "15/329", "percentage": "4.56%", "elapsed_time": "15m 29s", "remaining_time": "5h 24m 27s"}
|
| 17 |
+
{"loss": 0.54125977, "grad_norm": 6.06482857, "learning_rate": 4.85e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.016154, "rewards/chosen": -0.06640625, "rewards/rejected": -0.06835938, "rewards/accuracies": 1.0, "rewards/margins": 0.00195312, "logps/rejected": -0.68359375, "logps/chosen": -0.6640625, "logits/rejected": 0.82421875, "logits/chosen": 0.7109375, "nll_loss": 0.6640625, "log_odds_ratio": -0.671875, "log_odds_chosen": 0.046875, "epoch": 0.04863222, "global_step/max_steps": "16/329", "percentage": "4.86%", "elapsed_time": "16m 23s", "remaining_time": "5h 20m 43s"}
|
| 18 |
+
{"loss": 0.5378418, "grad_norm": 4.0430688, "learning_rate": 5.15e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.016479, "rewards/chosen": -0.03979492, "rewards/rejected": -0.03686523, "rewards/accuracies": 0.0, "rewards/margins": -0.00292969, "logps/rejected": -0.36914062, "logps/chosen": -0.3984375, "logits/rejected": 0.56640625, "logits/chosen": 0.6171875, "nll_loss": 0.3984375, "log_odds_ratio": -0.73828125, "log_odds_chosen": -0.08398438, "epoch": 0.05167173, "global_step/max_steps": "17/329", "percentage": "5.17%", "elapsed_time": "17m 4s", "remaining_time": "5h 13m 28s"}
|
| 19 |
+
{"loss": 0.61254883, "grad_norm": 4.21434647, "learning_rate": 5.45e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.01684, "rewards/chosen": -0.03637695, "rewards/rejected": -0.03564453, "rewards/accuracies": 0.0, "rewards/margins": -0.00073242, "logps/rejected": -0.35546875, "logps/chosen": -0.36328125, "logits/rejected": 0.66796875, "logits/chosen": 0.6796875, "nll_loss": 0.36523438, "log_odds_ratio": -0.703125, "log_odds_chosen": -0.0234375, "epoch": 0.05471125, "global_step/max_steps": "18/329", "percentage": "5.47%", "elapsed_time": "17m 42s", "remaining_time": "5h 5m 50s"}
|
| 20 |
+
{"loss": 0.53515625, "grad_norm": 4.19814501, "learning_rate": 5.76e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.017159, "rewards/chosen": -0.03955078, "rewards/rejected": -0.046875, "rewards/accuracies": 1.0, "rewards/margins": 0.00732422, "logps/rejected": -0.46875, "logps/chosen": -0.39453125, "logits/rejected": 0.9453125, "logits/chosen": 0.8515625, "nll_loss": 0.39453125, "log_odds_ratio": -0.58984375, "log_odds_chosen": 0.21875, "epoch": 0.05775076, "global_step/max_steps": "19/329", "percentage": "5.78%", "elapsed_time": "18m 20s", "remaining_time": "4h 59m 15s"}
|
| 21 |
+
{"loss": 0.54125977, "grad_norm": 3.79412169, "learning_rate": 6.06e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.017282, "rewards/chosen": -0.04223633, "rewards/rejected": -0.04785156, "rewards/accuracies": 1.0, "rewards/margins": 0.00561523, "logps/rejected": -0.47851562, "logps/chosen": -0.421875, "logits/rejected": 0.92578125, "logits/chosen": 0.83203125, "nll_loss": 0.41992188, "log_odds_ratio": -0.6171875, "log_odds_chosen": 0.15820312, "epoch": 0.06079027, "global_step/max_steps": "20/329", "percentage": "6.08%", "elapsed_time": "19m 10s", "remaining_time": "4h 56m 14s"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
special_tokens_map.json
CHANGED
|
@@ -13,5 +13,11 @@
|
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
| 16 |
-
"pad_token":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
}
|
|
|
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "<|eot_id|>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
}
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb304458da934c460167c713e0581151a0ab2d5cb43649730f985f7d4a9ef096
|
| 3 |
+
size 8248
|