tejeshbhalla committed on
Commit
1bcdbf7
·
verified ·
1 Parent(s): c9abc2f

Training in progress, step 10

Browse files
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "/root/highspeedstorage/ft-volume/nemotron",
5
  "bias": "none",
6
  "eva_config": null,
7
  "exclude_modules": null,
@@ -14,7 +14,7 @@
14
  "loftq_config": {},
15
  "lora_alpha": 256,
16
  "lora_bias": false,
17
- "lora_dropout": 0.01,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": [],
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "q_proj",
28
  "k_proj",
29
- "gate_proj",
30
- "up_proj",
31
  "o_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
5
  "bias": "none",
6
  "eva_config": null,
7
  "exclude_modules": null,
 
14
  "loftq_config": {},
15
  "lora_alpha": 256,
16
  "lora_bias": false,
17
+ "lora_dropout": 0.1,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": [],
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "k_proj",
27
+ "down_proj",
 
28
  "o_proj",
29
+ "v_proj",
30
+ "gate_proj",
31
+ "q_proj",
32
+ "up_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69cf3891e4cfe1cb2064f250c1e0c06cc09441a469282a54c312a95b9b55a8dc
3
  size 13254157312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6584293598035a8eddf8812fb57e22a49169777cd99a0c1549462db262f1b8eb
3
  size 13254157312
args.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "model": "/root/highspeedstorage/ft-volume/nemotron",
3
  "model_type": "llama3_2",
4
  "model_revision": null,
5
  "task_type": "causal_lm",
@@ -11,29 +11,25 @@
11
  "local_repo_path": null,
12
  "template": "llama3_2",
13
  "system": "",
14
- "max_length": 32000,
15
  "truncation_strategy": "left",
16
  "max_pixels": null,
17
  "tools_prompt": "react_en",
18
- "norm_bbox": null,
19
- "padding_side": "right",
20
  "loss_scale": "default",
21
  "sequence_parallel_size": 1,
22
  "use_chat_template": true,
23
  "template_backend": "swift",
24
  "dataset": [
25
- "new_data_new.jsonl"
26
  ],
27
  "val_dataset": [],
28
  "split_dataset_ratio": 0.03,
29
  "data_seed": 42,
30
  "dataset_num_proc": 1,
31
  "streaming": false,
32
- "enable_cache": false,
33
  "download_mode": "reuse_dataset_if_exists",
34
- "columns": {},
35
  "strict": false,
36
- "remove_unused_columns": true,
37
  "model_name": [
38
  null,
39
  null
@@ -59,8 +55,7 @@
59
  "stream": false,
60
  "stop_words": [],
61
  "logprobs": false,
62
- "top_logprobs": null,
63
- "ckpt_dir": null,
64
  "load_dataset_config": null,
65
  "lora_modules": [],
66
  "tuner_backend": "peft",
@@ -68,14 +63,14 @@
68
  "adapters": [],
69
  "seed": 42,
70
  "model_kwargs": {},
71
- "load_args": false,
72
  "load_data_args": false,
73
  "use_hf": true,
74
  "hub_token": null,
75
  "custom_register_path": [],
76
  "ignore_args_error": false,
77
  "use_swift_lora": false,
78
- "output_dir": "/root/dataDisk/output/v63-20250312-123826",
79
  "overwrite_output_dir": false,
80
  "do_train": false,
81
  "do_eval": false,
@@ -86,7 +81,7 @@
86
  "per_device_eval_batch_size": 1,
87
  "per_gpu_train_batch_size": null,
88
  "per_gpu_eval_batch_size": null,
89
- "gradient_accumulation_steps": 8,
90
  "eval_accumulation_steps": null,
91
  "eval_delay": 0,
92
  "torch_empty_cache_steps": null,
@@ -105,7 +100,7 @@
105
  "log_level": "passive",
106
  "log_level_replica": "warning",
107
  "log_on_each_node": true,
108
- "logging_dir": "/root/dataDisk/output/v63-20250312-123826/runs",
109
  "logging_strategy": "steps",
110
  "logging_first_step": true,
111
  "logging_steps": 1,
@@ -141,6 +136,7 @@
141
  "past_index": -1,
142
  "run_name": null,
143
  "disable_tqdm": null,
 
144
  "label_names": null,
145
  "load_best_model_at_end": false,
146
  "metric_for_best_model": "loss",
@@ -168,7 +164,7 @@
168
  "zero_optimization": {
169
  "stage": 3,
170
  "offload_optimizer": {
171
- "device": "cpu",
172
  "pin_memory": true
173
  },
174
  "offload_param": {
@@ -264,7 +260,7 @@
264
  "modules_to_save": [],
265
  "lora_rank": 512,
266
  "lora_alpha": 256,
267
- "lora_dropout": 0.01,
268
  "lora_bias": "none",
269
  "lora_dtype": null,
270
  "lorap_lr_ratio": null,
@@ -326,29 +322,32 @@
326
  "metric_warmup_step": 0,
327
  "fsdp_num": 1,
328
  "acc_steps": 1,
329
- "swanlab_token": null,
330
- "swanlab_project": null,
331
- "swanlab_workspace": null,
332
- "swanlab_exp_name": null,
333
- "swanlab_mode": "cloud",
334
  "add_version": true,
335
  "resume_only_model": false,
336
  "check_model": true,
337
- "create_checkpoint_symlink": false,
338
  "packing": false,
339
  "lazy_tokenize": false,
340
- "external_plugins": [],
341
- "loss_type": null,
342
  "optimizer": null,
343
  "metric": null,
344
- "acc_strategy": "token",
 
 
 
 
 
 
 
 
 
 
345
  "rank": 0,
346
- "global_world_size": 8,
347
  "local_world_size": 8,
348
- "model_suffix": "nemotron",
349
- "model_info": "ModelInfo(model_type='llama3_2', model_dir='/root/highspeedstorage/ft-volume/nemotron', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, config=None, task_type='causal_lm', num_labels=None)",
350
- "model_meta": "ModelMeta(model_type='llama3_2', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.2-1B', hf_model_id='meta-llama/Llama-3.2-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B', hf_model_id='meta-llama/Llama-3.2-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-1B-Instruct', hf_model_id='meta-llama/Llama-3.2-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B-Instruct', hf_model_id='meta-llama/Llama-3.2-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.3-70B-Instruct', hf_model_id='meta-llama/Llama-3.3-70B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', hf_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='llama3_2', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f3e89dab760>, model_arch='llama', architectures=['LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=['transformers>=4.45'], tags=[])",
351
- "model_dir": "/root/highspeedstorage/ft-volume/nemotron",
352
  "hub": "<class 'swift.hub.hub.HFHub'>",
353
- "training_args": "Seq2SeqTrainingArguments(output_dir='/root/dataDisk/output/v63-20250312-123826', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/root/dataDisk/output/v63-20250312-123826/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=10, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=10, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/root/dataDisk/output/v63-20250312-123826', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': 
False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id='TheAgenticAI/LLAMA-3.3-70B-Reasoning', hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=True, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': True}, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, 
push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
354
  }
 
1
  {
2
+ "model": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
3
  "model_type": "llama3_2",
4
  "model_revision": null,
5
  "task_type": "causal_lm",
 
11
  "local_repo_path": null,
12
  "template": "llama3_2",
13
  "system": "",
14
+ "max_length": 14000,
15
  "truncation_strategy": "left",
16
  "max_pixels": null,
17
  "tools_prompt": "react_en",
 
 
18
  "loss_scale": "default",
19
  "sequence_parallel_size": 1,
20
  "use_chat_template": true,
21
  "template_backend": "swift",
22
  "dataset": [
23
+ "dpo_data.jsonl"
24
  ],
25
  "val_dataset": [],
26
  "split_dataset_ratio": 0.03,
27
  "data_seed": 42,
28
  "dataset_num_proc": 1,
29
  "streaming": false,
30
+ "load_from_cache_file": false,
31
  "download_mode": "reuse_dataset_if_exists",
 
32
  "strict": false,
 
33
  "model_name": [
34
  null,
35
  null
 
55
  "stream": false,
56
  "stop_words": [],
57
  "logprobs": false,
58
+ "ckpt_dir": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
 
59
  "load_dataset_config": null,
60
  "lora_modules": [],
61
  "tuner_backend": "peft",
 
63
  "adapters": [],
64
  "seed": 42,
65
  "model_kwargs": {},
66
+ "load_args": true,
67
  "load_data_args": false,
68
  "use_hf": true,
69
  "hub_token": null,
70
  "custom_register_path": [],
71
  "ignore_args_error": false,
72
  "use_swift_lora": false,
73
+ "output_dir": "/root/dataDisk/output/v68-20250313-073537",
74
  "overwrite_output_dir": false,
75
  "do_train": false,
76
  "do_eval": false,
 
81
  "per_device_eval_batch_size": 1,
82
  "per_gpu_train_batch_size": null,
83
  "per_gpu_eval_batch_size": null,
84
+ "gradient_accumulation_steps": 1,
85
  "eval_accumulation_steps": null,
86
  "eval_delay": 0,
87
  "torch_empty_cache_steps": null,
 
100
  "log_level": "passive",
101
  "log_level_replica": "warning",
102
  "log_on_each_node": true,
103
+ "logging_dir": "/root/dataDisk/output/v68-20250313-073537/runs",
104
  "logging_strategy": "steps",
105
  "logging_first_step": true,
106
  "logging_steps": 1,
 
136
  "past_index": -1,
137
  "run_name": null,
138
  "disable_tqdm": null,
139
+ "remove_unused_columns": false,
140
  "label_names": null,
141
  "load_best_model_at_end": false,
142
  "metric_for_best_model": "loss",
 
164
  "zero_optimization": {
165
  "stage": 3,
166
  "offload_optimizer": {
167
+ "device": "none",
168
  "pin_memory": true
169
  },
170
  "offload_param": {
 
260
  "modules_to_save": [],
261
  "lora_rank": 512,
262
  "lora_alpha": 256,
263
+ "lora_dropout": 0.1,
264
  "lora_bias": "none",
265
  "lora_dtype": null,
266
  "lorap_lr_ratio": null,
 
322
  "metric_warmup_step": 0,
323
  "fsdp_num": 1,
324
  "acc_steps": 1,
 
 
 
 
 
325
  "add_version": true,
326
  "resume_only_model": false,
327
  "check_model": true,
328
+ "loss_type": null,
329
  "packing": false,
330
  "lazy_tokenize": false,
331
+ "acc_strategy": "token",
 
332
  "optimizer": null,
333
  "metric": null,
334
+ "rlhf_type": "orpo",
335
+ "ref_model": null,
336
+ "ref_model_type": null,
337
+ "ref_model_revision": null,
338
+ "beta": 0.1,
339
+ "label_smoothing": 0,
340
+ "rpo_alpha": 1.0,
341
+ "cpo_alpha": 1.0,
342
+ "simpo_gamma": 1,
343
+ "desirable_weight": 1.0,
344
+ "undesirable_weight": 1.0,
345
  "rank": 0,
 
346
  "local_world_size": 8,
347
+ "model_suffix": "checkpoint-160-merged",
348
+ "model_info": "ModelInfo(model_type='llama3_2', model_dir='/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type=None)",
349
+ "model_meta": "ModelMeta(model_type='llama3_2', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.2-1B', hf_model_id='meta-llama/Llama-3.2-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B', hf_model_id='meta-llama/Llama-3.2-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-1B-Instruct', hf_model_id='meta-llama/Llama-3.2-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B-Instruct', hf_model_id='meta-llama/Llama-3.2-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.3-70B-Instruct', hf_model_id='meta-llama/Llama-3.3-70B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', hf_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='llama3_2', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f03a47424d0>, model_arch='llama', architectures=['LlamaForCausalLM'], is_multimodal=False, additional_saved_files=[], torch_dtype=None, ignore_patterns=[], requires=['transformers>=4.45'], tags=[])",
350
+ "model_dir": "/root/dataDisk/output/v63-20250312-123826/checkpoint-160-merged",
351
  "hub": "<class 'swift.hub.hub.HFHub'>",
352
+ "training_args": "ORPOConfig(output_dir='/root/dataDisk/output/v68-20250313-073537', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/root/dataDisk/output/v68-20250313-073537/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=10, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=10, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/root/dataDisk/output/v68-20250313-073537', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 
fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id='TheAgenticAI/LLAMA-3.3-70B-Reasoning', hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=True, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': True}, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, 
push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, max_length=14000, max_prompt_length=None, max_completion_length=None, beta=0.1, disable_dropout=True, label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', generate_during_eval=False, is_encoder_decoder=False, model_init_kwargs=None, dataset_num_proc=1, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora')"
353
  }
logging.jsonl CHANGED
@@ -1,169 +1,21 @@
1
- {"loss": 1.05211091, "token_acc": 0.72991851, "grad_norm": 1.31669497, "learning_rate": 4.2e-07, "memory(GiB)": 48.25, "train_speed(iter/s)": 0.006813, "epoch": 0.00428036, "global_step/max_steps": "1/233", "percentage": "0.43%", "elapsed_time": "2m 0s", "remaining_time": "7h 47m 12s"}
2
- {"loss": 1.00822735, "token_acc": 0.76806084, "grad_norm": 1.31958413, "learning_rate": 8.3e-07, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007099, "epoch": 0.00856073, "global_step/max_steps": "2/233", "percentage": "0.86%", "elapsed_time": "4m 15s", "remaining_time": "8h 12m 20s"}
3
- {"loss": 1.00241852, "token_acc": 0.7587647, "grad_norm": 1.34375119, "learning_rate": 1.25e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00802, "epoch": 0.01284109, "global_step/max_steps": "3/233", "percentage": "1.29%", "elapsed_time": "5m 48s", "remaining_time": "7h 24m 47s"}
4
- {"loss": 0.93123716, "token_acc": 0.78994138, "grad_norm": 0.82043427, "learning_rate": 1.67e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008883, "epoch": 0.01712146, "global_step/max_steps": "4/233", "percentage": "1.72%", "elapsed_time": "7m 4s", "remaining_time": "6h 44m 54s"}
5
- {"loss": 0.89957178, "token_acc": 0.7818078, "grad_norm": 0.60323197, "learning_rate": 2.08e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00999, "epoch": 0.02140182, "global_step/max_steps": "5/233", "percentage": "2.15%", "elapsed_time": "7m 54s", "remaining_time": "6h 0m 39s"}
6
- {"loss": 0.80782145, "token_acc": 0.80445545, "grad_norm": 0.36674395, "learning_rate": 2.5e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.010656, "epoch": 0.02568218, "global_step/max_steps": "6/233", "percentage": "2.58%", "elapsed_time": "8m 57s", "remaining_time": "5h 38m 40s"}
7
- {"loss": 0.72589087, "token_acc": 0.80906443, "grad_norm": 0.37669304, "learning_rate": 2.92e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.010721, "epoch": 0.02996255, "global_step/max_steps": "7/233", "percentage": "3.00%", "elapsed_time": "10m 26s", "remaining_time": "5h 37m 22s"}
8
- {"loss": 0.74144351, "token_acc": 0.78184348, "grad_norm": 0.36026895, "learning_rate": 3.33e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.010667, "epoch": 0.03424291, "global_step/max_steps": "8/233", "percentage": "3.43%", "elapsed_time": "12m 4s", "remaining_time": "5h 39m 23s"}
9
- {"loss": 0.66971028, "token_acc": 0.79516012, "grad_norm": 0.28543693, "learning_rate": 3.75e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.010675, "epoch": 0.03852327, "global_step/max_steps": "9/233", "percentage": "3.86%", "elapsed_time": "13m 37s", "remaining_time": "5h 38m 57s"}
10
- {"loss": 0.67779553, "token_acc": 0.81477687, "grad_norm": 0.31390181, "learning_rate": 4.17e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.010289, "epoch": 0.04280364, "global_step/max_steps": "10/233", "percentage": "4.29%", "elapsed_time": "15m 45s", "remaining_time": "5h 51m 35s"}
11
- {"eval_loss": 0.68645704, "eval_token_acc": 0.79555826, "eval_runtime": 234.6965, "eval_samples_per_second": 1.968, "eval_steps_per_second": 0.247, "epoch": 0.04280364, "global_step/max_steps": "10/233", "percentage": "4.29%", "elapsed_time": "19m 40s", "remaining_time": "7h 18m 48s"}
12
- {"loss": 0.72095913, "token_acc": 0.79695404, "grad_norm": 0.20742597, "learning_rate": 4.58e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007664, "epoch": 0.047084, "global_step/max_steps": "11/233", "percentage": "4.72%", "elapsed_time": "23m 29s", "remaining_time": "7h 54m 1s"}
13
- {"loss": 0.65615392, "token_acc": 0.8028374, "grad_norm": 0.18942571, "learning_rate": 5e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007954, "epoch": 0.05136437, "global_step/max_steps": "12/233", "percentage": "5.15%", "elapsed_time": "24m 42s", "remaining_time": "7h 35m 5s"}
14
- {"loss": 0.69037592, "token_acc": 0.81485513, "grad_norm": 0.15860854, "learning_rate": 5.42e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008057, "epoch": 0.05564473, "global_step/max_steps": "13/233", "percentage": "5.58%", "elapsed_time": "26m 27s", "remaining_time": "7h 27m 46s"}
15
- {"loss": 0.65000862, "token_acc": 0.84315226, "grad_norm": 0.15301009, "learning_rate": 5.83e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007933, "epoch": 0.05992509, "global_step/max_steps": "14/233", "percentage": "6.01%", "elapsed_time": "28m 58s", "remaining_time": "7h 33m 19s"}
16
- {"loss": 0.62553513, "token_acc": 0.84415985, "grad_norm": 0.17639624, "learning_rate": 6.25e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008213, "epoch": 0.06420546, "global_step/max_steps": "15/233", "percentage": "6.44%", "elapsed_time": "30m 0s", "remaining_time": "7h 16m 7s"}
17
- {"loss": 0.63152444, "token_acc": 0.8206656, "grad_norm": 0.1590112, "learning_rate": 6.67e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008329, "epoch": 0.06848582, "global_step/max_steps": "16/233", "percentage": "6.87%", "elapsed_time": "31m 35s", "remaining_time": "7h 8m 21s"}
18
- {"loss": 0.65911484, "token_acc": 0.84129132, "grad_norm": 0.16948754, "learning_rate": 7.08e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008262, "epoch": 0.07276619, "global_step/max_steps": "17/233", "percentage": "7.30%", "elapsed_time": "33m 51s", "remaining_time": "7h 10m 13s"}
19
- {"loss": 0.59900677, "token_acc": 0.78798923, "grad_norm": 0.13815269, "learning_rate": 7.5e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008267, "epoch": 0.07704655, "global_step/max_steps": "18/233", "percentage": "7.73%", "elapsed_time": "35m 51s", "remaining_time": "7h 8m 16s"}
20
- {"loss": 0.60964179, "token_acc": 0.81759515, "grad_norm": 0.13055032, "learning_rate": 7.92e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.008345, "epoch": 0.08132691, "global_step/max_steps": "19/233", "percentage": "8.15%", "elapsed_time": "37m 30s", "remaining_time": "7h 2m 33s"}
21
- {"loss": 0.67515284, "token_acc": 0.81178476, "grad_norm": 0.1318536, "learning_rate": 8.33e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007989, "epoch": 0.08560728, "global_step/max_steps": "20/233", "percentage": "8.58%", "elapsed_time": "41m 17s", "remaining_time": "7h 19m 46s"}
22
- {"eval_loss": 0.6061545, "eval_token_acc": 0.80982449, "eval_runtime": 233.1289, "eval_samples_per_second": 1.982, "eval_steps_per_second": 0.249, "epoch": 0.08560728, "global_step/max_steps": "20/233", "percentage": "8.58%", "elapsed_time": "45m 10s", "remaining_time": "8h 1m 8s"}
23
- {"loss": 0.64495265, "token_acc": 0.81662134, "grad_norm": 0.11575121, "learning_rate": 8.75e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007161, "epoch": 0.08988764, "global_step/max_steps": "21/233", "percentage": "9.01%", "elapsed_time": "48m 26s", "remaining_time": "8h 9m 4s"}
24
- {"loss": 0.56515968, "token_acc": 0.80879168, "grad_norm": 0.14889614, "learning_rate": 9.17e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007331, "epoch": 0.094168, "global_step/max_steps": "22/233", "percentage": "9.44%", "elapsed_time": "49m 35s", "remaining_time": "7h 55m 34s"}
25
- {"loss": 0.61245996, "token_acc": 0.81094324, "grad_norm": 0.50602204, "learning_rate": 9.58e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007474, "epoch": 0.09844837, "global_step/max_steps": "23/233", "percentage": "9.87%", "elapsed_time": "50m 51s", "remaining_time": "7h 44m 18s"}
26
- {"loss": 0.62092638, "token_acc": 0.82057519, "grad_norm": 0.1378572, "learning_rate": 1e-05, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007544, "epoch": 0.10272873, "global_step/max_steps": "24/233", "percentage": "10.30%", "elapsed_time": "52m 35s", "remaining_time": "7h 37m 58s"}
27
- {"loss": 0.5545224, "token_acc": 0.84725024, "grad_norm": 0.13007531, "learning_rate": 9.95e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007633, "epoch": 0.1070091, "global_step/max_steps": "25/233", "percentage": "10.73%", "elapsed_time": "54m 9s", "remaining_time": "7h 30m 33s"}
28
- {"loss": 0.57198393, "token_acc": 0.84888653, "grad_norm": 0.42071208, "learning_rate": 9.9e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007755, "epoch": 0.11128946, "global_step/max_steps": "26/233", "percentage": "11.16%", "elapsed_time": "55m 26s", "remaining_time": "7h 21m 25s"}
29
- {"loss": 0.59884644, "token_acc": 0.81060166, "grad_norm": 0.1219664, "learning_rate": 9.86e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007849, "epoch": 0.11556982, "global_step/max_steps": "27/233", "percentage": "11.59%", "elapsed_time": "56m 53s", "remaining_time": "7h 14m 7s"}
30
- {"loss": 0.60813272, "token_acc": 0.8097131, "grad_norm": 0.14412673, "learning_rate": 9.81e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007949, "epoch": 0.11985019, "global_step/max_steps": "28/233", "percentage": "12.02%", "elapsed_time": "58m 16s", "remaining_time": "7h 6m 38s"}
31
- {"loss": 0.58964062, "token_acc": 0.82500748, "grad_norm": 0.18986589, "learning_rate": 9.76e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007997, "epoch": 0.12413055, "global_step/max_steps": "29/233", "percentage": "12.45%", "elapsed_time": "1h 0m 0s", "remaining_time": "7h 2m 7s"}
32
- {"loss": 0.59038109, "token_acc": 0.82640086, "grad_norm": 0.14620499, "learning_rate": 9.71e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00813, "epoch": 0.12841091, "global_step/max_steps": "30/233", "percentage": "12.88%", "elapsed_time": "1h 1m 4s", "remaining_time": "6h 53m 14s"}
33
- {"eval_loss": 0.57404786, "eval_token_acc": 0.81633597, "eval_runtime": 233.9161, "eval_samples_per_second": 1.975, "eval_steps_per_second": 0.248, "epoch": 0.12841091, "global_step/max_steps": "30/233", "percentage": "12.88%", "elapsed_time": "1h 4m 58s", "remaining_time": "7h 19m 37s"}
34
- {"loss": 0.61220074, "token_acc": 0.82002309, "grad_norm": 0.12467663, "learning_rate": 9.67e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007603, "epoch": 0.13269128, "global_step/max_steps": "31/233", "percentage": "13.30%", "elapsed_time": "1h 7m 31s", "remaining_time": "7h 20m 0s"}
35
- {"loss": 0.57690763, "token_acc": 0.82229942, "grad_norm": 0.12858853, "learning_rate": 9.62e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007548, "epoch": 0.13697164, "global_step/max_steps": "32/233", "percentage": "13.73%", "elapsed_time": "1h 10m 13s", "remaining_time": "7h 21m 8s"}
36
- {"loss": 0.57390118, "token_acc": 0.82656848, "grad_norm": 0.12490374, "learning_rate": 9.57e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007576, "epoch": 0.14125201, "global_step/max_steps": "33/233", "percentage": "14.16%", "elapsed_time": "1h 12m 9s", "remaining_time": "7h 17m 21s"}
37
- {"loss": 0.54597116, "token_acc": 0.81826962, "grad_norm": 0.14487278, "learning_rate": 9.52e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00766, "epoch": 0.14553237, "global_step/max_steps": "34/233", "percentage": "14.59%", "elapsed_time": "1h 13m 32s", "remaining_time": "7h 10m 26s"}
38
- {"loss": 0.60774434, "token_acc": 0.83905093, "grad_norm": 0.14599162, "learning_rate": 9.47e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007699, "epoch": 0.14981273, "global_step/max_steps": "35/233", "percentage": "15.02%", "elapsed_time": "1h 15m 20s", "remaining_time": "7h 6m 12s"}
39
- {"loss": 0.58154261, "token_acc": 0.8071638, "grad_norm": 0.15756345, "learning_rate": 9.43e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.0076, "epoch": 0.1540931, "global_step/max_steps": "36/233", "percentage": "15.45%", "elapsed_time": "1h 18m 31s", "remaining_time": "7h 9m 40s"}
40
- {"loss": 0.54503143, "token_acc": 0.82044124, "grad_norm": 0.12564433, "learning_rate": 9.38e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00764, "epoch": 0.15837346, "global_step/max_steps": "37/233", "percentage": "15.88%", "elapsed_time": "1h 20m 16s", "remaining_time": "7h 5m 16s"}
41
- {"loss": 0.54243863, "token_acc": 0.8351083, "grad_norm": 0.13263634, "learning_rate": 9.33e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007717, "epoch": 0.16265383, "global_step/max_steps": "38/233", "percentage": "16.31%", "elapsed_time": "1h 21m 38s", "remaining_time": "6h 58m 54s"}
42
- {"loss": 0.51341844, "token_acc": 0.84136878, "grad_norm": 0.15488632, "learning_rate": 9.28e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.0078, "epoch": 0.16693419, "global_step/max_steps": "39/233", "percentage": "16.74%", "elapsed_time": "1h 22m 53s", "remaining_time": "6h 52m 22s"}
43
- {"loss": 0.5711, "token_acc": 0.81400517, "grad_norm": 0.14193577, "learning_rate": 9.23e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007837, "epoch": 0.17121455, "global_step/max_steps": "40/233", "percentage": "17.17%", "elapsed_time": "1h 24m 37s", "remaining_time": "6h 48m 20s"}
44
- {"eval_loss": 0.55634356, "eval_token_acc": 0.82101112, "eval_runtime": 233.8969, "eval_samples_per_second": 1.975, "eval_steps_per_second": 0.248, "epoch": 0.17121455, "global_step/max_steps": "40/233", "percentage": "17.17%", "elapsed_time": "1h 28m 31s", "remaining_time": "7h 7m 9s"}
45
- {"loss": 0.55852044, "token_acc": 0.82246999, "grad_norm": 0.13955821, "learning_rate": 9.19e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007417, "epoch": 0.17549492, "global_step/max_steps": "41/233", "percentage": "17.60%", "elapsed_time": "1h 31m 42s", "remaining_time": "7h 9m 26s"}
46
- {"loss": 0.5492382, "token_acc": 0.81834246, "grad_norm": 0.18386385, "learning_rate": 9.14e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00745, "epoch": 0.17977528, "global_step/max_steps": "42/233", "percentage": "18.03%", "elapsed_time": "1h 33m 31s", "remaining_time": "7h 5m 20s"}
47
- {"loss": 0.53807271, "token_acc": 0.83674288, "grad_norm": 0.17359699, "learning_rate": 9.09e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007485, "epoch": 0.18405564, "global_step/max_steps": "43/233", "percentage": "18.45%", "elapsed_time": "1h 35m 18s", "remaining_time": "7h 1m 9s"}
48
- {"loss": 0.54787058, "token_acc": 0.83625366, "grad_norm": 0.10988069, "learning_rate": 9.04e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007475, "epoch": 0.18833601, "global_step/max_steps": "44/233", "percentage": "18.88%", "elapsed_time": "1h 37m 40s", "remaining_time": "6h 59m 33s"}
49
- {"loss": 0.5589273, "token_acc": 0.84398263, "grad_norm": 0.11602305, "learning_rate": 9e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007511, "epoch": 0.19261637, "global_step/max_steps": "45/233", "percentage": "19.31%", "elapsed_time": "1h 39m 24s", "remaining_time": "6h 55m 20s"}
50
- {"loss": 0.53926349, "token_acc": 0.83050162, "grad_norm": 0.12073734, "learning_rate": 8.95e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007563, "epoch": 0.19689674, "global_step/max_steps": "46/233", "percentage": "19.74%", "elapsed_time": "1h 40m 56s", "remaining_time": "6h 50m 21s"}
51
- {"loss": 0.5202747, "token_acc": 0.83425064, "grad_norm": 0.11006246, "learning_rate": 8.9e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007615, "epoch": 0.2011771, "global_step/max_steps": "47/233", "percentage": "20.17%", "elapsed_time": "1h 42m 26s", "remaining_time": "6h 45m 24s"}
52
- {"loss": 0.54124808, "token_acc": 0.85483645, "grad_norm": 0.11072037, "learning_rate": 8.85e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007602, "epoch": 0.20545746, "global_step/max_steps": "48/233", "percentage": "20.60%", "elapsed_time": "1h 44m 48s", "remaining_time": "6h 43m 56s"}
53
- {"loss": 0.52816677, "token_acc": 0.82739567, "grad_norm": 0.11390109, "learning_rate": 8.8e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007721, "epoch": 0.20973783, "global_step/max_steps": "49/233", "percentage": "21.03%", "elapsed_time": "1h 45m 20s", "remaining_time": "6h 35m 34s"}
54
- {"loss": 0.50078875, "token_acc": 0.8451592, "grad_norm": 0.10181949, "learning_rate": 8.76e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00775, "epoch": 0.21401819, "global_step/max_steps": "50/233", "percentage": "21.46%", "elapsed_time": "1h 47m 5s", "remaining_time": "6h 31m 57s"}
55
- {"eval_loss": 0.54290175, "eval_token_acc": 0.82446886, "eval_runtime": 233.5934, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.21401819, "global_step/max_steps": "50/233", "percentage": "21.46%", "elapsed_time": "1h 50m 59s", "remaining_time": "6h 46m 12s"}
56
- {"loss": 0.53490806, "token_acc": 0.82852007, "grad_norm": 0.26775351, "learning_rate": 8.71e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00748, "epoch": 0.21829856, "global_step/max_steps": "51/233", "percentage": "21.89%", "elapsed_time": "1h 53m 11s", "remaining_time": "6h 43m 57s"}
57
- {"loss": 0.54204899, "token_acc": 0.83737674, "grad_norm": 0.18461479, "learning_rate": 8.66e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00755, "epoch": 0.22257892, "global_step/max_steps": "52/233", "percentage": "22.32%", "elapsed_time": "1h 54m 21s", "remaining_time": "6h 38m 4s"}
58
- {"loss": 0.57908702, "token_acc": 0.84690497, "grad_norm": 0.11942858, "learning_rate": 8.61e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007558, "epoch": 0.22685928, "global_step/max_steps": "53/233", "percentage": "22.75%", "elapsed_time": "1h 56m 26s", "remaining_time": "6h 35m 27s"}
59
- {"loss": 0.52622569, "token_acc": 0.82521804, "grad_norm": 0.13052414, "learning_rate": 8.56e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007578, "epoch": 0.23113965, "global_step/max_steps": "54/233", "percentage": "23.18%", "elapsed_time": "1h 58m 20s", "remaining_time": "6h 32m 16s"}
60
- {"loss": 0.56600893, "token_acc": 0.83141038, "grad_norm": 0.11560788, "learning_rate": 8.52e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007591, "epoch": 0.23542001, "global_step/max_steps": "55/233", "percentage": "23.61%", "elapsed_time": "2h 0m 19s", "remaining_time": "6h 29m 23s"}
61
- {"loss": 0.51500416, "token_acc": 0.82068742, "grad_norm": 0.17905267, "learning_rate": 8.47e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007629, "epoch": 0.23970037, "global_step/max_steps": "56/233", "percentage": "24.03%", "elapsed_time": "2h 1m 53s", "remaining_time": "6h 25m 17s"}
62
- {"loss": 0.52793306, "token_acc": 0.86082048, "grad_norm": 0.12558642, "learning_rate": 8.42e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007673, "epoch": 0.24398074, "global_step/max_steps": "57/233", "percentage": "24.46%", "elapsed_time": "2h 3m 23s", "remaining_time": "6h 20m 58s"}
63
- {"loss": 0.51401901, "token_acc": 0.82928567, "grad_norm": 0.1479127, "learning_rate": 8.37e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007701, "epoch": 0.2482611, "global_step/max_steps": "58/233", "percentage": "24.89%", "elapsed_time": "2h 5m 5s", "remaining_time": "6h 17m 27s"}
64
- {"loss": 0.54972792, "token_acc": 0.82862575, "grad_norm": 0.13245791, "learning_rate": 8.33e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007743, "epoch": 0.25254147, "global_step/max_steps": "59/233", "percentage": "25.32%", "elapsed_time": "2h 6m 34s", "remaining_time": "6h 13m 15s"}
65
- {"loss": 0.50546074, "token_acc": 0.84307301, "grad_norm": 0.12975469, "learning_rate": 8.28e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007704, "epoch": 0.25682183, "global_step/max_steps": "60/233", "percentage": "25.75%", "elapsed_time": "2h 9m 22s", "remaining_time": "6h 13m 0s"}
66
- {"eval_loss": 0.5343886, "eval_token_acc": 0.82678127, "eval_runtime": 233.6975, "eval_samples_per_second": 1.977, "eval_steps_per_second": 0.248, "epoch": 0.25682183, "global_step/max_steps": "60/233", "percentage": "25.75%", "elapsed_time": "2h 13m 15s", "remaining_time": "6h 24m 14s"}
67
- {"loss": 0.534343, "token_acc": 0.82905063, "grad_norm": 0.13970451, "learning_rate": 8.23e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007459, "epoch": 0.26110219, "global_step/max_steps": "61/233", "percentage": "26.18%", "elapsed_time": "2h 15m 52s", "remaining_time": "6h 23m 6s"}
68
- {"loss": 0.51031673, "token_acc": 0.83080435, "grad_norm": 0.11731356, "learning_rate": 8.18e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007501, "epoch": 0.26538256, "global_step/max_steps": "62/233", "percentage": "26.61%", "elapsed_time": "2h 17m 19s", "remaining_time": "6h 18m 44s"}
69
- {"loss": 0.56087667, "token_acc": 0.85323266, "grad_norm": 0.12921853, "learning_rate": 8.13e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007493, "epoch": 0.26966292, "global_step/max_steps": "63/233", "percentage": "27.04%", "elapsed_time": "2h 19m 41s", "remaining_time": "6h 16m 56s"}
70
- {"loss": 0.5064438, "token_acc": 0.86674917, "grad_norm": 0.11894882, "learning_rate": 8.09e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007516, "epoch": 0.27394329, "global_step/max_steps": "64/233", "percentage": "27.47%", "elapsed_time": "2h 21m 28s", "remaining_time": "6h 13m 35s"}
71
- {"loss": 0.49739748, "token_acc": 0.84879067, "grad_norm": 0.26122409, "learning_rate": 8.04e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007601, "epoch": 0.27822365, "global_step/max_steps": "65/233", "percentage": "27.90%", "elapsed_time": "2h 22m 5s", "remaining_time": "6h 7m 14s"}
72
- {"loss": 0.54737341, "token_acc": 0.82416686, "grad_norm": 0.10891951, "learning_rate": 7.99e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007629, "epoch": 0.28250401, "global_step/max_steps": "66/233", "percentage": "28.33%", "elapsed_time": "2h 23m 44s", "remaining_time": "6h 3m 43s"}
73
- {"loss": 0.56025583, "token_acc": 0.790977, "grad_norm": 0.1199242, "learning_rate": 7.94e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007675, "epoch": 0.28678438, "global_step/max_steps": "67/233", "percentage": "28.76%", "elapsed_time": "2h 25m 4s", "remaining_time": "5h 59m 25s"}
74
- {"loss": 0.51193327, "token_acc": 0.84465331, "grad_norm": 0.1607635, "learning_rate": 7.89e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007679, "epoch": 0.29106474, "global_step/max_steps": "68/233", "percentage": "29.18%", "elapsed_time": "2h 27m 9s", "remaining_time": "5h 57m 5s"}
75
- {"loss": 0.52289128, "token_acc": 0.83648469, "grad_norm": 0.12519571, "learning_rate": 7.85e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007698, "epoch": 0.2953451, "global_step/max_steps": "69/233", "percentage": "29.61%", "elapsed_time": "2h 28m 57s", "remaining_time": "5h 54m 2s"}
76
- {"loss": 0.50253069, "token_acc": 0.83147491, "grad_norm": 0.10590418, "learning_rate": 7.8e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007715, "epoch": 0.29962547, "global_step/max_steps": "70/233", "percentage": "30.04%", "elapsed_time": "2h 30m 47s", "remaining_time": "5h 51m 7s"}
77
- {"eval_loss": 0.52436066, "eval_token_acc": 0.82876859, "eval_runtime": 233.6281, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.29962547, "global_step/max_steps": "70/233", "percentage": "30.04%", "elapsed_time": "2h 34m 40s", "remaining_time": "6h 0m 11s"}
78
- {"loss": 0.48546743, "token_acc": 0.83361594, "grad_norm": 0.10634065, "learning_rate": 7.75e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007503, "epoch": 0.30390583, "global_step/max_steps": "71/233", "percentage": "30.47%", "elapsed_time": "2h 37m 16s", "remaining_time": "5h 58m 51s"}
79
- {"loss": 0.52685374, "token_acc": 0.82080856, "grad_norm": 0.12289236, "learning_rate": 7.7e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007523, "epoch": 0.3081862, "global_step/max_steps": "72/233", "percentage": "30.90%", "elapsed_time": "2h 39m 5s", "remaining_time": "5h 55m 44s"}
80
- {"loss": 0.50480282, "token_acc": 0.83330189, "grad_norm": 0.1234926, "learning_rate": 7.66e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007574, "epoch": 0.31246656, "global_step/max_steps": "73/233", "percentage": "31.33%", "elapsed_time": "2h 40m 12s", "remaining_time": "5h 51m 7s"}
81
- {"loss": 0.52709985, "token_acc": 0.83077331, "grad_norm": 0.20998599, "learning_rate": 7.61e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.00758, "epoch": 0.31674692, "global_step/max_steps": "74/233", "percentage": "31.76%", "elapsed_time": "2h 42m 16s", "remaining_time": "5h 48m 39s"}
82
- {"loss": 0.51562619, "token_acc": 0.85752608, "grad_norm": 0.11690234, "learning_rate": 7.56e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007622, "epoch": 0.32102729, "global_step/max_steps": "75/233", "percentage": "32.19%", "elapsed_time": "2h 43m 34s", "remaining_time": "5h 44m 34s"}
83
- {"loss": 0.53282851, "token_acc": 0.83144705, "grad_norm": 0.1492236, "learning_rate": 7.51e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007626, "epoch": 0.32530765, "global_step/max_steps": "76/233", "percentage": "32.62%", "elapsed_time": "2h 45m 39s", "remaining_time": "5h 42m 13s"}
84
- {"loss": 0.53681839, "token_acc": 0.8160215, "grad_norm": 0.12850326, "learning_rate": 7.46e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007615, "epoch": 0.32958801, "global_step/max_steps": "77/233", "percentage": "33.05%", "elapsed_time": "2h 48m 5s", "remaining_time": "5h 40m 32s"}
85
- {"loss": 0.53259099, "token_acc": 0.81180987, "grad_norm": 0.12752953, "learning_rate": 7.42e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007602, "epoch": 0.33386838, "global_step/max_steps": "78/233", "percentage": "33.48%", "elapsed_time": "2h 50m 34s", "remaining_time": "5h 38m 58s"}
86
- {"loss": 0.53562546, "token_acc": 0.82184946, "grad_norm": 0.15158969, "learning_rate": 7.37e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007612, "epoch": 0.33814874, "global_step/max_steps": "79/233", "percentage": "33.91%", "elapsed_time": "2h 52m 32s", "remaining_time": "5h 36m 21s"}
87
- {"loss": 0.51222068, "token_acc": 0.84699546, "grad_norm": 0.1310516, "learning_rate": 7.32e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007659, "epoch": 0.34242911, "global_step/max_steps": "80/233", "percentage": "34.33%", "elapsed_time": "2h 53m 39s", "remaining_time": "5h 32m 7s"}
88
- {"eval_loss": 0.51669514, "eval_token_acc": 0.83052194, "eval_runtime": 234.1299, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.248, "epoch": 0.34242911, "global_step/max_steps": "80/233", "percentage": "34.33%", "elapsed_time": "2h 57m 33s", "remaining_time": "5h 39m 35s"}
89
- {"loss": 0.53691947, "token_acc": 0.83763546, "grad_norm": 0.11690947, "learning_rate": 7.27e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007443, "epoch": 0.34670947, "global_step/max_steps": "81/233", "percentage": "34.76%", "elapsed_time": "3h 0m 57s", "remaining_time": "5h 39m 34s"}
90
- {"loss": 0.50225633, "token_acc": 0.85966191, "grad_norm": 0.11442987, "learning_rate": 7.22e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007462, "epoch": 0.35098983, "global_step/max_steps": "82/233", "percentage": "35.19%", "elapsed_time": "3h 2m 42s", "remaining_time": "5h 36m 27s"}
91
- {"loss": 0.4828037, "token_acc": 0.82932011, "grad_norm": 0.13160026, "learning_rate": 7.18e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007516, "epoch": 0.3552702, "global_step/max_steps": "83/233", "percentage": "35.62%", "elapsed_time": "3h 3m 37s", "remaining_time": "5h 31m 51s"}
92
- {"loss": 0.51871783, "token_acc": 0.84281236, "grad_norm": 0.1129145, "learning_rate": 7.13e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007533, "epoch": 0.35955056, "global_step/max_steps": "84/233", "percentage": "36.05%", "elapsed_time": "3h 5m 25s", "remaining_time": "5h 28m 55s"}
93
- {"loss": 0.51214552, "token_acc": 0.82529789, "grad_norm": 0.1373262, "learning_rate": 7.08e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007551, "epoch": 0.36383093, "global_step/max_steps": "85/233", "percentage": "36.48%", "elapsed_time": "3h 7m 11s", "remaining_time": "5h 25m 55s"}
94
- {"loss": 0.49339923, "token_acc": 0.82989313, "grad_norm": 0.11337092, "learning_rate": 7.03e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007587, "epoch": 0.36811129, "global_step/max_steps": "86/233", "percentage": "36.91%", "elapsed_time": "3h 8m 29s", "remaining_time": "5h 22m 11s"}
95
- {"loss": 0.48843583, "token_acc": 0.84886903, "grad_norm": 0.15754931, "learning_rate": 6.99e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007588, "epoch": 0.37239165, "global_step/max_steps": "87/233", "percentage": "37.34%", "elapsed_time": "3h 10m 39s", "remaining_time": "5h 19m 56s"}
96
- {"loss": 0.53001589, "token_acc": 0.83002833, "grad_norm": 0.15598585, "learning_rate": 6.94e-06, "memory(GiB)": 128.1, "train_speed(iter/s)": 0.007635, "epoch": 0.37667202, "global_step/max_steps": "88/233", "percentage": "37.77%", "elapsed_time": "3h 11m 39s", "remaining_time": "5h 15m 48s"}
97
- {"loss": 0.50424647, "token_acc": 0.83222175, "grad_norm": 0.12133142, "learning_rate": 6.89e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007628, "epoch": 0.38095238, "global_step/max_steps": "89/233", "percentage": "38.20%", "elapsed_time": "3h 14m 2s", "remaining_time": "5h 13m 57s"}
98
- {"loss": 0.49213964, "token_acc": 0.82991934, "grad_norm": 0.10147729, "learning_rate": 6.84e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007664, "epoch": 0.38523274, "global_step/max_steps": "90/233", "percentage": "38.63%", "elapsed_time": "3h 15m 16s", "remaining_time": "5h 10m 16s"}
99
- {"eval_loss": 0.51017594, "eval_token_acc": 0.83137481, "eval_runtime": 234.0416, "eval_samples_per_second": 1.974, "eval_steps_per_second": 0.248, "epoch": 0.38523274, "global_step/max_steps": "90/233", "percentage": "38.63%", "elapsed_time": "3h 19m 10s", "remaining_time": "5h 16m 28s"}
100
- {"loss": 0.49563336, "token_acc": 0.83621232, "grad_norm": 0.11565997, "learning_rate": 6.79e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007508, "epoch": 0.38951311, "global_step/max_steps": "91/233", "percentage": "39.06%", "elapsed_time": "3h 21m 34s", "remaining_time": "5h 14m 33s"}
101
- {"loss": 0.52433681, "token_acc": 0.82337625, "grad_norm": 0.12434755, "learning_rate": 6.75e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007493, "epoch": 0.39379347, "global_step/max_steps": "92/233", "percentage": "39.48%", "elapsed_time": "3h 24m 12s", "remaining_time": "5h 12m 57s"}
102
- {"loss": 0.5221467, "token_acc": 0.8442623, "grad_norm": 0.11333634, "learning_rate": 6.7e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007489, "epoch": 0.39807384, "global_step/max_steps": "93/233", "percentage": "39.91%", "elapsed_time": "3h 26m 32s", "remaining_time": "5h 10m 55s"}
103
- {"loss": 0.50553328, "token_acc": 0.8583166, "grad_norm": 0.13680269, "learning_rate": 6.65e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00748, "epoch": 0.4023542, "global_step/max_steps": "94/233", "percentage": "40.34%", "elapsed_time": "3h 29m 0s", "remaining_time": "5h 9m 3s"}
104
- {"loss": 0.49891001, "token_acc": 0.855139, "grad_norm": 0.08764607, "learning_rate": 6.6e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007474, "epoch": 0.40663456, "global_step/max_steps": "95/233", "percentage": "40.77%", "elapsed_time": "3h 31m 24s", "remaining_time": "5h 7m 6s"}
105
- {"loss": 0.51290613, "token_acc": 0.84636644, "grad_norm": 0.12013482, "learning_rate": 6.56e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00751, "epoch": 0.41091493, "global_step/max_steps": "96/233", "percentage": "41.20%", "elapsed_time": "3h 32m 36s", "remaining_time": "5h 3m 24s"}
106
- {"loss": 0.49172401, "token_acc": 0.8548566, "grad_norm": 0.13774136, "learning_rate": 6.51e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007528, "epoch": 0.41519529, "global_step/max_steps": "97/233", "percentage": "41.63%", "elapsed_time": "3h 34m 18s", "remaining_time": "5h 0m 28s"}
107
- {"loss": 0.55053842, "token_acc": 0.8340143, "grad_norm": 0.13488252, "learning_rate": 6.46e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007554, "epoch": 0.41947566, "global_step/max_steps": "98/233", "percentage": "42.06%", "elapsed_time": "3h 35m 47s", "remaining_time": "4h 57m 15s"}
108
- {"loss": 0.5442782, "token_acc": 0.84363808, "grad_norm": 0.10564359, "learning_rate": 6.41e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007526, "epoch": 0.42375602, "global_step/max_steps": "99/233", "percentage": "42.49%", "elapsed_time": "3h 38m 47s", "remaining_time": "4h 56m 8s"}
109
- {"loss": 0.51258826, "token_acc": 0.83983287, "grad_norm": 0.17585698, "learning_rate": 6.36e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007543, "epoch": 0.42803638, "global_step/max_steps": "100/233", "percentage": "42.92%", "elapsed_time": "3h 40m 31s", "remaining_time": "4h 53m 18s"}
110
- {"eval_loss": 0.50452524, "eval_token_acc": 0.83303703, "eval_runtime": 234.2336, "eval_samples_per_second": 1.972, "eval_steps_per_second": 0.248, "epoch": 0.42803638, "global_step/max_steps": "100/233", "percentage": "42.92%", "elapsed_time": "3h 44m 26s", "remaining_time": "4h 58m 30s"}
111
- {"loss": 0.50650519, "token_acc": 0.83758287, "grad_norm": 0.12161291, "learning_rate": 6.32e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007409, "epoch": 0.43231675, "global_step/max_steps": "101/233", "percentage": "43.35%", "elapsed_time": "3h 46m 45s", "remaining_time": "4h 56m 21s"}
112
- {"loss": 0.46387315, "token_acc": 0.84282257, "grad_norm": 0.17145611, "learning_rate": 6.27e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007454, "epoch": 0.43659711, "global_step/max_steps": "102/233", "percentage": "43.78%", "elapsed_time": "3h 47m 37s", "remaining_time": "4h 52m 20s"}
113
- {"loss": 0.51760161, "token_acc": 0.84220632, "grad_norm": 0.12423951, "learning_rate": 6.22e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007479, "epoch": 0.44087747, "global_step/max_steps": "103/233", "percentage": "44.21%", "elapsed_time": "3h 49m 5s", "remaining_time": "4h 49m 9s"}
114
- {"loss": 0.48974538, "token_acc": 0.85497655, "grad_norm": 0.11668554, "learning_rate": 6.17e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007488, "epoch": 0.44515784, "global_step/max_steps": "104/233", "percentage": "44.64%", "elapsed_time": "3h 51m 2s", "remaining_time": "4h 46m 34s"}
115
- {"loss": 0.46702838, "token_acc": 0.84152495, "grad_norm": 0.12153333, "learning_rate": 6.12e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00752, "epoch": 0.4494382, "global_step/max_steps": "105/233", "percentage": "45.06%", "elapsed_time": "3h 52m 17s", "remaining_time": "4h 43m 10s"}
116
- {"loss": 0.47563401, "token_acc": 0.84894142, "grad_norm": 0.11127526, "learning_rate": 6.08e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007559, "epoch": 0.45371857, "global_step/max_steps": "106/233", "percentage": "45.49%", "elapsed_time": "3h 53m 17s", "remaining_time": "4h 39m 30s"}
117
- {"loss": 0.56198275, "token_acc": 0.83849832, "grad_norm": 0.11438177, "learning_rate": 6.03e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.45799893, "global_step/max_steps": "107/233", "percentage": "45.92%", "elapsed_time": "3h 55m 7s", "remaining_time": "4h 36m 53s"}
118
- {"loss": 0.53137517, "token_acc": 0.83432408, "grad_norm": 0.11901586, "learning_rate": 5.98e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.46227929, "global_step/max_steps": "108/233", "percentage": "46.35%", "elapsed_time": "3h 57m 20s", "remaining_time": "4h 34m 42s"}
119
- {"loss": 0.49352676, "token_acc": 0.84623256, "grad_norm": 0.11566687, "learning_rate": 5.93e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007602, "epoch": 0.46655966, "global_step/max_steps": "109/233", "percentage": "46.78%", "elapsed_time": "3h 58m 33s", "remaining_time": "4h 31m 22s"}
120
- {"loss": 0.45819372, "token_acc": 0.84602131, "grad_norm": 0.1290195, "learning_rate": 5.89e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007643, "epoch": 0.47084002, "global_step/max_steps": "110/233", "percentage": "47.21%", "elapsed_time": "3h 59m 26s", "remaining_time": "4h 27m 44s"}
121
- {"eval_loss": 0.49978775, "eval_token_acc": 0.83414018, "eval_runtime": 233.5668, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.47084002, "global_step/max_steps": "110/233", "percentage": "47.21%", "elapsed_time": "4h 3m 20s", "remaining_time": "4h 32m 5s"}
122
- {"loss": 0.49648049, "token_acc": 0.83760248, "grad_norm": 0.12439388, "learning_rate": 5.84e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007504, "epoch": 0.47512039, "global_step/max_steps": "111/233", "percentage": "47.64%", "elapsed_time": "4h 6m 7s", "remaining_time": "4h 30m 30s"}
123
- {"loss": 0.54757476, "token_acc": 0.84584637, "grad_norm": 0.11718772, "learning_rate": 5.79e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007514, "epoch": 0.47940075, "global_step/max_steps": "112/233", "percentage": "48.07%", "elapsed_time": "4h 8m 0s", "remaining_time": "4h 27m 56s"}
124
- {"loss": 0.4847149, "token_acc": 0.84780194, "grad_norm": 0.11900615, "learning_rate": 5.74e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00753, "epoch": 0.48368111, "global_step/max_steps": "113/233", "percentage": "48.50%", "elapsed_time": "4h 9m 40s", "remaining_time": "4h 25m 8s"}
125
- {"loss": 0.54250658, "token_acc": 0.8307074, "grad_norm": 0.13745169, "learning_rate": 5.69e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007533, "epoch": 0.48796148, "global_step/max_steps": "114/233", "percentage": "48.93%", "elapsed_time": "4h 11m 48s", "remaining_time": "4h 22m 50s"}
126
- {"loss": 0.82845831, "token_acc": 0.83484427, "grad_norm": 0.12465348, "learning_rate": 5.65e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007558, "epoch": 0.49224184, "global_step/max_steps": "115/233", "percentage": "49.36%", "elapsed_time": "4h 13m 10s", "remaining_time": "4h 19m 46s"}
127
- {"loss": 0.50414205, "token_acc": 0.855043, "grad_norm": 0.15168351, "learning_rate": 5.6e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00753, "epoch": 0.4965222, "global_step/max_steps": "116/233", "percentage": "49.79%", "elapsed_time": "4h 16m 18s", "remaining_time": "4h 18m 30s"}
128
- {"loss": 0.47190434, "token_acc": 0.83568548, "grad_norm": 0.12488054, "learning_rate": 5.55e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007556, "epoch": 0.50080257, "global_step/max_steps": "117/233", "percentage": "50.21%", "elapsed_time": "4h 17m 37s", "remaining_time": "4h 15m 25s"}
129
- {"loss": 0.52818644, "token_acc": 0.84974471, "grad_norm": 0.24668963, "learning_rate": 5.5e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00759, "epoch": 0.50508293, "global_step/max_steps": "118/233", "percentage": "50.64%", "elapsed_time": "4h 18m 39s", "remaining_time": "4h 12m 5s"}
130
- {"loss": 0.48247573, "token_acc": 0.8508541, "grad_norm": 0.12629485, "learning_rate": 5.45e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007587, "epoch": 0.5093633, "global_step/max_steps": "119/233", "percentage": "51.07%", "elapsed_time": "4h 20m 59s", "remaining_time": "4h 10m 1s"}
131
- {"loss": 0.50020522, "token_acc": 0.84359375, "grad_norm": 0.1088229, "learning_rate": 5.41e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007567, "epoch": 0.51364366, "global_step/max_steps": "120/233", "percentage": "51.50%", "elapsed_time": "4h 23m 52s", "remaining_time": "4h 8m 28s"}
132
- {"eval_loss": 0.49559635, "eval_token_acc": 0.8350121, "eval_runtime": 234.1304, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.248, "epoch": 0.51364366, "global_step/max_steps": "120/233", "percentage": "51.50%", "elapsed_time": "4h 27m 46s", "remaining_time": "4h 12m 9s"}
133
- {"loss": 0.46793276, "token_acc": 0.84055563, "grad_norm": 0.16117993, "learning_rate": 5.36e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007476, "epoch": 0.51792402, "global_step/max_steps": "121/233", "percentage": "51.93%", "elapsed_time": "4h 29m 19s", "remaining_time": "4h 9m 17s"}
134
- {"loss": 0.49023747, "token_acc": 0.8382855, "grad_norm": 0.13395602, "learning_rate": 5.31e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007493, "epoch": 0.52220439, "global_step/max_steps": "122/233", "percentage": "52.36%", "elapsed_time": "4h 30m 55s", "remaining_time": "4h 6m 30s"}
135
- {"loss": 0.47423121, "token_acc": 0.83460689, "grad_norm": 0.14448754, "learning_rate": 5.26e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007516, "epoch": 0.52648475, "global_step/max_steps": "123/233", "percentage": "52.79%", "elapsed_time": "4h 32m 18s", "remaining_time": "4h 3m 31s"}
136
- {"loss": 0.48822367, "token_acc": 0.85513403, "grad_norm": 0.21798755, "learning_rate": 5.22e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007509, "epoch": 0.53076512, "global_step/max_steps": "124/233", "percentage": "53.22%", "elapsed_time": "4h 34m 46s", "remaining_time": "4h 1m 32s"}
137
- {"loss": 0.49620324, "token_acc": 0.86353582, "grad_norm": 0.1151548, "learning_rate": 5.17e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00754, "epoch": 0.53504548, "global_step/max_steps": "125/233", "percentage": "53.65%", "elapsed_time": "4h 35m 52s", "remaining_time": "3h 58m 21s"}
138
- {"loss": 0.5336532, "token_acc": 0.81213332, "grad_norm": 0.1721856, "learning_rate": 5.12e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007551, "epoch": 0.53932584, "global_step/max_steps": "126/233", "percentage": "54.08%", "elapsed_time": "4h 37m 41s", "remaining_time": "3h 55m 49s"}
139
- {"loss": 0.49009401, "token_acc": 0.86326662, "grad_norm": 0.11931138, "learning_rate": 5.07e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.54360621, "global_step/max_steps": "127/233", "percentage": "54.51%", "elapsed_time": "4h 39m 11s", "remaining_time": "3h 53m 1s"}
140
- {"loss": 0.52549112, "token_acc": 0.83741613, "grad_norm": 0.13655488, "learning_rate": 5.02e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007581, "epoch": 0.54788657, "global_step/max_steps": "128/233", "percentage": "54.94%", "elapsed_time": "4h 40m 58s", "remaining_time": "3h 50m 29s"}
141
- {"loss": 0.5040428, "token_acc": 0.85339168, "grad_norm": 0.13167702, "learning_rate": 4.98e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007592, "epoch": 0.55216693, "global_step/max_steps": "129/233", "percentage": "55.36%", "elapsed_time": "4h 42m 45s", "remaining_time": "3h 47m 57s"}
142
- {"loss": 0.4640165, "token_acc": 0.84589084, "grad_norm": 0.13566221, "learning_rate": 4.93e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007614, "epoch": 0.5564473, "global_step/max_steps": "130/233", "percentage": "55.79%", "elapsed_time": "4h 44m 7s", "remaining_time": "3h 45m 6s"}
143
- {"eval_loss": 0.49200445, "eval_token_acc": 0.83594251, "eval_runtime": 234.1648, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.248, "epoch": 0.5564473, "global_step/max_steps": "130/233", "percentage": "55.79%", "elapsed_time": "4h 48m 1s", "remaining_time": "3h 48m 12s"}
144
- {"loss": 0.48091298, "token_acc": 0.83779916, "grad_norm": 0.12161104, "learning_rate": 4.88e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007505, "epoch": 0.56072766, "global_step/max_steps": "131/233", "percentage": "56.22%", "elapsed_time": "4h 50m 28s", "remaining_time": "3h 46m 9s"}
145
- {"loss": 0.49842244, "token_acc": 0.81520841, "grad_norm": 0.12285878, "learning_rate": 4.83e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007538, "epoch": 0.56500803, "global_step/max_steps": "132/233", "percentage": "56.65%", "elapsed_time": "4h 51m 25s", "remaining_time": "3h 42m 59s"}
146
- {"loss": 0.48106787, "token_acc": 0.80721846, "grad_norm": 0.13136606, "learning_rate": 4.78e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007557, "epoch": 0.56928839, "global_step/max_steps": "133/233", "percentage": "57.08%", "elapsed_time": "4h 52m 53s", "remaining_time": "3h 40m 13s"}
147
- {"loss": 0.48102638, "token_acc": 0.85745879, "grad_norm": 0.13375333, "learning_rate": 4.74e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007588, "epoch": 0.57356875, "global_step/max_steps": "134/233", "percentage": "57.51%", "elapsed_time": "4h 53m 53s", "remaining_time": "3h 37m 7s"}
148
- {"loss": 0.49373215, "token_acc": 0.84182651, "grad_norm": 0.11789739, "learning_rate": 4.69e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007588, "epoch": 0.57784912, "global_step/max_steps": "135/233", "percentage": "57.94%", "elapsed_time": "4h 56m 6s", "remaining_time": "3h 34m 57s"}
149
- {"loss": 0.50271904, "token_acc": 0.86526884, "grad_norm": 0.11597518, "learning_rate": 4.64e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007594, "epoch": 0.58212948, "global_step/max_steps": "136/233", "percentage": "58.37%", "elapsed_time": "4h 58m 1s", "remaining_time": "3h 32m 33s"}
150
- {"loss": 0.48411375, "token_acc": 0.82616099, "grad_norm": 0.14201498, "learning_rate": 4.59e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007623, "epoch": 0.58640984, "global_step/max_steps": "137/233", "percentage": "58.80%", "elapsed_time": "4h 59m 7s", "remaining_time": "3h 29m 36s"}
151
- {"loss": 0.47382951, "token_acc": 0.81373225, "grad_norm": 0.11298946, "learning_rate": 4.55e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007638, "epoch": 0.59069021, "global_step/max_steps": "138/233", "percentage": "59.23%", "elapsed_time": "5h 0m 41s", "remaining_time": "3h 26m 59s"}
152
- {"loss": 0.52312583, "token_acc": 0.85347195, "grad_norm": 0.10149517, "learning_rate": 4.5e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007625, "epoch": 0.59497057, "global_step/max_steps": "139/233", "percentage": "59.66%", "elapsed_time": "5h 3m 22s", "remaining_time": "3h 25m 9s"}
153
- {"loss": 0.49635562, "token_acc": 0.84268554, "grad_norm": 0.28712398, "learning_rate": 4.45e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007616, "epoch": 0.59925094, "global_step/max_steps": "140/233", "percentage": "60.09%", "elapsed_time": "5h 5m 57s", "remaining_time": "3h 23m 14s"}
154
- {"eval_loss": 0.4882482, "eval_token_acc": 0.83673689, "eval_runtime": 234.0956, "eval_samples_per_second": 1.974, "eval_steps_per_second": 0.248, "epoch": 0.59925094, "global_step/max_steps": "140/233", "percentage": "60.09%", "elapsed_time": "5h 9m 51s", "remaining_time": "3h 25m 49s"}
155
- {"loss": 0.47461641, "token_acc": 0.84012604, "grad_norm": 0.14535584, "learning_rate": 4.4e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007522, "epoch": 0.6035313, "global_step/max_steps": "141/233", "percentage": "60.52%", "elapsed_time": "5h 11m 57s", "remaining_time": "3h 23m 33s"}
156
- {"loss": 0.5000332, "token_acc": 0.83833177, "grad_norm": 0.11992717, "learning_rate": 4.35e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007548, "epoch": 0.60781166, "global_step/max_steps": "142/233", "percentage": "60.94%", "elapsed_time": "5h 13m 5s", "remaining_time": "3h 20m 38s"}
157
- {"loss": 0.48448735, "token_acc": 0.84470637, "grad_norm": 0.16265908, "learning_rate": 4.31e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007574, "epoch": 0.61209203, "global_step/max_steps": "143/233", "percentage": "61.37%", "elapsed_time": "5h 14m 14s", "remaining_time": "3h 17m 46s"}
158
- {"loss": 0.46533874, "token_acc": 0.85549313, "grad_norm": 0.12145889, "learning_rate": 4.26e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00755, "epoch": 0.61637239, "global_step/max_steps": "144/233", "percentage": "61.80%", "elapsed_time": "5h 17m 28s", "remaining_time": "3h 16m 12s"}
159
- {"loss": 0.44867301, "token_acc": 0.86317424, "grad_norm": 0.11760305, "learning_rate": 4.21e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007559, "epoch": 0.62065276, "global_step/max_steps": "145/233", "percentage": "62.23%", "elapsed_time": "5h 19m 16s", "remaining_time": "3h 13m 46s"}
160
- {"loss": 0.49094412, "token_acc": 0.83705991, "grad_norm": 0.10963392, "learning_rate": 4.16e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007559, "epoch": 0.62493312, "global_step/max_steps": "146/233", "percentage": "62.66%", "elapsed_time": "5h 21m 29s", "remaining_time": "3h 11m 34s"}
161
- {"loss": 0.48288625, "token_acc": 0.84562759, "grad_norm": 0.11926857, "learning_rate": 4.11e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.00757, "epoch": 0.62921348, "global_step/max_steps": "147/233", "percentage": "63.09%", "elapsed_time": "5h 23m 13s", "remaining_time": "3h 9m 6s"}
162
- {"loss": 0.46473897, "token_acc": 0.86574336, "grad_norm": 0.12197684, "learning_rate": 4.07e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007572, "epoch": 0.63349385, "global_step/max_steps": "148/233", "percentage": "63.52%", "elapsed_time": "5h 25m 20s", "remaining_time": "3h 6m 50s"}
163
- {"loss": 0.46687201, "token_acc": 0.86090455, "grad_norm": 0.12635399, "learning_rate": 4.02e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007595, "epoch": 0.63777421, "global_step/max_steps": "149/233", "percentage": "63.95%", "elapsed_time": "5h 26m 31s", "remaining_time": "3h 4m 5s"}
164
- {"loss": 0.50646043, "token_acc": 0.83715979, "grad_norm": 0.11814403, "learning_rate": 3.97e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007587, "epoch": 0.64205457, "global_step/max_steps": "150/233", "percentage": "64.38%", "elapsed_time": "5h 29m 5s", "remaining_time": "3h 2m 5s"}
165
- {"eval_loss": 0.4841184, "eval_token_acc": 0.83763193, "eval_runtime": 233.6185, "eval_samples_per_second": 1.978, "eval_steps_per_second": 0.248, "epoch": 0.64205457, "global_step/max_steps": "150/233", "percentage": "64.38%", "elapsed_time": "5h 32m 59s", "remaining_time": "3h 4m 15s"}
166
- {"loss": 0.48874947, "token_acc": 0.8424199, "grad_norm": 0.12018572, "learning_rate": 3.92e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007498, "epoch": 0.64633494, "global_step/max_steps": "151/233", "percentage": "64.81%", "elapsed_time": "5h 35m 12s", "remaining_time": "3h 2m 1s"}
167
- {"loss": 0.48023725, "token_acc": 0.84619697, "grad_norm": 0.1180267, "learning_rate": 3.88e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007511, "epoch": 0.6506153, "global_step/max_steps": "152/233", "percentage": "65.24%", "elapsed_time": "5h 36m 51s", "remaining_time": "2h 59m 30s"}
168
- {"loss": 0.48215163, "token_acc": 0.81318267, "grad_norm": 0.12059806, "learning_rate": 3.83e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007523, "epoch": 0.65489567, "global_step/max_steps": "153/233", "percentage": "65.67%", "elapsed_time": "5h 38m 32s", "remaining_time": "2h 57m 0s"}
169
- {"loss": 0.47923255, "token_acc": 0.83857755, "grad_norm": 0.12951089, "learning_rate": 3.78e-06, "memory(GiB)": 133.69, "train_speed(iter/s)": 0.007541, "epoch": 0.65917603, "global_step/max_steps": "154/233", "percentage": "66.09%", "elapsed_time": "5h 39m 56s", "remaining_time": "2h 54m 22s"}
 
1
+ {"loss": 0.58325195, "grad_norm": 6.15641366, "learning_rate": 3e-07, "memory(GiB)": 68.65, "train_speed(iter/s)": 0.012459, "rewards/chosen": -0.02832031, "rewards/rejected": -0.04541016, "rewards/accuracies": 1.0, "rewards/margins": 0.01708984, "logps/rejected": -0.45507812, "logps/chosen": -0.28320312, "logits/rejected": 0.05541992, "logits/chosen": 0.484375, "nll_loss": 0.28125, "log_odds_ratio": -0.4453125, "log_odds_chosen": 0.5703125, "epoch": 0.00303951, "global_step/max_steps": "1/329", "percentage": "0.30%", "elapsed_time": "1m 13s", "remaining_time": "6h 41m 36s"}
2
+ {"loss": 0.57861328, "grad_norm": 4.96429781, "learning_rate": 6.1e-07, "memory(GiB)": 91.09, "train_speed(iter/s)": 0.015066, "rewards/chosen": -0.0480957, "rewards/rejected": -0.04980469, "rewards/accuracies": 1.0, "rewards/margins": 0.00170898, "logps/rejected": -0.49804688, "logps/chosen": -0.48046875, "logits/rejected": 0.734375, "logits/chosen": 0.59765625, "nll_loss": 0.48242188, "log_odds_ratio": -0.67578125, "log_odds_chosen": 0.03710938, "epoch": 0.00607903, "global_step/max_steps": "2/329", "percentage": "0.61%", "elapsed_time": "2m 5s", "remaining_time": "5h 43m 12s"}
3
+ {"loss": 0.58496094, "grad_norm": 4.47705827, "learning_rate": 9.1e-07, "memory(GiB)": 91.09, "train_speed(iter/s)": 0.015521, "rewards/chosen": -0.0300293, "rewards/rejected": -0.02453613, "rewards/accuracies": 0.0, "rewards/margins": -0.00549316, "logps/rejected": -0.24511719, "logps/chosen": -0.30078125, "logits/rejected": 0.38867188, "logits/chosen": 0.26367188, "nll_loss": 0.29882812, "log_odds_ratio": -0.80859375, "log_odds_chosen": -0.21972656, "epoch": 0.00911854, "global_step/max_steps": "3/329", "percentage": "0.91%", "elapsed_time": "3m 6s", "remaining_time": "5h 37m 45s"}
4
+ {"loss": 0.55322266, "grad_norm": 4.56437749, "learning_rate": 1.21e-06, "memory(GiB)": 93.05, "train_speed(iter/s)": 0.017586, "rewards/chosen": -0.04589844, "rewards/rejected": -0.04174805, "rewards/accuracies": 0.0, "rewards/margins": -0.00415039, "logps/rejected": -0.41796875, "logps/chosen": -0.45898438, "logits/rejected": 0.3046875, "logits/chosen": 0.37695312, "nll_loss": 0.45898438, "log_odds_ratio": -0.75390625, "log_odds_chosen": -0.11914062, "epoch": 0.01215805, "global_step/max_steps": "4/329", "percentage": "1.22%", "elapsed_time": "3m 40s", "remaining_time": "4h 58m 48s"}
5
+ {"loss": 0.57531738, "grad_norm": 3.87794046, "learning_rate": 1.52e-06, "memory(GiB)": 100.25, "train_speed(iter/s)": 0.019266, "rewards/chosen": -0.0456543, "rewards/rejected": -0.046875, "rewards/accuracies": 1.0, "rewards/margins": 0.0012207, "logps/rejected": -0.46875, "logps/chosen": -0.45703125, "logits/rejected": 0.7578125, "logits/chosen": 0.67578125, "nll_loss": 0.45898438, "log_odds_ratio": -0.67578125, "log_odds_chosen": 0.03125, "epoch": 0.01519757, "global_step/max_steps": "5/329", "percentage": "1.52%", "elapsed_time": "4m 12s", "remaining_time": "4h 32m 56s"}
6
+ {"loss": 0.55859375, "grad_norm": 4.16021983, "learning_rate": 1.82e-06, "memory(GiB)": 100.25, "train_speed(iter/s)": 0.019891, "rewards/chosen": -0.04296875, "rewards/rejected": -0.0480957, "rewards/accuracies": 1.0, "rewards/margins": 0.00512695, "logps/rejected": -0.48046875, "logps/chosen": -0.4296875, "logits/rejected": 0.7734375, "logits/chosen": 0.77734375, "nll_loss": 0.4296875, "log_odds_ratio": -0.625, "log_odds_chosen": 0.14453125, "epoch": 0.01823708, "global_step/max_steps": "6/329", "percentage": "1.82%", "elapsed_time": "4m 54s", "remaining_time": "4h 24m 32s"}
7
+ {"loss": 0.64770508, "grad_norm": 5.19085531, "learning_rate": 2.12e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.019273, "rewards/chosen": -0.03979492, "rewards/rejected": -0.03662109, "rewards/accuracies": 0.0, "rewards/margins": -0.00317383, "logps/rejected": -0.36523438, "logps/chosen": -0.3984375, "logits/rejected": 0.54296875, "logits/chosen": 0.515625, "nll_loss": 0.3984375, "log_odds_ratio": -0.74609375, "log_odds_chosen": -0.10351562, "epoch": 0.0212766, "global_step/max_steps": "7/329", "percentage": "2.13%", "elapsed_time": "5m 56s", "remaining_time": "4h 33m 14s"}
8
+ {"loss": 0.48022461, "grad_norm": 8.73835517, "learning_rate": 2.42e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.018543, "rewards/chosen": -0.03088379, "rewards/rejected": -0.04248047, "rewards/accuracies": 1.0, "rewards/margins": 0.01159668, "logps/rejected": -0.42578125, "logps/chosen": -0.30859375, "logits/rejected": 0.78125, "logits/chosen": 0.55078125, "nll_loss": 0.30859375, "log_odds_ratio": -0.515625, "log_odds_chosen": 0.390625, "epoch": 0.02431611, "global_step/max_steps": "8/329", "percentage": "2.43%", "elapsed_time": "7m 4s", "remaining_time": "4h 43m 57s"}
9
+ {"loss": 0.54199219, "grad_norm": 3.21968845, "learning_rate": 2.73e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.018768, "rewards/chosen": -0.04907227, "rewards/rejected": -0.0480957, "rewards/accuracies": 0.0, "rewards/margins": -0.00097656, "logps/rejected": -0.48046875, "logps/chosen": -0.49023438, "logits/rejected": 0.86328125, "logits/chosen": 0.74609375, "nll_loss": 0.4921875, "log_odds_ratio": -0.703125, "log_odds_chosen": -0.02148438, "epoch": 0.02735562, "global_step/max_steps": "9/329", "percentage": "2.74%", "elapsed_time": "7m 52s", "remaining_time": "4h 40m 8s"}
10
+ {"loss": 0.5090332, "grad_norm": 4.62599348, "learning_rate": 3.03e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.018667, "rewards/chosen": -0.04589844, "rewards/rejected": -0.04492188, "rewards/accuracies": 0.0, "rewards/margins": -0.00097656, "logps/rejected": -0.44921875, "logps/chosen": -0.45898438, "logits/rejected": 0.94921875, "logits/chosen": 0.8828125, "nll_loss": 0.4609375, "log_odds_ratio": -0.703125, "log_odds_chosen": -0.02539062, "epoch": 0.03039514, "global_step/max_steps": "10/329", "percentage": "3.04%", "elapsed_time": "8m 48s", "remaining_time": "4h 41m 11s"}
11
+ {"eval_loss": 0.5304302, "eval_runtime": 141.9055, "eval_samples_per_second": 0.571, "eval_steps_per_second": 0.078, "eval_rewards/chosen": -0.05118075, "eval_rewards/rejected": -0.04210316, "eval_rewards/accuracies": 0.09090909, "eval_rewards/margins": -0.00907759, "eval_logps/rejected": -0.42116478, "eval_logps/chosen": -0.51136363, "eval_logits/rejected": 0.64182353, "eval_logits/chosen": 0.47944781, "eval_nll_loss": 0.51136363, "eval_log_odds_ratio": -0.80397725, "eval_log_odds_chosen": -0.19655539, "epoch": 0.03039514, "global_step/max_steps": "10/329", "percentage": "3.04%", "elapsed_time": "11m 10s", "remaining_time": "5h 56m 38s"}
12
+ {"loss": 0.57714844, "grad_norm": 3.36059428, "learning_rate": 3.33e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.014731, "rewards/chosen": -0.05029297, "rewards/rejected": -0.04345703, "rewards/accuracies": 0.0, "rewards/margins": -0.00683594, "logps/rejected": -0.43359375, "logps/chosen": -0.50390625, "logits/rejected": 0.62890625, "logits/chosen": 0.66796875, "nll_loss": 0.50390625, "log_odds_ratio": -0.79296875, "log_odds_chosen": -0.1875, "epoch": 0.03343465, "global_step/max_steps": "11/329", "percentage": "3.34%", "elapsed_time": "12m 19s", "remaining_time": "5h 56m 30s"}
13
+ {"loss": 0.60522461, "grad_norm": 3.99234345, "learning_rate": 3.64e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.014797, "rewards/chosen": -0.1015625, "rewards/rejected": -0.15039062, "rewards/accuracies": 1.0, "rewards/margins": 0.04882812, "logps/rejected": -1.5078125, "logps/chosen": -1.015625, "logits/rejected": 0.53125, "logits/chosen": 0.5234375, "nll_loss": 1.015625, "log_odds_ratio": -0.40429688, "log_odds_chosen": 0.69140625, "epoch": 0.03647416, "global_step/max_steps": "12/329", "percentage": "3.65%", "elapsed_time": "13m 24s", "remaining_time": "5h 54m 3s"}
14
+ {"loss": 0.46606445, "grad_norm": 3.84386517, "learning_rate": 3.94e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.014892, "rewards/chosen": -0.0546875, "rewards/rejected": -0.05029297, "rewards/accuracies": 0.0, "rewards/margins": -0.00439453, "logps/rejected": -0.50390625, "logps/chosen": -0.546875, "logits/rejected": 0.49609375, "logits/chosen": 0.49414062, "nll_loss": 0.55078125, "log_odds_ratio": -0.75, "log_odds_chosen": -0.109375, "epoch": 0.03951368, "global_step/max_steps": "13/329", "percentage": "3.95%", "elapsed_time": "14m 26s", "remaining_time": "5h 50m 54s"}
15
+ {"loss": 0.47485352, "grad_norm": 3.8112063, "learning_rate": 4.24e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.015552, "rewards/chosen": -0.04541016, "rewards/rejected": -0.04736328, "rewards/accuracies": 1.0, "rewards/margins": 0.00195312, "logps/rejected": -0.47460938, "logps/chosen": -0.45507812, "logits/rejected": 0.87890625, "logits/chosen": 0.625, "nll_loss": 0.45507812, "log_odds_ratio": -0.671875, "log_odds_chosen": 0.05078125, "epoch": 0.04255319, "global_step/max_steps": "14/329", "percentage": "4.26%", "elapsed_time": "14m 53s", "remaining_time": "5h 35m 1s"}
16
+ {"loss": 0.41088867, "grad_norm": 2.89629065, "learning_rate": 4.55e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.016012, "rewards/chosen": -0.05981445, "rewards/rejected": -0.06054688, "rewards/accuracies": 1.0, "rewards/margins": 0.00073242, "logps/rejected": -0.60546875, "logps/chosen": -0.59765625, "logits/rejected": 0.546875, "logits/chosen": 0.4609375, "nll_loss": 0.59765625, "log_odds_ratio": -0.68359375, "log_odds_chosen": 0.015625, "epoch": 0.04559271, "global_step/max_steps": "15/329", "percentage": "4.56%", "elapsed_time": "15m 29s", "remaining_time": "5h 24m 27s"}
17
+ {"loss": 0.54125977, "grad_norm": 6.06482857, "learning_rate": 4.85e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.016154, "rewards/chosen": -0.06640625, "rewards/rejected": -0.06835938, "rewards/accuracies": 1.0, "rewards/margins": 0.00195312, "logps/rejected": -0.68359375, "logps/chosen": -0.6640625, "logits/rejected": 0.82421875, "logits/chosen": 0.7109375, "nll_loss": 0.6640625, "log_odds_ratio": -0.671875, "log_odds_chosen": 0.046875, "epoch": 0.04863222, "global_step/max_steps": "16/329", "percentage": "4.86%", "elapsed_time": "16m 23s", "remaining_time": "5h 20m 43s"}
18
+ {"loss": 0.5378418, "grad_norm": 4.0430688, "learning_rate": 5.15e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.016479, "rewards/chosen": -0.03979492, "rewards/rejected": -0.03686523, "rewards/accuracies": 0.0, "rewards/margins": -0.00292969, "logps/rejected": -0.36914062, "logps/chosen": -0.3984375, "logits/rejected": 0.56640625, "logits/chosen": 0.6171875, "nll_loss": 0.3984375, "log_odds_ratio": -0.73828125, "log_odds_chosen": -0.08398438, "epoch": 0.05167173, "global_step/max_steps": "17/329", "percentage": "5.17%", "elapsed_time": "17m 4s", "remaining_time": "5h 13m 28s"}
19
+ {"loss": 0.61254883, "grad_norm": 4.21434647, "learning_rate": 5.45e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.01684, "rewards/chosen": -0.03637695, "rewards/rejected": -0.03564453, "rewards/accuracies": 0.0, "rewards/margins": -0.00073242, "logps/rejected": -0.35546875, "logps/chosen": -0.36328125, "logits/rejected": 0.66796875, "logits/chosen": 0.6796875, "nll_loss": 0.36523438, "log_odds_ratio": -0.703125, "log_odds_chosen": -0.0234375, "epoch": 0.05471125, "global_step/max_steps": "18/329", "percentage": "5.47%", "elapsed_time": "17m 42s", "remaining_time": "5h 5m 50s"}
20
+ {"loss": 0.53515625, "grad_norm": 4.19814501, "learning_rate": 5.76e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.017159, "rewards/chosen": -0.03955078, "rewards/rejected": -0.046875, "rewards/accuracies": 1.0, "rewards/margins": 0.00732422, "logps/rejected": -0.46875, "logps/chosen": -0.39453125, "logits/rejected": 0.9453125, "logits/chosen": 0.8515625, "nll_loss": 0.39453125, "log_odds_ratio": -0.58984375, "log_odds_chosen": 0.21875, "epoch": 0.05775076, "global_step/max_steps": "19/329", "percentage": "5.78%", "elapsed_time": "18m 20s", "remaining_time": "4h 59m 15s"}
21
+ {"loss": 0.54125977, "grad_norm": 3.79412169, "learning_rate": 6.06e-06, "memory(GiB)": 133.18, "train_speed(iter/s)": 0.017282, "rewards/chosen": -0.04223633, "rewards/rejected": -0.04785156, "rewards/accuracies": 1.0, "rewards/margins": 0.00561523, "logps/rejected": -0.47851562, "logps/chosen": -0.421875, "logits/rejected": 0.92578125, "logits/chosen": 0.83203125, "nll_loss": 0.41992188, "log_odds_ratio": -0.6171875, "log_odds_chosen": 0.15820312, "epoch": 0.06079027, "global_step/max_steps": "20/329", "percentage": "6.08%", "elapsed_time": "19m 10s", "remaining_time": "4h 56m 14s"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
special_tokens_map.json CHANGED
@@ -13,5 +13,11 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<|eot_id|>"
 
 
 
 
 
 
17
  }
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": {
17
+ "content": "<|eot_id|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6dc6ee30eaa6cb53723acd63a68aa205b65195097bb4895e84b722e3c5128d0b
3
- size 8184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb304458da934c460167c713e0581151a0ab2d5cb43649730f985f7d4a9ef096
3
+ size 8248