Training in progress, step 30
Browse files- adapter_config.json +6 -6
- adapter_model.safetensors +1 -1
- args.json +9 -9
- logging.jsonl +37 -256
- special_tokens_map.json +3 -9
- tokenizer.json +2 -2
- tokenizer_config.json +22 -25
- training_args.bin +1 -1
adapter_config.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"alpha_pattern": {},
|
| 3 |
"auto_mapping": null,
|
| 4 |
-
"base_model_name_or_path": "/root/highspeedstorage/ft-volume/
|
| 5 |
"bias": "none",
|
| 6 |
"eva_config": null,
|
| 7 |
"exclude_modules": null,
|
|
@@ -23,13 +23,13 @@
|
|
| 23 |
"rank_pattern": {},
|
| 24 |
"revision": null,
|
| 25 |
"target_modules": [
|
| 26 |
-
"
|
| 27 |
"v_proj",
|
| 28 |
-
"up_proj",
|
| 29 |
"k_proj",
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"gate_proj"
|
|
|
|
| 33 |
],
|
| 34 |
"task_type": "CAUSAL_LM",
|
| 35 |
"use_dora": false,
|
|
|
|
| 1 |
{
|
| 2 |
"alpha_pattern": {},
|
| 3 |
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/root/highspeedstorage/ft-volume/nemotron",
|
| 5 |
"bias": "none",
|
| 6 |
"eva_config": null,
|
| 7 |
"exclude_modules": null,
|
|
|
|
| 23 |
"rank_pattern": {},
|
| 24 |
"revision": null,
|
| 25 |
"target_modules": [
|
| 26 |
+
"o_proj",
|
| 27 |
"v_proj",
|
|
|
|
| 28 |
"k_proj",
|
| 29 |
+
"q_proj",
|
| 30 |
+
"up_proj",
|
| 31 |
+
"gate_proj",
|
| 32 |
+
"down_proj"
|
| 33 |
],
|
| 34 |
"task_type": "CAUSAL_LM",
|
| 35 |
"use_dora": false,
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 13254157312
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:900be958120edccf3beb7aea72d2454f2acf94387b3f2d702e5a35f374d59cea
|
| 3 |
size 13254157312
|
args.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"model": "/root/highspeedstorage/ft-volume/
|
| 3 |
"model_type": "llama3_2",
|
| 4 |
"model_revision": null,
|
| 5 |
"task_type": "causal_lm",
|
|
@@ -9,7 +9,7 @@
|
|
| 9 |
"rope_scaling": null,
|
| 10 |
"device_map": null,
|
| 11 |
"local_repo_path": null,
|
| 12 |
-
"template": "
|
| 13 |
"system": "",
|
| 14 |
"max_length": 32000,
|
| 15 |
"truncation_strategy": "left",
|
|
@@ -75,7 +75,7 @@
|
|
| 75 |
"custom_register_path": [],
|
| 76 |
"ignore_args_error": false,
|
| 77 |
"use_swift_lora": false,
|
| 78 |
-
"output_dir": "/root/dataDisk/output/
|
| 79 |
"overwrite_output_dir": false,
|
| 80 |
"do_train": false,
|
| 81 |
"do_eval": false,
|
|
@@ -105,7 +105,7 @@
|
|
| 105 |
"log_level": "passive",
|
| 106 |
"log_level_replica": "warning",
|
| 107 |
"log_on_each_node": true,
|
| 108 |
-
"logging_dir": "/root/dataDisk/output/
|
| 109 |
"logging_strategy": "steps",
|
| 110 |
"logging_first_step": true,
|
| 111 |
"logging_steps": 1,
|
|
@@ -345,10 +345,10 @@
|
|
| 345 |
"rank": 0,
|
| 346 |
"global_world_size": 8,
|
| 347 |
"local_world_size": 8,
|
| 348 |
-
"model_suffix": "
|
| 349 |
-
"model_info": "ModelInfo(model_type='llama3_2', model_dir='/root/highspeedstorage/ft-volume/
|
| 350 |
-
"model_meta": "ModelMeta(model_type='
|
| 351 |
-
"model_dir": "/root/highspeedstorage/ft-volume/
|
| 352 |
"hub": "<class 'swift.hub.hub.HFHub'>",
|
| 353 |
-
"training_args": "Seq2SeqTrainingArguments(output_dir='/root/dataDisk/output/
|
| 354 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"model": "/root/highspeedstorage/ft-volume/nemotron",
|
| 3 |
"model_type": "llama3_2",
|
| 4 |
"model_revision": null,
|
| 5 |
"task_type": "causal_lm",
|
|
|
|
| 9 |
"rope_scaling": null,
|
| 10 |
"device_map": null,
|
| 11 |
"local_repo_path": null,
|
| 12 |
+
"template": "llama3_2",
|
| 13 |
"system": "",
|
| 14 |
"max_length": 32000,
|
| 15 |
"truncation_strategy": "left",
|
|
|
|
| 75 |
"custom_register_path": [],
|
| 76 |
"ignore_args_error": false,
|
| 77 |
"use_swift_lora": false,
|
| 78 |
+
"output_dir": "/root/dataDisk/output/v62-20250312-085633",
|
| 79 |
"overwrite_output_dir": false,
|
| 80 |
"do_train": false,
|
| 81 |
"do_eval": false,
|
|
|
|
| 105 |
"log_level": "passive",
|
| 106 |
"log_level_replica": "warning",
|
| 107 |
"log_on_each_node": true,
|
| 108 |
+
"logging_dir": "/root/dataDisk/output/v62-20250312-085633/runs",
|
| 109 |
"logging_strategy": "steps",
|
| 110 |
"logging_first_step": true,
|
| 111 |
"logging_steps": 1,
|
|
|
|
| 345 |
"rank": 0,
|
| 346 |
"global_world_size": 8,
|
| 347 |
"local_world_size": 8,
|
| 348 |
+
"model_suffix": "nemotron",
|
| 349 |
+
"model_info": "ModelInfo(model_type='llama3_2', model_dir='/root/highspeedstorage/ft-volume/nemotron', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, config=None, task_type='causal_lm', num_labels=None)",
|
| 350 |
+
"model_meta": "ModelMeta(model_type='llama3_2', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.2-1B', hf_model_id='meta-llama/Llama-3.2-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B', hf_model_id='meta-llama/Llama-3.2-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-1B-Instruct', hf_model_id='meta-llama/Llama-3.2-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/Llama-3.2-3B-Instruct', hf_model_id='meta-llama/Llama-3.2-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='LLM-Research/Llama-3.3-70B-Instruct', hf_model_id='meta-llama/Llama-3.3-70B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', hf_model_id='unsloth/Llama-3.3-70B-Instruct-bnb-4bit', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='llama3_2', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f482567b760>, model_arch='llama', architectures=['LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=['transformers>=4.45'], tags=[])",
|
| 351 |
+
"model_dir": "/root/highspeedstorage/ft-volume/nemotron",
|
| 352 |
"hub": "<class 'swift.hub.hub.HFHub'>",
|
| 353 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/root/dataDisk/output/v62-20250312-085633', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/root/dataDisk/output/v62-20250312-085633/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=10, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=10, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/root/dataDisk/output/v62-20250312-085633', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id='TheAgenticAI/LLAMA-3.3-70B-Reasoning', hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=True, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': True}, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer='galore', galore_config=None)"
|
| 354 |
}
|
logging.jsonl
CHANGED
|
@@ -1,256 +1,37 @@
|
|
| 1 |
-
{"loss":
|
| 2 |
-
{"loss": 0.
|
| 3 |
-
{"loss": 0.
|
| 4 |
-
{"loss": 0.
|
| 5 |
-
{"loss": 0.
|
| 6 |
-
{"loss": 0.
|
| 7 |
-
{"loss": 0.
|
| 8 |
-
{"loss": 0.
|
| 9 |
-
{"loss": 0.
|
| 10 |
-
{"loss": 0.
|
| 11 |
-
{"eval_loss": 0.
|
| 12 |
-
{"loss": 0.
|
| 13 |
-
{"loss": 0.
|
| 14 |
-
{"loss": 0.
|
| 15 |
-
{"loss": 0.
|
| 16 |
-
{"loss": 0.
|
| 17 |
-
{"loss": 0.
|
| 18 |
-
{"loss": 0.
|
| 19 |
-
{"loss": 0.
|
| 20 |
-
{"loss": 0.
|
| 21 |
-
{"loss": 0.
|
| 22 |
-
{"eval_loss": 0.
|
| 23 |
-
{"loss": 0.
|
| 24 |
-
{"loss": 0.
|
| 25 |
-
{"loss": 0.
|
| 26 |
-
{"loss": 0.
|
| 27 |
-
{"loss": 0.
|
| 28 |
-
{"loss": 0.
|
| 29 |
-
{"loss": 0.
|
| 30 |
-
{"loss": 0.
|
| 31 |
-
{"loss": 0.
|
| 32 |
-
{"loss": 0.
|
| 33 |
-
{"eval_loss": 0.
|
| 34 |
-
{"loss": 0.
|
| 35 |
-
{"loss": 0.
|
| 36 |
-
{"loss": 0.
|
| 37 |
-
{"loss": 0.
|
| 38 |
-
{"loss": 0.56055653, "token_acc": 0.84721173, "grad_norm": 0.14894214, "learning_rate": 8.5e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.0077, "epoch": 0.14981273, "global_step/max_steps": "35/233", "percentage": "15.02%", "elapsed_time": "1h 15m 25s", "remaining_time": "7h 6m 38s"}
|
| 39 |
-
{"loss": 0.53099561, "token_acc": 0.82040125, "grad_norm": 0.11507381, "learning_rate": 8.45e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007602, "epoch": 0.1540931, "global_step/max_steps": "36/233", "percentage": "15.45%", "elapsed_time": "1h 18m 34s", "remaining_time": "7h 10m 1s"}
|
| 40 |
-
{"loss": 0.49963912, "token_acc": 0.83337873, "grad_norm": 0.14346635, "learning_rate": 8.41e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007643, "epoch": 0.15837346, "global_step/max_steps": "37/233", "percentage": "15.88%", "elapsed_time": "1h 20m 20s", "remaining_time": "7h 5m 35s"}
|
| 41 |
-
{"loss": 0.50402641, "token_acc": 0.84251997, "grad_norm": 0.14424579, "learning_rate": 8.37e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.00772, "epoch": 0.16265383, "global_step/max_steps": "38/233", "percentage": "16.31%", "elapsed_time": "1h 21m 41s", "remaining_time": "6h 59m 14s"}
|
| 42 |
-
{"loss": 0.46469942, "token_acc": 0.85498028, "grad_norm": 0.12475314, "learning_rate": 8.33e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007803, "epoch": 0.16693419, "global_step/max_steps": "39/233", "percentage": "16.74%", "elapsed_time": "1h 22m 57s", "remaining_time": "6h 52m 40s"}
|
| 43 |
-
{"loss": 0.52988553, "token_acc": 0.82646555, "grad_norm": 0.15125506, "learning_rate": 8.28e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.00784, "epoch": 0.17121455, "global_step/max_steps": "40/233", "percentage": "17.17%", "elapsed_time": "1h 24m 41s", "remaining_time": "6h 48m 36s"}
|
| 44 |
-
{"eval_loss": 0.52028233, "eval_token_acc": 0.83092729, "eval_runtime": 235.7627, "eval_samples_per_second": 1.96, "eval_steps_per_second": 0.246, "epoch": 0.17121455, "global_step/max_steps": "40/233", "percentage": "17.17%", "elapsed_time": "1h 28m 36s", "remaining_time": "7h 7m 34s"}
|
| 45 |
-
{"loss": 0.51787907, "token_acc": 0.83202028, "grad_norm": 0.1290229, "learning_rate": 8.24e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007414, "epoch": 0.17549492, "global_step/max_steps": "41/233", "percentage": "17.60%", "elapsed_time": "1h 31m 49s", "remaining_time": "7h 9m 58s"}
|
| 46 |
-
{"loss": 0.51522481, "token_acc": 0.82788479, "grad_norm": 0.12198465, "learning_rate": 8.2e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007448, "epoch": 0.17977528, "global_step/max_steps": "42/233", "percentage": "18.03%", "elapsed_time": "1h 33m 38s", "remaining_time": "7h 5m 51s"}
|
| 47 |
-
{"loss": 0.50701392, "token_acc": 0.84508624, "grad_norm": 0.14420471, "learning_rate": 8.15e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007484, "epoch": 0.18405564, "global_step/max_steps": "43/233", "percentage": "18.45%", "elapsed_time": "1h 35m 25s", "remaining_time": "7h 1m 38s"}
|
| 48 |
-
{"loss": 0.50702739, "token_acc": 0.84217746, "grad_norm": 0.13070235, "learning_rate": 8.11e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007474, "epoch": 0.18833601, "global_step/max_steps": "44/233", "percentage": "18.88%", "elapsed_time": "1h 37m 46s", "remaining_time": "7h 0m 0s"}
|
| 49 |
-
{"loss": 0.52828574, "token_acc": 0.84879032, "grad_norm": 0.13208568, "learning_rate": 8.07e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007511, "epoch": 0.19261637, "global_step/max_steps": "45/233", "percentage": "19.31%", "elapsed_time": "1h 39m 30s", "remaining_time": "6h 55m 44s"}
|
| 50 |
-
{"loss": 0.50708497, "token_acc": 0.84317691, "grad_norm": 0.12373436, "learning_rate": 8.03e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007563, "epoch": 0.19689674, "global_step/max_steps": "46/233", "percentage": "19.74%", "elapsed_time": "1h 41m 2s", "remaining_time": "6h 50m 43s"}
|
| 51 |
-
{"loss": 0.49027938, "token_acc": 0.83851458, "grad_norm": 0.12841317, "learning_rate": 7.98e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007614, "epoch": 0.2011771, "global_step/max_steps": "47/233", "percentage": "20.17%", "elapsed_time": "1h 42m 32s", "remaining_time": "6h 45m 47s"}
|
| 52 |
-
{"loss": 0.51143616, "token_acc": 0.85708269, "grad_norm": 0.22385377, "learning_rate": 7.94e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007602, "epoch": 0.20545746, "global_step/max_steps": "48/233", "percentage": "20.60%", "elapsed_time": "1h 44m 53s", "remaining_time": "6h 44m 16s"}
|
| 53 |
-
{"loss": 0.50135702, "token_acc": 0.83626581, "grad_norm": 0.15403379, "learning_rate": 7.9e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.00772, "epoch": 0.20973783, "global_step/max_steps": "49/233", "percentage": "21.03%", "elapsed_time": "1h 45m 26s", "remaining_time": "6h 35m 55s"}
|
| 54 |
-
{"loss": 0.47195634, "token_acc": 0.85469533, "grad_norm": 0.19196728, "learning_rate": 7.85e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007749, "epoch": 0.21401819, "global_step/max_steps": "50/233", "percentage": "21.46%", "elapsed_time": "1h 47m 11s", "remaining_time": "6h 32m 20s"}
|
| 55 |
-
{"eval_loss": 0.51820177, "eval_token_acc": 0.83162646, "eval_runtime": 236.2231, "eval_samples_per_second": 1.956, "eval_steps_per_second": 0.246, "epoch": 0.21401819, "global_step/max_steps": "50/233", "percentage": "21.46%", "elapsed_time": "1h 51m 8s", "remaining_time": "6h 46m 44s"}
|
| 56 |
-
{"loss": 0.51357847, "token_acc": 0.83365776, "grad_norm": 0.16629253, "learning_rate": 7.81e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007476, "epoch": 0.21829856, "global_step/max_steps": "51/233", "percentage": "21.89%", "elapsed_time": "1h 53m 21s", "remaining_time": "6h 44m 32s"}
|
| 57 |
-
{"loss": 0.51775813, "token_acc": 0.84551573, "grad_norm": 0.15469484, "learning_rate": 7.77e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007544, "epoch": 0.22257892, "global_step/max_steps": "52/233", "percentage": "22.32%", "elapsed_time": "1h 54m 32s", "remaining_time": "6h 38m 41s"}
|
| 58 |
-
{"loss": 0.55643523, "token_acc": 0.85632084, "grad_norm": 0.13863128, "learning_rate": 7.73e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007551, "epoch": 0.22685928, "global_step/max_steps": "53/233", "percentage": "22.75%", "elapsed_time": "1h 56m 38s", "remaining_time": "6h 36m 7s"}
|
| 59 |
-
{"loss": 0.50702429, "token_acc": 0.82881185, "grad_norm": 0.15469763, "learning_rate": 7.68e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007571, "epoch": 0.23113965, "global_step/max_steps": "54/233", "percentage": "23.18%", "elapsed_time": "1h 58m 31s", "remaining_time": "6h 32m 54s"}
|
| 60 |
-
{"loss": 0.54310501, "token_acc": 0.83081361, "grad_norm": 0.16917156, "learning_rate": 7.64e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007584, "epoch": 0.23542001, "global_step/max_steps": "55/233", "percentage": "23.61%", "elapsed_time": "2h 0m 31s", "remaining_time": "6h 30m 3s"}
|
| 61 |
-
{"loss": 0.49319041, "token_acc": 0.83154994, "grad_norm": 0.12704188, "learning_rate": 7.6e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007622, "epoch": 0.23970037, "global_step/max_steps": "56/233", "percentage": "24.03%", "elapsed_time": "2h 2m 6s", "remaining_time": "6h 25m 56s"}
|
| 62 |
-
{"loss": 0.50535458, "token_acc": 0.86976872, "grad_norm": 0.14562564, "learning_rate": 7.55e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007666, "epoch": 0.24398074, "global_step/max_steps": "57/233", "percentage": "24.46%", "elapsed_time": "2h 3m 34s", "remaining_time": "6h 21m 34s"}
|
| 63 |
-
{"loss": 0.49788111, "token_acc": 0.8297607, "grad_norm": 0.24545969, "learning_rate": 7.51e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007694, "epoch": 0.2482611, "global_step/max_steps": "58/233", "percentage": "24.89%", "elapsed_time": "2h 5m 17s", "remaining_time": "6h 18m 2s"}
|
| 64 |
-
{"loss": 0.53209269, "token_acc": 0.83677349, "grad_norm": 0.17009053, "learning_rate": 7.47e-06, "memory(GiB)": 126.5, "train_speed(iter/s)": 0.007736, "epoch": 0.25254147, "global_step/max_steps": "59/233", "percentage": "25.32%", "elapsed_time": "2h 6m 45s", "remaining_time": "6h 13m 50s"}
|
| 65 |
-
{"loss": 0.4868502, "token_acc": 0.84821353, "grad_norm": 0.41996536, "learning_rate": 7.42e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007699, "epoch": 0.25682183, "global_step/max_steps": "60/233", "percentage": "25.75%", "elapsed_time": "2h 9m 33s", "remaining_time": "6h 13m 32s"}
|
| 66 |
-
{"eval_loss": 0.51735187, "eval_token_acc": 0.83192979, "eval_runtime": 235.8271, "eval_samples_per_second": 1.959, "eval_steps_per_second": 0.246, "epoch": 0.25682183, "global_step/max_steps": "60/233", "percentage": "25.75%", "elapsed_time": "2h 13m 28s", "remaining_time": "6h 24m 52s"}
|
| 67 |
-
{"loss": 0.52326024, "token_acc": 0.83251999, "grad_norm": 0.138914, "learning_rate": 7.38e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.00745, "epoch": 0.26110219, "global_step/max_steps": "61/233", "percentage": "26.18%", "elapsed_time": "2h 16m 7s", "remaining_time": "6h 23m 48s"}
|
| 68 |
-
{"loss": 0.49799752, "token_acc": 0.83543709, "grad_norm": 0.12604268, "learning_rate": 7.34e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007493, "epoch": 0.26538256, "global_step/max_steps": "62/233", "percentage": "26.61%", "elapsed_time": "2h 17m 33s", "remaining_time": "6h 19m 23s"}
|
| 69 |
-
{"loss": 0.54000628, "token_acc": 0.86219915, "grad_norm": 0.1309191, "learning_rate": 7.3e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007486, "epoch": 0.26966292, "global_step/max_steps": "63/233", "percentage": "27.04%", "elapsed_time": "2h 19m 55s", "remaining_time": "6h 17m 34s"}
|
| 70 |
-
{"loss": 0.4935776, "token_acc": 0.87444994, "grad_norm": 0.12530285, "learning_rate": 7.25e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007509, "epoch": 0.27394329, "global_step/max_steps": "64/233", "percentage": "27.47%", "elapsed_time": "2h 21m 42s", "remaining_time": "6h 14m 11s"}
|
| 71 |
-
{"loss": 0.48380467, "token_acc": 0.8545328, "grad_norm": 0.13684304, "learning_rate": 7.21e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007594, "epoch": 0.27822365, "global_step/max_steps": "65/233", "percentage": "27.90%", "elapsed_time": "2h 22m 19s", "remaining_time": "6h 7m 50s"}
|
| 72 |
-
{"loss": 0.53414261, "token_acc": 0.8233512, "grad_norm": 0.12200661, "learning_rate": 7.17e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007622, "epoch": 0.28250401, "global_step/max_steps": "66/233", "percentage": "28.33%", "elapsed_time": "2h 23m 58s", "remaining_time": "6h 4m 18s"}
|
| 73 |
-
{"loss": 0.54979384, "token_acc": 0.8006445, "grad_norm": 0.14477921, "learning_rate": 7.12e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007668, "epoch": 0.28678438, "global_step/max_steps": "67/233", "percentage": "28.76%", "elapsed_time": "2h 25m 17s", "remaining_time": "5h 59m 58s"}
|
| 74 |
-
{"loss": 0.50618756, "token_acc": 0.8421706, "grad_norm": 0.12256683, "learning_rate": 7.08e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007672, "epoch": 0.29106474, "global_step/max_steps": "68/233", "percentage": "29.18%", "elapsed_time": "2h 27m 22s", "remaining_time": "5h 57m 35s"}
|
| 75 |
-
{"loss": 0.51525098, "token_acc": 0.8395739, "grad_norm": 0.33895594, "learning_rate": 7.04e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007692, "epoch": 0.2953451, "global_step/max_steps": "69/233", "percentage": "29.61%", "elapsed_time": "2h 29m 9s", "remaining_time": "5h 54m 32s"}
|
| 76 |
-
{"loss": 0.49514559, "token_acc": 0.83755702, "grad_norm": 0.13254978, "learning_rate": 7e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007709, "epoch": 0.29962547, "global_step/max_steps": "70/233", "percentage": "30.04%", "elapsed_time": "2h 30m 59s", "remaining_time": "5h 51m 35s"}
|
| 77 |
-
{"eval_loss": 0.51468235, "eval_token_acc": 0.83213111, "eval_runtime": 235.4854, "eval_samples_per_second": 1.962, "eval_steps_per_second": 0.246, "epoch": 0.29962547, "global_step/max_steps": "70/233", "percentage": "30.04%", "elapsed_time": "2h 34m 55s", "remaining_time": "6h 0m 44s"}
|
| 78 |
-
{"loss": 0.47727573, "token_acc": 0.83547054, "grad_norm": 0.14191172, "learning_rate": 6.95e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007496, "epoch": 0.30390583, "global_step/max_steps": "71/233", "percentage": "30.47%", "elapsed_time": "2h 37m 31s", "remaining_time": "5h 59m 25s"}
|
| 79 |
-
{"loss": 0.51388264, "token_acc": 0.82497027, "grad_norm": 0.12943304, "learning_rate": 6.91e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007516, "epoch": 0.3081862, "global_step/max_steps": "72/233", "percentage": "30.90%", "elapsed_time": "2h 39m 19s", "remaining_time": "5h 56m 15s"}
|
| 80 |
-
{"loss": 0.48675218, "token_acc": 0.83789222, "grad_norm": 0.12511204, "learning_rate": 6.87e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007567, "epoch": 0.31246656, "global_step/max_steps": "73/233", "percentage": "31.33%", "elapsed_time": "2h 40m 26s", "remaining_time": "5h 51m 38s"}
|
| 81 |
-
{"loss": 0.51524091, "token_acc": 0.82786017, "grad_norm": 0.11458036, "learning_rate": 6.82e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007574, "epoch": 0.31674692, "global_step/max_steps": "74/233", "percentage": "31.76%", "elapsed_time": "2h 42m 29s", "remaining_time": "5h 49m 8s"}
|
| 82 |
-
{"loss": 0.50742763, "token_acc": 0.8585693, "grad_norm": 0.13843079, "learning_rate": 6.78e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.007616, "epoch": 0.32102729, "global_step/max_steps": "75/233", "percentage": "32.19%", "elapsed_time": "2h 43m 47s", "remaining_time": "5h 45m 3s"}
|
| 83 |
-
{"loss": 0.52581084, "token_acc": 0.83562382, "grad_norm": 0.11976344, "learning_rate": 6.74e-06, "memory(GiB)": 126.78, "train_speed(iter/s)": 0.00762, "epoch": 0.32530765, "global_step/max_steps": "76/233", "percentage": "32.62%", "elapsed_time": "2h 45m 52s", "remaining_time": "5h 42m 39s"}
|
| 84 |
-
{"loss": 0.53450525, "token_acc": 0.82231925, "grad_norm": 0.15798301, "learning_rate": 6.7e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.00761, "epoch": 0.32958801, "global_step/max_steps": "77/233", "percentage": "33.05%", "elapsed_time": "2h 48m 18s", "remaining_time": "5h 40m 58s"}
|
| 85 |
-
{"loss": 0.52083588, "token_acc": 0.81583811, "grad_norm": 0.11763263, "learning_rate": 6.65e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007597, "epoch": 0.33386838, "global_step/max_steps": "78/233", "percentage": "33.48%", "elapsed_time": "2h 50m 46s", "remaining_time": "5h 39m 22s"}
|
| 86 |
-
{"loss": 0.53338146, "token_acc": 0.8253553, "grad_norm": 0.12795284, "learning_rate": 6.61e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007607, "epoch": 0.33814874, "global_step/max_steps": "79/233", "percentage": "33.91%", "elapsed_time": "2h 52m 44s", "remaining_time": "5h 36m 43s"}
|
| 87 |
-
{"loss": 0.50855756, "token_acc": 0.84615385, "grad_norm": 0.14013508, "learning_rate": 6.57e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007654, "epoch": 0.34242911, "global_step/max_steps": "80/233", "percentage": "34.33%", "elapsed_time": "2h 53m 50s", "remaining_time": "5h 32m 29s"}
|
| 88 |
-
{"eval_loss": 0.51241541, "eval_token_acc": 0.83242084, "eval_runtime": 235.6235, "eval_samples_per_second": 1.961, "eval_steps_per_second": 0.246, "epoch": 0.34242911, "global_step/max_steps": "80/233", "percentage": "34.33%", "elapsed_time": "2h 57m 46s", "remaining_time": "5h 39m 59s"}
|
| 89 |
-
{"loss": 0.5318374, "token_acc": 0.83843328, "grad_norm": 0.10666233, "learning_rate": 6.52e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007436, "epoch": 0.34670947, "global_step/max_steps": "81/233", "percentage": "34.76%", "elapsed_time": "3h 1m 12s", "remaining_time": "5h 40m 3s"}
|
| 90 |
-
{"loss": 0.49064749, "token_acc": 0.86607403, "grad_norm": 0.34532365, "learning_rate": 6.48e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007456, "epoch": 0.35098983, "global_step/max_steps": "82/233", "percentage": "35.19%", "elapsed_time": "3h 2m 57s", "remaining_time": "5h 36m 55s"}
|
| 91 |
-
{"loss": 0.47813955, "token_acc": 0.83404155, "grad_norm": 0.20377178, "learning_rate": 6.44e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007509, "epoch": 0.3552702, "global_step/max_steps": "83/233", "percentage": "35.62%", "elapsed_time": "3h 3m 53s", "remaining_time": "5h 32m 19s"}
|
| 92 |
-
{"loss": 0.51242083, "token_acc": 0.84345211, "grad_norm": 0.12413169, "learning_rate": 6.39e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007526, "epoch": 0.35955056, "global_step/max_steps": "84/233", "percentage": "36.05%", "elapsed_time": "3h 5m 40s", "remaining_time": "5h 29m 20s"}
|
| 93 |
-
{"loss": 0.51419246, "token_acc": 0.82768103, "grad_norm": 0.16430534, "learning_rate": 6.35e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007545, "epoch": 0.36383093, "global_step/max_steps": "85/233", "percentage": "36.48%", "elapsed_time": "3h 7m 25s", "remaining_time": "5h 26m 20s"}
|
| 94 |
-
{"loss": 0.48613182, "token_acc": 0.83516323, "grad_norm": 0.12657382, "learning_rate": 6.31e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007581, "epoch": 0.36811129, "global_step/max_steps": "86/233", "percentage": "36.91%", "elapsed_time": "3h 8m 44s", "remaining_time": "5h 22m 36s"}
|
| 95 |
-
{"loss": 0.49069044, "token_acc": 0.84284933, "grad_norm": 0.12505151, "learning_rate": 6.27e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007583, "epoch": 0.37239165, "global_step/max_steps": "87/233", "percentage": "37.34%", "elapsed_time": "3h 10m 52s", "remaining_time": "5h 20m 19s"}
|
| 96 |
-
{"loss": 0.53458667, "token_acc": 0.82781131, "grad_norm": 0.14310487, "learning_rate": 6.22e-06, "memory(GiB)": 127.0, "train_speed(iter/s)": 0.007628, "epoch": 0.37667202, "global_step/max_steps": "88/233", "percentage": "37.77%", "elapsed_time": "3h 11m 56s", "remaining_time": "5h 16m 15s"}
|
| 97 |
-
{"loss": 0.50245881, "token_acc": 0.8347507, "grad_norm": 0.12842128, "learning_rate": 6.18e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007621, "epoch": 0.38095238, "global_step/max_steps": "89/233", "percentage": "38.20%", "elapsed_time": "3h 14m 18s", "remaining_time": "5h 14m 22s"}
|
| 98 |
-
{"loss": 0.48739982, "token_acc": 0.82806244, "grad_norm": 0.1115669, "learning_rate": 6.14e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007656, "epoch": 0.38523274, "global_step/max_steps": "90/233", "percentage": "38.63%", "elapsed_time": "3h 15m 34s", "remaining_time": "5h 10m 44s"}
|
| 99 |
-
{"eval_loss": 0.51217604, "eval_token_acc": 0.83252966, "eval_runtime": 235.5128, "eval_samples_per_second": 1.962, "eval_steps_per_second": 0.246, "epoch": 0.38523274, "global_step/max_steps": "90/233", "percentage": "38.63%", "elapsed_time": "3h 19m 29s", "remaining_time": "5h 16m 58s"}
|
| 100 |
-
{"loss": 0.49240842, "token_acc": 0.83577156, "grad_norm": 0.23490313, "learning_rate": 6.09e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007498, "epoch": 0.38951311, "global_step/max_steps": "91/233", "percentage": "39.06%", "elapsed_time": "3h 21m 55s", "remaining_time": "5h 15m 5s"}
|
| 101 |
-
{"loss": 0.52370095, "token_acc": 0.83104296, "grad_norm": 0.12524754, "learning_rate": 6.05e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007484, "epoch": 0.39379347, "global_step/max_steps": "92/233", "percentage": "39.48%", "elapsed_time": "3h 24m 32s", "remaining_time": "5h 13m 28s"}
|
| 102 |
-
{"loss": 0.52046466, "token_acc": 0.8491291, "grad_norm": 0.11728439, "learning_rate": 6.01e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00748, "epoch": 0.39807384, "global_step/max_steps": "93/233", "percentage": "39.91%", "elapsed_time": "3h 26m 51s", "remaining_time": "5h 11m 24s"}
|
| 103 |
-
{"loss": 0.50553274, "token_acc": 0.85265656, "grad_norm": 0.12955283, "learning_rate": 5.97e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007472, "epoch": 0.4023542, "global_step/max_steps": "94/233", "percentage": "40.34%", "elapsed_time": "3h 29m 19s", "remaining_time": "5h 9m 32s"}
|
| 104 |
-
{"loss": 0.49560741, "token_acc": 0.85768936, "grad_norm": 0.14827773, "learning_rate": 5.92e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007466, "epoch": 0.40663456, "global_step/max_steps": "95/233", "percentage": "40.77%", "elapsed_time": "3h 31m 43s", "remaining_time": "5h 7m 33s"}
|
| 105 |
-
{"loss": 0.51661247, "token_acc": 0.84913022, "grad_norm": 0.14370561, "learning_rate": 5.88e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007502, "epoch": 0.41091493, "global_step/max_steps": "96/233", "percentage": "41.20%", "elapsed_time": "3h 32m 55s", "remaining_time": "5h 3m 51s"}
|
| 106 |
-
{"loss": 0.49931169, "token_acc": 0.85026145, "grad_norm": 0.12156764, "learning_rate": 5.84e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00752, "epoch": 0.41519529, "global_step/max_steps": "97/233", "percentage": "41.63%", "elapsed_time": "3h 34m 37s", "remaining_time": "5h 0m 55s"}
|
| 107 |
-
{"loss": 0.55132127, "token_acc": 0.83299285, "grad_norm": 0.16988352, "learning_rate": 5.79e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007546, "epoch": 0.41947566, "global_step/max_steps": "98/233", "percentage": "42.06%", "elapsed_time": "3h 36m 5s", "remaining_time": "4h 57m 41s"}
|
| 108 |
-
{"loss": 0.54132557, "token_acc": 0.84194577, "grad_norm": 0.17169492, "learning_rate": 5.75e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007519, "epoch": 0.42375602, "global_step/max_steps": "99/233", "percentage": "42.49%", "elapsed_time": "3h 39m 5s", "remaining_time": "4h 56m 33s"}
|
| 109 |
-
{"loss": 0.51719457, "token_acc": 0.83995948, "grad_norm": 0.14107411, "learning_rate": 5.71e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007536, "epoch": 0.42803638, "global_step/max_steps": "100/233", "percentage": "42.92%", "elapsed_time": "3h 40m 49s", "remaining_time": "4h 53m 41s"}
|
| 110 |
-
{"eval_loss": 0.51198262, "eval_token_acc": 0.83308328, "eval_runtime": 235.963, "eval_samples_per_second": 1.958, "eval_steps_per_second": 0.246, "epoch": 0.42803638, "global_step/max_steps": "100/233", "percentage": "42.92%", "elapsed_time": "3h 44m 45s", "remaining_time": "4h 58m 55s"}
|
| 111 |
-
{"loss": 0.51422942, "token_acc": 0.83764542, "grad_norm": 0.43609548, "learning_rate": 5.67e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.0074, "epoch": 0.43231675, "global_step/max_steps": "101/233", "percentage": "43.35%", "elapsed_time": "3h 47m 7s", "remaining_time": "4h 56m 50s"}
|
| 112 |
-
{"loss": 0.47127306, "token_acc": 0.83849191, "grad_norm": 0.22676709, "learning_rate": 5.62e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007445, "epoch": 0.43659711, "global_step/max_steps": "102/233", "percentage": "43.78%", "elapsed_time": "3h 47m 59s", "remaining_time": "4h 52m 49s"}
|
| 113 |
-
{"loss": 0.52837503, "token_acc": 0.83819166, "grad_norm": 0.12363542, "learning_rate": 5.58e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00747, "epoch": 0.44087747, "global_step/max_steps": "103/233", "percentage": "44.21%", "elapsed_time": "3h 49m 28s", "remaining_time": "4h 49m 37s"}
|
| 114 |
-
{"loss": 0.48837972, "token_acc": 0.85860214, "grad_norm": 0.10835283, "learning_rate": 5.54e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007479, "epoch": 0.44515784, "global_step/max_steps": "104/233", "percentage": "44.64%", "elapsed_time": "3h 51m 24s", "remaining_time": "4h 47m 2s"}
|
| 115 |
-
{"loss": 0.47488719, "token_acc": 0.84470192, "grad_norm": 0.13628873, "learning_rate": 5.49e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007511, "epoch": 0.4494382, "global_step/max_steps": "105/233", "percentage": "45.06%", "elapsed_time": "3h 52m 39s", "remaining_time": "4h 43m 37s"}
|
| 116 |
-
{"loss": 0.48864883, "token_acc": 0.85257826, "grad_norm": 0.15452881, "learning_rate": 5.45e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00755, "epoch": 0.45371857, "global_step/max_steps": "106/233", "percentage": "45.49%", "elapsed_time": "3h 53m 39s", "remaining_time": "4h 39m 56s"}
|
| 117 |
-
{"loss": 0.57554269, "token_acc": 0.83283159, "grad_norm": 0.13806233, "learning_rate": 5.41e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007562, "epoch": 0.45799893, "global_step/max_steps": "107/233", "percentage": "45.92%", "elapsed_time": "3h 55m 29s", "remaining_time": "4h 37m 18s"}
|
| 118 |
-
{"loss": 0.53410757, "token_acc": 0.83494514, "grad_norm": 0.13137825, "learning_rate": 5.36e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007562, "epoch": 0.46227929, "global_step/max_steps": "108/233", "percentage": "46.35%", "elapsed_time": "3h 57m 41s", "remaining_time": "4h 35m 6s"}
|
| 119 |
-
{"loss": 0.49687245, "token_acc": 0.84851163, "grad_norm": 0.12582642, "learning_rate": 5.32e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007593, "epoch": 0.46655966, "global_step/max_steps": "109/233", "percentage": "46.78%", "elapsed_time": "3h 58m 54s", "remaining_time": "4h 31m 47s"}
|
| 120 |
-
{"loss": 0.46854696, "token_acc": 0.84283071, "grad_norm": 0.12569916, "learning_rate": 5.28e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007634, "epoch": 0.47084002, "global_step/max_steps": "110/233", "percentage": "47.21%", "elapsed_time": "3h 59m 48s", "remaining_time": "4h 28m 8s"}
|
| 121 |
-
{"eval_loss": 0.51227695, "eval_token_acc": 0.83273097, "eval_runtime": 235.8399, "eval_samples_per_second": 1.959, "eval_steps_per_second": 0.246, "epoch": 0.47084002, "global_step/max_steps": "110/233", "percentage": "47.21%", "elapsed_time": "4h 3m 44s", "remaining_time": "4h 32m 32s"}
|
| 122 |
-
{"loss": 0.50789607, "token_acc": 0.83416802, "grad_norm": 0.15211298, "learning_rate": 5.24e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007494, "epoch": 0.47512039, "global_step/max_steps": "111/233", "percentage": "47.64%", "elapsed_time": "4h 6m 32s", "remaining_time": "4h 30m 57s"}
|
| 123 |
-
{"loss": 0.54684865, "token_acc": 0.84827946, "grad_norm": 0.1417892, "learning_rate": 5.19e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007504, "epoch": 0.47940075, "global_step/max_steps": "112/233", "percentage": "48.07%", "elapsed_time": "4h 8m 24s", "remaining_time": "4h 28m 22s"}
|
| 124 |
-
{"loss": 0.49388218, "token_acc": 0.84526266, "grad_norm": 0.16174501, "learning_rate": 5.15e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007521, "epoch": 0.48368111, "global_step/max_steps": "113/233", "percentage": "48.50%", "elapsed_time": "4h 10m 4s", "remaining_time": "4h 25m 33s"}
|
| 125 |
-
{"loss": 0.55895323, "token_acc": 0.81881029, "grad_norm": 0.13975629, "learning_rate": 5.11e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007523, "epoch": 0.48796148, "global_step/max_steps": "114/233", "percentage": "48.93%", "elapsed_time": "4h 12m 12s", "remaining_time": "4h 23m 16s"}
|
| 126 |
-
{"loss": 0.83853632, "token_acc": 0.8404093, "grad_norm": 0.15562704, "learning_rate": 5.06e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007548, "epoch": 0.49224184, "global_step/max_steps": "115/233", "percentage": "49.36%", "elapsed_time": "4h 13m 34s", "remaining_time": "4h 20m 11s"}
|
| 127 |
-
{"loss": 0.51695478, "token_acc": 0.84878812, "grad_norm": 0.12231343, "learning_rate": 5.02e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007522, "epoch": 0.4965222, "global_step/max_steps": "116/233", "percentage": "49.79%", "elapsed_time": "4h 16m 41s", "remaining_time": "4h 18m 53s"}
|
| 128 |
-
{"loss": 0.47607303, "token_acc": 0.82846102, "grad_norm": 0.12620105, "learning_rate": 4.98e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007548, "epoch": 0.50080257, "global_step/max_steps": "117/233", "percentage": "50.21%", "elapsed_time": "4h 18m 0s", "remaining_time": "4h 15m 48s"}
|
| 129 |
-
{"loss": 0.53858316, "token_acc": 0.84895453, "grad_norm": 0.13305934, "learning_rate": 4.94e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007582, "epoch": 0.50508293, "global_step/max_steps": "118/233", "percentage": "50.64%", "elapsed_time": "4h 19m 2s", "remaining_time": "4h 12m 27s"}
|
| 130 |
-
{"loss": 0.5041815, "token_acc": 0.84877242, "grad_norm": 0.30841193, "learning_rate": 4.89e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007579, "epoch": 0.5093633, "global_step/max_steps": "119/233", "percentage": "51.07%", "elapsed_time": "4h 21m 21s", "remaining_time": "4h 10m 22s"}
|
| 131 |
-
{"loss": 0.507828, "token_acc": 0.8459375, "grad_norm": 0.10909942, "learning_rate": 4.85e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00756, "epoch": 0.51364366, "global_step/max_steps": "120/233", "percentage": "51.50%", "elapsed_time": "4h 24m 13s", "remaining_time": "4h 8m 48s"}
|
| 132 |
-
{"eval_loss": 0.51221395, "eval_token_acc": 0.83292141, "eval_runtime": 235.8236, "eval_samples_per_second": 1.959, "eval_steps_per_second": 0.246, "epoch": 0.51364366, "global_step/max_steps": "120/233", "percentage": "51.50%", "elapsed_time": "4h 28m 8s", "remaining_time": "4h 12m 30s"}
|
| 133 |
-
{"loss": 0.47961944, "token_acc": 0.83604326, "grad_norm": 0.11930238, "learning_rate": 4.81e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007466, "epoch": 0.51792402, "global_step/max_steps": "121/233", "percentage": "51.93%", "elapsed_time": "4h 29m 45s", "remaining_time": "4h 9m 41s"}
|
| 134 |
-
{"loss": 0.50552303, "token_acc": 0.84073083, "grad_norm": 0.126531, "learning_rate": 4.76e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007484, "epoch": 0.52220439, "global_step/max_steps": "122/233", "percentage": "52.36%", "elapsed_time": "4h 31m 21s", "remaining_time": "4h 6m 53s"}
|
| 135 |
-
{"loss": 0.49100709, "token_acc": 0.83426728, "grad_norm": 0.20472339, "learning_rate": 4.72e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007507, "epoch": 0.52648475, "global_step/max_steps": "123/233", "percentage": "52.79%", "elapsed_time": "4h 32m 43s", "remaining_time": "4h 3m 54s"}
|
| 136 |
-
{"loss": 0.50948769, "token_acc": 0.84956789, "grad_norm": 0.12975369, "learning_rate": 4.68e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007501, "epoch": 0.53076512, "global_step/max_steps": "124/233", "percentage": "53.22%", "elapsed_time": "4h 35m 11s", "remaining_time": "4h 1m 54s"}
|
| 137 |
-
{"loss": 0.50430632, "token_acc": 0.85928195, "grad_norm": 0.12879616, "learning_rate": 4.64e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007531, "epoch": 0.53504548, "global_step/max_steps": "125/233", "percentage": "53.65%", "elapsed_time": "4h 36m 16s", "remaining_time": "3h 58m 42s"}
|
| 138 |
-
{"loss": 0.54828668, "token_acc": 0.81168203, "grad_norm": 0.14827292, "learning_rate": 4.59e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007542, "epoch": 0.53932584, "global_step/max_steps": "126/233", "percentage": "54.08%", "elapsed_time": "4h 38m 6s", "remaining_time": "3h 56m 10s"}
|
| 139 |
-
{"loss": 0.50712746, "token_acc": 0.86262105, "grad_norm": 0.12914628, "learning_rate": 4.55e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007561, "epoch": 0.54360621, "global_step/max_steps": "127/233", "percentage": "54.51%", "elapsed_time": "4h 39m 36s", "remaining_time": "3h 53m 22s"}
|
| 140 |
-
{"loss": 0.54310966, "token_acc": 0.83741613, "grad_norm": 0.13367902, "learning_rate": 4.51e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007573, "epoch": 0.54788657, "global_step/max_steps": "128/233", "percentage": "54.94%", "elapsed_time": "4h 41m 22s", "remaining_time": "3h 50m 49s"}
|
| 141 |
-
{"loss": 0.52424949, "token_acc": 0.84507659, "grad_norm": 0.13046494, "learning_rate": 4.46e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007584, "epoch": 0.55216693, "global_step/max_steps": "129/233", "percentage": "55.36%", "elapsed_time": "4h 43m 9s", "remaining_time": "3h 48m 17s"}
|
| 142 |
-
{"loss": 0.48390692, "token_acc": 0.84203655, "grad_norm": 0.1205571, "learning_rate": 4.42e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007606, "epoch": 0.5564473, "global_step/max_steps": "130/233", "percentage": "55.79%", "elapsed_time": "4h 44m 32s", "remaining_time": "3h 45m 26s"}
|
| 143 |
-
{"eval_loss": 0.51123512, "eval_token_acc": 0.83316761, "eval_runtime": 236.3646, "eval_samples_per_second": 1.955, "eval_steps_per_second": 0.245, "epoch": 0.5564473, "global_step/max_steps": "130/233", "percentage": "55.79%", "elapsed_time": "4h 48m 28s", "remaining_time": "3h 48m 33s"}
|
| 144 |
-
{"loss": 0.496582, "token_acc": 0.83325523, "grad_norm": 0.18167204, "learning_rate": 4.38e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007495, "epoch": 0.56072766, "global_step/max_steps": "131/233", "percentage": "56.22%", "elapsed_time": "4h 50m 58s", "remaining_time": "3h 46m 33s"}
|
| 145 |
-
{"loss": 0.50854313, "token_acc": 0.81173944, "grad_norm": 0.13208148, "learning_rate": 4.33e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007527, "epoch": 0.56500803, "global_step/max_steps": "132/233", "percentage": "56.65%", "elapsed_time": "4h 51m 56s", "remaining_time": "3h 43m 22s"}
|
| 146 |
-
{"loss": 0.49582225, "token_acc": 0.80062467, "grad_norm": 0.14537254, "learning_rate": 4.29e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007546, "epoch": 0.56928839, "global_step/max_steps": "133/233", "percentage": "57.08%", "elapsed_time": "4h 53m 23s", "remaining_time": "3h 40m 35s"}
|
| 147 |
-
{"loss": 0.4956243, "token_acc": 0.85303579, "grad_norm": 0.13708043, "learning_rate": 4.25e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007578, "epoch": 0.57356875, "global_step/max_steps": "134/233", "percentage": "57.51%", "elapsed_time": "4h 54m 23s", "remaining_time": "3h 37m 29s"}
|
| 148 |
-
{"loss": 0.52060252, "token_acc": 0.83481248, "grad_norm": 0.12764537, "learning_rate": 4.21e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007578, "epoch": 0.57784912, "global_step/max_steps": "135/233", "percentage": "57.94%", "elapsed_time": "4h 56m 35s", "remaining_time": "3h 35m 17s"}
|
| 149 |
-
{"loss": 0.50756574, "token_acc": 0.86122068, "grad_norm": 1.14753449, "learning_rate": 4.16e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007585, "epoch": 0.58212948, "global_step/max_steps": "136/233", "percentage": "58.37%", "elapsed_time": "4h 58m 29s", "remaining_time": "3h 32m 53s"}
|
| 150 |
-
{"loss": 0.51690817, "token_acc": 0.82105263, "grad_norm": 0.15368675, "learning_rate": 4.12e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007613, "epoch": 0.58640984, "global_step/max_steps": "137/233", "percentage": "58.80%", "elapsed_time": "4h 59m 34s", "remaining_time": "3h 29m 55s"}
|
| 151 |
-
{"loss": 0.48402226, "token_acc": 0.81125052, "grad_norm": 0.14295113, "learning_rate": 4.08e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007629, "epoch": 0.59069021, "global_step/max_steps": "138/233", "percentage": "59.23%", "elapsed_time": "5h 1m 9s", "remaining_time": "3h 27m 18s"}
|
| 152 |
-
{"loss": 0.53648186, "token_acc": 0.85190271, "grad_norm": 0.11894955, "learning_rate": 4.03e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007616, "epoch": 0.59497057, "global_step/max_steps": "139/233", "percentage": "59.66%", "elapsed_time": "5h 3m 49s", "remaining_time": "3h 25m 27s"}
|
| 153 |
-
{"loss": 0.51075351, "token_acc": 0.84169821, "grad_norm": 0.10679007, "learning_rate": 3.99e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007607, "epoch": 0.59925094, "global_step/max_steps": "140/233", "percentage": "60.09%", "elapsed_time": "5h 6m 24s", "remaining_time": "3h 23m 32s"}
|
| 154 |
-
{"eval_loss": 0.5122714, "eval_token_acc": 0.83310912, "eval_runtime": 235.5678, "eval_samples_per_second": 1.961, "eval_steps_per_second": 0.246, "epoch": 0.59925094, "global_step/max_steps": "140/233", "percentage": "60.09%", "elapsed_time": "5h 10m 19s", "remaining_time": "3h 26m 8s"}
|
| 155 |
-
{"loss": 0.49892116, "token_acc": 0.8350151, "grad_norm": 0.13105898, "learning_rate": 3.95e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00751, "epoch": 0.6035313, "global_step/max_steps": "141/233", "percentage": "60.52%", "elapsed_time": "5h 12m 34s", "remaining_time": "3h 23m 56s"}
|
| 156 |
-
{"loss": 0.51705021, "token_acc": 0.83333333, "grad_norm": 0.14186479, "learning_rate": 3.91e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007536, "epoch": 0.60781166, "global_step/max_steps": "142/233", "percentage": "60.94%", "elapsed_time": "5h 13m 41s", "remaining_time": "3h 21m 1s"}
|
| 157 |
-
{"loss": 0.50113285, "token_acc": 0.84346567, "grad_norm": 0.21789369, "learning_rate": 3.86e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007562, "epoch": 0.61209203, "global_step/max_steps": "143/233", "percentage": "61.37%", "elapsed_time": "5h 14m 50s", "remaining_time": "3h 18m 9s"}
|
| 158 |
-
{"loss": 0.49362943, "token_acc": 0.84292541, "grad_norm": 0.13137925, "learning_rate": 3.82e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007538, "epoch": 0.61637239, "global_step/max_steps": "144/233", "percentage": "61.80%", "elapsed_time": "5h 18m 3s", "remaining_time": "3h 16m 34s"}
|
| 159 |
-
{"loss": 0.46474981, "token_acc": 0.85470206, "grad_norm": 0.15578702, "learning_rate": 3.78e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007547, "epoch": 0.62065276, "global_step/max_steps": "145/233", "percentage": "62.23%", "elapsed_time": "5h 19m 51s", "remaining_time": "3h 14m 7s"}
|
| 160 |
-
{"loss": 0.51430953, "token_acc": 0.83872761, "grad_norm": 0.13705236, "learning_rate": 3.73e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007548, "epoch": 0.62493312, "global_step/max_steps": "146/233", "percentage": "62.66%", "elapsed_time": "5h 22m 3s", "remaining_time": "3h 11m 54s"}
|
| 161 |
-
{"loss": 0.50783074, "token_acc": 0.84250942, "grad_norm": 0.19945437, "learning_rate": 3.69e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007558, "epoch": 0.62921348, "global_step/max_steps": "147/233", "percentage": "63.09%", "elapsed_time": "5h 23m 48s", "remaining_time": "3h 9m 26s"}
|
| 162 |
-
{"loss": 0.49359503, "token_acc": 0.85719333, "grad_norm": 0.24197783, "learning_rate": 3.65e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007561, "epoch": 0.63349385, "global_step/max_steps": "148/233", "percentage": "63.52%", "elapsed_time": "5h 25m 53s", "remaining_time": "3h 7m 10s"}
|
| 163 |
-
{"loss": 0.4955087, "token_acc": 0.84956723, "grad_norm": 0.1309997, "learning_rate": 3.61e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007584, "epoch": 0.63777421, "global_step/max_steps": "149/233", "percentage": "63.95%", "elapsed_time": "5h 27m 5s", "remaining_time": "3h 4m 24s"}
|
| 164 |
-
{"loss": 0.52520931, "token_acc": 0.83175096, "grad_norm": 0.11741098, "learning_rate": 3.56e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007576, "epoch": 0.64205457, "global_step/max_steps": "150/233", "percentage": "64.38%", "elapsed_time": "5h 29m 38s", "remaining_time": "3h 2m 23s"}
|
| 165 |
-
{"eval_loss": 0.51157564, "eval_token_acc": 0.83332132, "eval_runtime": 235.6884, "eval_samples_per_second": 1.96, "eval_steps_per_second": 0.246, "epoch": 0.64205457, "global_step/max_steps": "150/233", "percentage": "64.38%", "elapsed_time": "5h 33m 33s", "remaining_time": "3h 4m 34s"}
|
| 166 |
-
{"loss": 0.50126874, "token_acc": 0.83641022, "grad_norm": 0.13107127, "learning_rate": 3.52e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007486, "epoch": 0.64633494, "global_step/max_steps": "151/233", "percentage": "64.81%", "elapsed_time": "5h 35m 49s", "remaining_time": "3h 2m 22s"}
|
| 167 |
-
{"loss": 0.49819481, "token_acc": 0.83685292, "grad_norm": 0.12817332, "learning_rate": 3.48e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007499, "epoch": 0.6506153, "global_step/max_steps": "152/233", "percentage": "65.24%", "elapsed_time": "5h 37m 29s", "remaining_time": "2h 59m 50s"}
|
| 168 |
-
{"loss": 0.50091374, "token_acc": 0.81324545, "grad_norm": 0.13367279, "learning_rate": 3.43e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007511, "epoch": 0.65489567, "global_step/max_steps": "153/233", "percentage": "65.67%", "elapsed_time": "5h 39m 9s", "remaining_time": "2h 57m 20s"}
|
| 169 |
-
{"loss": 0.51562655, "token_acc": 0.8242413, "grad_norm": 0.12840405, "learning_rate": 3.39e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007529, "epoch": 0.65917603, "global_step/max_steps": "154/233", "percentage": "66.09%", "elapsed_time": "5h 40m 33s", "remaining_time": "2h 54m 41s"}
|
| 170 |
-
{"loss": 0.50541002, "token_acc": 0.84841673, "grad_norm": 0.14733191, "learning_rate": 3.35e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007528, "epoch": 0.66345639, "global_step/max_steps": "155/233", "percentage": "66.52%", "elapsed_time": "5h 42m 48s", "remaining_time": "2h 52m 30s"}
|
| 171 |
-
{"loss": 0.49965221, "token_acc": 0.84833804, "grad_norm": 0.11884115, "learning_rate": 3.3e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007537, "epoch": 0.66773676, "global_step/max_steps": "156/233", "percentage": "66.95%", "elapsed_time": "5h 44m 36s", "remaining_time": "2h 50m 5s"}
|
| 172 |
-
{"loss": 0.50881106, "token_acc": 0.81733524, "grad_norm": 0.13272379, "learning_rate": 3.26e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007563, "epoch": 0.67201712, "global_step/max_steps": "157/233", "percentage": "67.38%", "elapsed_time": "5h 45m 38s", "remaining_time": "2h 47m 19s"}
|
| 173 |
-
{"loss": 0.50714695, "token_acc": 0.84222865, "grad_norm": 0.14587349, "learning_rate": 3.22e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007585, "epoch": 0.67629749, "global_step/max_steps": "158/233", "percentage": "67.81%", "elapsed_time": "5h 46m 50s", "remaining_time": "2h 44m 38s"}
|
| 174 |
-
{"loss": 0.50509226, "token_acc": 0.84222363, "grad_norm": 0.1192864, "learning_rate": 3.18e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007576, "epoch": 0.68057785, "global_step/max_steps": "159/233", "percentage": "68.24%", "elapsed_time": "5h 49m 25s", "remaining_time": "2h 42m 37s"}
|
| 175 |
-
{"loss": 0.52008939, "token_acc": 0.81305959, "grad_norm": 0.13223591, "learning_rate": 3.13e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007593, "epoch": 0.68485821, "global_step/max_steps": "160/233", "percentage": "68.67%", "elapsed_time": "5h 50m 51s", "remaining_time": "2h 40m 4s"}
|
| 176 |
-
{"eval_loss": 0.51053989, "eval_token_acc": 0.83335124, "eval_runtime": 235.8011, "eval_samples_per_second": 1.959, "eval_steps_per_second": 0.246, "epoch": 0.68485821, "global_step/max_steps": "160/233", "percentage": "68.67%", "elapsed_time": "5h 54m 46s", "remaining_time": "2h 41m 52s"}
|
| 177 |
-
{"loss": 0.52317727, "token_acc": 0.83413149, "grad_norm": 0.12414582, "learning_rate": 3.09e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007483, "epoch": 0.68913858, "global_step/max_steps": "161/233", "percentage": "69.10%", "elapsed_time": "5h 58m 13s", "remaining_time": "2h 40m 12s"}
|
| 178 |
-
{"loss": 0.476078, "token_acc": 0.83058201, "grad_norm": 0.10707334, "learning_rate": 3.05e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007475, "epoch": 0.69341894, "global_step/max_steps": "162/233", "percentage": "69.53%", "elapsed_time": "6h 0m 52s", "remaining_time": "2h 38m 9s"}
|
| 179 |
-
{"loss": 0.48284999, "token_acc": 0.83550349, "grad_norm": 0.11075233, "learning_rate": 3e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007471, "epoch": 0.6976993, "global_step/max_steps": "163/233", "percentage": "69.96%", "elapsed_time": "6h 3m 16s", "remaining_time": "2h 36m 0s"}
|
| 180 |
-
{"loss": 0.48539856, "token_acc": 0.85493779, "grad_norm": 0.12864996, "learning_rate": 2.96e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00749, "epoch": 0.70197967, "global_step/max_steps": "164/233", "percentage": "70.39%", "elapsed_time": "6h 4m 36s", "remaining_time": "2h 33m 24s"}
|
| 181 |
-
{"loss": 0.52488422, "token_acc": 0.83583838, "grad_norm": 0.1258402, "learning_rate": 2.92e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007484, "epoch": 0.70626003, "global_step/max_steps": "165/233", "percentage": "70.82%", "elapsed_time": "6h 7m 5s", "remaining_time": "2h 31m 17s"}
|
| 182 |
-
{"loss": 0.51129127, "token_acc": 0.83417319, "grad_norm": 0.18856949, "learning_rate": 2.88e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007498, "epoch": 0.7105404, "global_step/max_steps": "166/233", "percentage": "71.24%", "elapsed_time": "6h 8m 38s", "remaining_time": "2h 28m 47s"}
|
| 183 |
-
{"loss": 0.52113378, "token_acc": 0.82794905, "grad_norm": 0.15255888, "learning_rate": 2.83e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007512, "epoch": 0.71482076, "global_step/max_steps": "167/233", "percentage": "71.67%", "elapsed_time": "6h 10m 10s", "remaining_time": "2h 26m 17s"}
|
| 184 |
-
{"loss": 0.48344895, "token_acc": 0.84987893, "grad_norm": 0.22295879, "learning_rate": 2.79e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007517, "epoch": 0.71910112, "global_step/max_steps": "168/233", "percentage": "72.10%", "elapsed_time": "6h 12m 9s", "remaining_time": "2h 23m 59s"}
|
| 185 |
-
{"loss": 0.51115608, "token_acc": 0.85642846, "grad_norm": 0.15013304, "learning_rate": 2.75e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007544, "epoch": 0.72338149, "global_step/max_steps": "169/233", "percentage": "72.53%", "elapsed_time": "6h 13m 2s", "remaining_time": "2h 21m 16s"}
|
| 186 |
-
{"loss": 0.51714134, "token_acc": 0.83837247, "grad_norm": 0.13112898, "learning_rate": 2.7e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007541, "epoch": 0.72766185, "global_step/max_steps": "170/233", "percentage": "72.96%", "elapsed_time": "6h 15m 23s", "remaining_time": "2h 19m 7s"}
|
| 187 |
-
{"eval_loss": 0.50959533, "eval_token_acc": 0.83369538, "eval_runtime": 235.5226, "eval_samples_per_second": 1.962, "eval_steps_per_second": 0.246, "epoch": 0.72766185, "global_step/max_steps": "170/233", "percentage": "72.96%", "elapsed_time": "6h 19m 19s", "remaining_time": "2h 20m 34s"}
|
| 188 |
-
{"loss": 0.54881608, "token_acc": 0.8329001, "grad_norm": 0.16292445, "learning_rate": 2.66e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007464, "epoch": 0.73194222, "global_step/max_steps": "171/233", "percentage": "73.39%", "elapsed_time": "6h 21m 27s", "remaining_time": "2h 18m 18s"}
|
| 189 |
-
{"loss": 0.55621934, "token_acc": 0.83621385, "grad_norm": 0.14366055, "learning_rate": 2.62e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00746, "epoch": 0.73622258, "global_step/max_steps": "172/233", "percentage": "73.82%", "elapsed_time": "6h 23m 56s", "remaining_time": "2h 16m 10s"}
|
| 190 |
-
{"loss": 0.49691424, "token_acc": 0.85122178, "grad_norm": 0.60208136, "learning_rate": 2.58e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007466, "epoch": 0.74050294, "global_step/max_steps": "173/233", "percentage": "74.25%", "elapsed_time": "6h 25m 50s", "remaining_time": "2h 13m 49s"}
|
| 191 |
-
{"loss": 0.48890877, "token_acc": 0.88896396, "grad_norm": 0.11797956, "learning_rate": 2.53e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007477, "epoch": 0.74478331, "global_step/max_steps": "174/233", "percentage": "74.68%", "elapsed_time": "6h 27m 32s", "remaining_time": "2h 11m 24s"}
|
| 192 |
-
{"loss": 0.48969388, "token_acc": 0.85049943, "grad_norm": 0.14641684, "learning_rate": 2.49e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007499, "epoch": 0.74906367, "global_step/max_steps": "175/233", "percentage": "75.11%", "elapsed_time": "6h 28m 35s", "remaining_time": "2h 8m 47s"}
|
| 193 |
-
{"loss": 0.49373892, "token_acc": 0.85778095, "grad_norm": 0.13159955, "learning_rate": 2.45e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007511, "epoch": 0.75334403, "global_step/max_steps": "176/233", "percentage": "75.54%", "elapsed_time": "6h 30m 12s", "remaining_time": "2h 6m 22s"}
|
| 194 |
-
{"loss": 0.50072569, "token_acc": 0.84314664, "grad_norm": 0.15253282, "learning_rate": 2.4e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007525, "epoch": 0.7576244, "global_step/max_steps": "177/233", "percentage": "75.97%", "elapsed_time": "6h 31m 42s", "remaining_time": "2h 3m 55s"}
|
| 195 |
-
{"loss": 0.49835771, "token_acc": 0.85106974, "grad_norm": 0.12960319, "learning_rate": 2.36e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007537, "epoch": 0.76190476, "global_step/max_steps": "178/233", "percentage": "76.39%", "elapsed_time": "6h 33m 16s", "remaining_time": "2h 1m 30s"}
|
| 196 |
-
{"loss": 0.53520441, "token_acc": 0.84272897, "grad_norm": 0.19155194, "learning_rate": 2.32e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007543, "epoch": 0.76618513, "global_step/max_steps": "179/233", "percentage": "76.82%", "elapsed_time": "6h 35m 10s", "remaining_time": "1h 59m 12s"}
|
| 197 |
-
{"loss": 0.50994658, "token_acc": 0.83529193, "grad_norm": 0.1294762, "learning_rate": 2.27e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00755, "epoch": 0.77046549, "global_step/max_steps": "180/233", "percentage": "77.25%", "elapsed_time": "6h 37m 1s", "remaining_time": "1h 56m 54s"}
|
| 198 |
-
{"eval_loss": 0.50943142, "eval_token_acc": 0.83370355, "eval_runtime": 236.1071, "eval_samples_per_second": 1.957, "eval_steps_per_second": 0.246, "epoch": 0.77046549, "global_step/max_steps": "180/233", "percentage": "77.25%", "elapsed_time": "6h 40m 57s", "remaining_time": "1h 58m 3s"}
|
| 199 |
-
{"loss": 0.50555718, "token_acc": 0.83636152, "grad_norm": 0.13209662, "learning_rate": 2.23e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007464, "epoch": 0.77474585, "global_step/max_steps": "181/233", "percentage": "77.68%", "elapsed_time": "6h 43m 48s", "remaining_time": "1h 56m 0s"}
|
| 200 |
-
{"loss": 0.53007305, "token_acc": 0.84191974, "grad_norm": 0.1212738, "learning_rate": 2.19e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007475, "epoch": 0.77902622, "global_step/max_steps": "182/233", "percentage": "78.11%", "elapsed_time": "6h 45m 27s", "remaining_time": "1h 53m 37s"}
|
| 201 |
-
{"loss": 0.53875828, "token_acc": 0.81942612, "grad_norm": 0.14762594, "learning_rate": 2.15e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00748, "epoch": 0.78330658, "global_step/max_steps": "183/233", "percentage": "78.54%", "elapsed_time": "6h 47m 23s", "remaining_time": "1h 51m 18s"}
|
| 202 |
-
{"loss": 0.51536155, "token_acc": 0.81711541, "grad_norm": 0.16659674, "learning_rate": 2.1e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00748, "epoch": 0.78758694, "global_step/max_steps": "184/233", "percentage": "78.97%", "elapsed_time": "6h 49m 37s", "remaining_time": "1h 49m 4s"}
|
| 203 |
-
{"loss": 0.48481408, "token_acc": 0.84935853, "grad_norm": 0.11972429, "learning_rate": 2.06e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007489, "epoch": 0.79186731, "global_step/max_steps": "185/233", "percentage": "79.40%", "elapsed_time": "6h 51m 22s", "remaining_time": "1h 46m 44s"}
|
| 204 |
-
{"loss": 0.49308038, "token_acc": 0.83884662, "grad_norm": 0.10787959, "learning_rate": 2.02e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007498, "epoch": 0.79614767, "global_step/max_steps": "186/233", "percentage": "79.83%", "elapsed_time": "6h 53m 7s", "remaining_time": "1h 44m 23s"}
|
| 205 |
-
{"loss": 0.49464846, "token_acc": 0.84874494, "grad_norm": 0.13731882, "learning_rate": 1.97e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00751, "epoch": 0.80042804, "global_step/max_steps": "187/233", "percentage": "80.26%", "elapsed_time": "6h 54m 39s", "remaining_time": "1h 42m 0s"}
|
| 206 |
-
{"loss": 0.48671663, "token_acc": 0.84075687, "grad_norm": 0.10981249, "learning_rate": 1.93e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007512, "epoch": 0.8047084, "global_step/max_steps": "188/233", "percentage": "80.69%", "elapsed_time": "6h 56m 44s", "remaining_time": "1h 39m 45s"}
|
| 207 |
-
{"loss": 0.48544246, "token_acc": 0.84682673, "grad_norm": 0.12859865, "learning_rate": 1.89e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00753, "epoch": 0.80898876, "global_step/max_steps": "189/233", "percentage": "81.12%", "elapsed_time": "6h 58m 0s", "remaining_time": "1h 37m 18s"}
|
| 208 |
-
{"loss": 0.5006417, "token_acc": 0.84206275, "grad_norm": 0.12259582, "learning_rate": 1.85e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007549, "epoch": 0.81326913, "global_step/max_steps": "190/233", "percentage": "81.55%", "elapsed_time": "6h 59m 6s", "remaining_time": "1h 34m 51s"}
|
| 209 |
-
{"eval_loss": 0.50880826, "eval_token_acc": 0.83371443, "eval_runtime": 235.7001, "eval_samples_per_second": 1.96, "eval_steps_per_second": 0.246, "epoch": 0.81326913, "global_step/max_steps": "190/233", "percentage": "81.55%", "elapsed_time": "7h 3m 2s", "remaining_time": "1h 35m 44s"}
|
| 210 |
-
{"loss": 0.49958098, "token_acc": 0.83476374, "grad_norm": 0.12608472, "learning_rate": 1.8e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007472, "epoch": 0.81754949, "global_step/max_steps": "191/233", "percentage": "81.97%", "elapsed_time": "7h 5m 41s", "remaining_time": "1h 33m 36s"}
|
| 211 |
-
{"loss": 0.50622708, "token_acc": 0.8277248, "grad_norm": 0.10919516, "learning_rate": 1.76e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007469, "epoch": 0.82182986, "global_step/max_steps": "192/233", "percentage": "82.40%", "elapsed_time": "7h 8m 7s", "remaining_time": "1h 31m 25s"}
|
| 212 |
-
{"loss": 0.51051688, "token_acc": 0.83841894, "grad_norm": 0.14452438, "learning_rate": 1.72e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007496, "epoch": 0.82611022, "global_step/max_steps": "193/233", "percentage": "82.83%", "elapsed_time": "7h 8m 48s", "remaining_time": "1h 28m 52s"}
|
| 213 |
-
{"loss": 0.51354766, "token_acc": 0.85329018, "grad_norm": 0.11761606, "learning_rate": 1.67e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007492, "epoch": 0.83039058, "global_step/max_steps": "194/233", "percentage": "83.26%", "elapsed_time": "7h 11m 13s", "remaining_time": "1h 26m 41s"}
|
| 214 |
-
{"loss": 0.51062506, "token_acc": 0.85424893, "grad_norm": 0.14140776, "learning_rate": 1.63e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.0075, "epoch": 0.83467095, "global_step/max_steps": "195/233", "percentage": "83.69%", "elapsed_time": "7h 12m 58s", "remaining_time": "1h 24m 22s"}
|
| 215 |
-
{"loss": 0.55619252, "token_acc": 0.84431075, "grad_norm": 0.1660874, "learning_rate": 1.59e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007499, "epoch": 0.83895131, "global_step/max_steps": "196/233", "percentage": "84.12%", "elapsed_time": "7h 15m 15s", "remaining_time": "1h 22m 9s"}
|
| 216 |
-
{"loss": 0.51129085, "token_acc": 0.81936346, "grad_norm": 0.14064978, "learning_rate": 1.55e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007516, "epoch": 0.84323167, "global_step/max_steps": "197/233", "percentage": "84.55%", "elapsed_time": "7h 16m 29s", "remaining_time": "1h 19m 45s"}
|
| 217 |
-
{"loss": 0.508412, "token_acc": 0.84866787, "grad_norm": 0.11868906, "learning_rate": 1.5e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007505, "epoch": 0.84751204, "global_step/max_steps": "198/233", "percentage": "84.98%", "elapsed_time": "7h 19m 22s", "remaining_time": "1h 17m 40s"}
|
| 218 |
-
{"loss": 0.54063851, "token_acc": 0.83712304, "grad_norm": 0.12853099, "learning_rate": 1.46e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007516, "epoch": 0.8517924, "global_step/max_steps": "199/233", "percentage": "85.41%", "elapsed_time": "7h 20m 55s", "remaining_time": "1h 15m 20s"}
|
| 219 |
-
{"loss": 0.46997577, "token_acc": 0.82234024, "grad_norm": 0.12919647, "learning_rate": 1.42e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007526, "epoch": 0.85607277, "global_step/max_steps": "200/233", "percentage": "85.84%", "elapsed_time": "7h 22m 33s", "remaining_time": "1h 13m 1s"}
|
| 220 |
-
{"eval_loss": 0.50845164, "eval_token_acc": 0.83379876, "eval_runtime": 235.3989, "eval_samples_per_second": 1.963, "eval_steps_per_second": 0.246, "epoch": 0.85607277, "global_step/max_steps": "200/233", "percentage": "85.84%", "elapsed_time": "7h 26m 29s", "remaining_time": "1h 13m 40s"}
|
| 221 |
-
{"loss": 0.51254267, "token_acc": 0.8372357, "grad_norm": 0.16826271, "learning_rate": 1.37e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007463, "epoch": 0.86035313, "global_step/max_steps": "201/233", "percentage": "86.27%", "elapsed_time": "7h 28m 30s", "remaining_time": "1h 11m 24s"}
|
| 222 |
-
{"loss": 0.50134093, "token_acc": 0.83362946, "grad_norm": 0.15078701, "learning_rate": 1.33e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007483, "epoch": 0.86463349, "global_step/max_steps": "202/233", "percentage": "86.70%", "elapsed_time": "7h 29m 35s", "remaining_time": "1h 8m 59s"}
|
| 223 |
-
{"loss": 0.51513213, "token_acc": 0.82223492, "grad_norm": 0.12865345, "learning_rate": 1.29e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007484, "epoch": 0.86891386, "global_step/max_steps": "203/233", "percentage": "87.12%", "elapsed_time": "7h 31m 42s", "remaining_time": "1h 6m 45s"}
|
| 224 |
-
{"loss": 0.52011067, "token_acc": 0.83890838, "grad_norm": 0.11933945, "learning_rate": 1.24e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007478, "epoch": 0.87319422, "global_step/max_steps": "204/233", "percentage": "87.55%", "elapsed_time": "7h 34m 18s", "remaining_time": "1h 4m 34s"}
|
| 225 |
-
{"loss": 0.49174532, "token_acc": 0.83849821, "grad_norm": 0.25774974, "learning_rate": 1.2e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007473, "epoch": 0.87747459, "global_step/max_steps": "205/233", "percentage": "87.98%", "elapsed_time": "7h 36m 50s", "remaining_time": "1h 2m 23s"}
|
| 226 |
-
{"loss": 0.49890012, "token_acc": 0.82418112, "grad_norm": 0.15101479, "learning_rate": 1.16e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00749, "epoch": 0.88175495, "global_step/max_steps": "206/233", "percentage": "88.41%", "elapsed_time": "7h 38m 2s", "remaining_time": "1h 0m 2s"}
|
| 227 |
-
{"loss": 0.50326902, "token_acc": 0.81516661, "grad_norm": 0.1402124, "learning_rate": 1.12e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00749, "epoch": 0.88603531, "global_step/max_steps": "207/233", "percentage": "88.84%", "elapsed_time": "7h 40m 17s", "remaining_time": "57m 48s"}
|
| 228 |
-
{"loss": 0.5040676, "token_acc": 0.81869791, "grad_norm": 0.27301475, "learning_rate": 1.07e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007491, "epoch": 0.89031568, "global_step/max_steps": "208/233", "percentage": "89.27%", "elapsed_time": "7h 42m 24s", "remaining_time": "55m 34s"}
|
| 229 |
-
{"loss": 0.47629112, "token_acc": 0.82132275, "grad_norm": 0.17399816, "learning_rate": 1.03e-06, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007495, "epoch": 0.89459604, "global_step/max_steps": "209/233", "percentage": "89.70%", "elapsed_time": "7h 44m 24s", "remaining_time": "53m 19s"}
|
| 230 |
-
{"loss": 0.50204992, "token_acc": 0.83965597, "grad_norm": 0.13901281, "learning_rate": 9.9e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007496, "epoch": 0.8988764, "global_step/max_steps": "210/233", "percentage": "90.13%", "elapsed_time": "7h 46m 34s", "remaining_time": "51m 6s"}
|
| 231 |
-
{"eval_loss": 0.50826108, "eval_token_acc": 0.83396063, "eval_runtime": 235.9818, "eval_samples_per_second": 1.958, "eval_steps_per_second": 0.246, "epoch": 0.8988764, "global_step/max_steps": "210/233", "percentage": "90.13%", "elapsed_time": "7h 50m 30s", "remaining_time": "51m 31s"}
|
| 232 |
-
{"loss": 0.52222174, "token_acc": 0.83534812, "grad_norm": 0.13053484, "learning_rate": 9.4e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007434, "epoch": 0.90315677, "global_step/max_steps": "211/233", "percentage": "90.56%", "elapsed_time": "7h 52m 43s", "remaining_time": "49m 17s"}
|
| 233 |
-
{"loss": 0.49084908, "token_acc": 0.83773069, "grad_norm": 0.1325853, "learning_rate": 9e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007429, "epoch": 0.90743713, "global_step/max_steps": "212/233", "percentage": "90.99%", "elapsed_time": "7h 55m 17s", "remaining_time": "47m 4s"}
|
| 234 |
-
{"loss": 0.49427927, "token_acc": 0.85061342, "grad_norm": 0.12438133, "learning_rate": 8.6e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007447, "epoch": 0.9117175, "global_step/max_steps": "213/233", "percentage": "91.42%", "elapsed_time": "7h 56m 20s", "remaining_time": "44m 43s"}
|
| 235 |
-
{"loss": 0.50391412, "token_acc": 0.85781355, "grad_norm": 0.11095317, "learning_rate": 8.2e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007444, "epoch": 0.91599786, "global_step/max_steps": "214/233", "percentage": "91.85%", "elapsed_time": "7h 58m 47s", "remaining_time": "42m 30s"}
|
| 236 |
-
{"loss": 0.50294411, "token_acc": 0.83485309, "grad_norm": 0.1230354, "learning_rate": 7.7e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007453, "epoch": 0.92027822, "global_step/max_steps": "215/233", "percentage": "92.27%", "elapsed_time": "8h 0m 25s", "remaining_time": "40m 13s"}
|
| 237 |
-
{"loss": 0.4798848, "token_acc": 0.8473504, "grad_norm": 0.17395706, "learning_rate": 7.3e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00745, "epoch": 0.92455859, "global_step/max_steps": "216/233", "percentage": "92.70%", "elapsed_time": "8h 2m 53s", "remaining_time": "38m 0s"}
|
| 238 |
-
{"loss": 0.51420152, "token_acc": 0.82058785, "grad_norm": 0.13729912, "learning_rate": 6.9e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007451, "epoch": 0.92883895, "global_step/max_steps": "217/233", "percentage": "93.13%", "elapsed_time": "8h 5m 3s", "remaining_time": "35m 45s"}
|
| 239 |
-
{"loss": 0.51177865, "token_acc": 0.81501807, "grad_norm": 0.10358699, "learning_rate": 6.4e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007453, "epoch": 0.93311932, "global_step/max_steps": "218/233", "percentage": "93.56%", "elapsed_time": "8h 7m 8s", "remaining_time": "33m 31s"}
|
| 240 |
-
{"loss": 0.52010918, "token_acc": 0.8401833, "grad_norm": 0.16485111, "learning_rate": 6e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007455, "epoch": 0.93739968, "global_step/max_steps": "219/233", "percentage": "93.99%", "elapsed_time": "8h 9m 13s", "remaining_time": "31m 16s"}
|
| 241 |
-
{"loss": 0.5114243, "token_acc": 0.84615385, "grad_norm": 0.1616209, "learning_rate": 5.6e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007451, "epoch": 0.94168004, "global_step/max_steps": "220/233", "percentage": "94.42%", "elapsed_time": "8h 11m 44s", "remaining_time": "29m 3s"}
|
| 242 |
-
{"eval_loss": 0.50814027, "eval_token_acc": 0.83419323, "eval_runtime": 236.0063, "eval_samples_per_second": 1.958, "eval_steps_per_second": 0.246, "epoch": 0.94168004, "global_step/max_steps": "220/233", "percentage": "94.42%", "elapsed_time": "8h 15m 40s", "remaining_time": "29m 17s"}
|
| 243 |
-
{"loss": 0.48112383, "token_acc": 0.83936344, "grad_norm": 0.12433523, "learning_rate": 5.2e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007383, "epoch": 0.94596041, "global_step/max_steps": "221/233", "percentage": "94.85%", "elapsed_time": "8h 18m 32s", "remaining_time": "27m 4s"}
|
| 244 |
-
{"loss": 0.50242162, "token_acc": 0.84531625, "grad_norm": 0.23049626, "learning_rate": 4.7e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007407, "epoch": 0.95024077, "global_step/max_steps": "222/233", "percentage": "95.28%", "elapsed_time": "8h 19m 12s", "remaining_time": "24m 44s"}
|
| 245 |
-
{"loss": 0.51799721, "token_acc": 0.83938443, "grad_norm": 0.13270883, "learning_rate": 4.3e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007419, "epoch": 0.95452113, "global_step/max_steps": "223/233", "percentage": "95.71%", "elapsed_time": "8h 20m 37s", "remaining_time": "22m 26s"}
|
| 246 |
-
{"loss": 0.518457, "token_acc": 0.84743252, "grad_norm": 0.16565216, "learning_rate": 3.9e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007428, "epoch": 0.9588015, "global_step/max_steps": "224/233", "percentage": "96.14%", "elapsed_time": "8h 22m 15s", "remaining_time": "20m 10s"}
|
| 247 |
-
{"loss": 0.51062959, "token_acc": 0.83592096, "grad_norm": 0.12720895, "learning_rate": 3.4e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007436, "epoch": 0.96308186, "global_step/max_steps": "225/233", "percentage": "96.57%", "elapsed_time": "8h 23m 59s", "remaining_time": "17m 55s"}
|
| 248 |
-
{"loss": 0.50811839, "token_acc": 0.84152466, "grad_norm": 0.16164465, "learning_rate": 3e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007445, "epoch": 0.96736223, "global_step/max_steps": "226/233", "percentage": "97.00%", "elapsed_time": "8h 25m 33s", "remaining_time": "15m 39s"}
|
| 249 |
-
{"loss": 0.51978874, "token_acc": 0.83158328, "grad_norm": 0.13573085, "learning_rate": 2.6e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007447, "epoch": 0.97164259, "global_step/max_steps": "227/233", "percentage": "97.42%", "elapsed_time": "8h 27m 40s", "remaining_time": "13m 25s"}
|
| 250 |
-
{"loss": 0.49247271, "token_acc": 0.84161592, "grad_norm": 0.12217645, "learning_rate": 2.1e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007453, "epoch": 0.97592295, "global_step/max_steps": "228/233", "percentage": "97.85%", "elapsed_time": "8h 29m 30s", "remaining_time": "11m 10s"}
|
| 251 |
-
{"loss": 0.50794172, "token_acc": 0.84790565, "grad_norm": 0.14334756, "learning_rate": 1.7e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007469, "epoch": 0.98020332, "global_step/max_steps": "229/233", "percentage": "98.28%", "elapsed_time": "8h 30m 40s", "remaining_time": "8m 55s"}
|
| 252 |
-
{"loss": 0.52748251, "token_acc": 0.82375963, "grad_norm": 0.13245644, "learning_rate": 1.3e-07, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.00748, "epoch": 0.98448368, "global_step/max_steps": "230/233", "percentage": "98.71%", "elapsed_time": "8h 32m 10s", "remaining_time": "6m 40s"}
|
| 253 |
-
{"eval_loss": 0.50761044, "eval_token_acc": 0.83408849, "eval_runtime": 236.4465, "eval_samples_per_second": 1.954, "eval_steps_per_second": 0.245, "epoch": 0.98448368, "global_step/max_steps": "230/233", "percentage": "98.71%", "elapsed_time": "8h 36m 6s", "remaining_time": "6m 43s"}
|
| 254 |
-
{"loss": 0.52742505, "token_acc": 0.83657238, "grad_norm": 0.13020933, "learning_rate": 9e-08, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007407, "epoch": 0.98876404, "global_step/max_steps": "231/233", "percentage": "99.14%", "elapsed_time": "8h 39m 25s", "remaining_time": "4m 29s"}
|
| 255 |
-
{"loss": 0.48712298, "token_acc": 0.82561059, "grad_norm": 0.13148691, "learning_rate": 4e-08, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007419, "epoch": 0.99304441, "global_step/max_steps": "232/233", "percentage": "99.57%", "elapsed_time": "8h 40m 50s", "remaining_time": "2m 14s"}
|
| 256 |
-
{"loss": 0.50508392, "token_acc": 0.85302594, "grad_norm": 0.12396082, "learning_rate": 0.0, "memory(GiB)": 133.2, "train_speed(iter/s)": 0.007426, "epoch": 0.99732477, "global_step/max_steps": "233/233", "percentage": "100.00%", "elapsed_time": "8h 42m 33s", "remaining_time": "0s"}
|
|
|
|
| 1 |
+
{"loss": 1.05211091, "token_acc": 0.72991851, "grad_norm": 1.3166697, "learning_rate": 9.96e-06, "memory(GiB)": 48.3, "train_speed(iter/s)": 0.006625, "epoch": 0.00428036, "global_step/max_steps": "1/233", "percentage": "0.43%", "elapsed_time": "2m 4s", "remaining_time": "7h 59m 51s"}
|
| 2 |
+
{"loss": 0.76383126, "token_acc": 0.79082383, "grad_norm": 0.3090775, "learning_rate": 9.91e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.006892, "epoch": 0.00856073, "global_step/max_steps": "2/233", "percentage": "0.86%", "elapsed_time": "4m 23s", "remaining_time": "8h 26m 54s"}
|
| 3 |
+
{"loss": 0.68559116, "token_acc": 0.79723046, "grad_norm": 0.59014881, "learning_rate": 9.87e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007851, "epoch": 0.01284109, "global_step/max_steps": "3/233", "percentage": "1.29%", "elapsed_time": "5m 55s", "remaining_time": "7h 33m 57s"}
|
| 4 |
+
{"loss": 0.65103149, "token_acc": 0.81592101, "grad_norm": 0.3545883, "learning_rate": 9.83e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008732, "epoch": 0.01712146, "global_step/max_steps": "4/233", "percentage": "1.72%", "elapsed_time": "7m 11s", "remaining_time": "6h 51m 27s"}
|
| 5 |
+
{"loss": 0.63546801, "token_acc": 0.81665001, "grad_norm": 0.21685971, "learning_rate": 9.79e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.009836, "epoch": 0.02140182, "global_step/max_steps": "5/233", "percentage": "2.15%", "elapsed_time": "8m 1s", "remaining_time": "6h 5m 56s"}
|
| 6 |
+
{"loss": 0.62821972, "token_acc": 0.83078308, "grad_norm": 0.15905039, "learning_rate": 9.74e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.010504, "epoch": 0.02568218, "global_step/max_steps": "6/233", "percentage": "2.58%", "elapsed_time": "9m 4s", "remaining_time": "5h 43m 14s"}
|
| 7 |
+
{"loss": 0.57419938, "token_acc": 0.83345468, "grad_norm": 0.26453313, "learning_rate": 9.7e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.010586, "epoch": 0.02996255, "global_step/max_steps": "7/233", "percentage": "3.00%", "elapsed_time": "10m 34s", "remaining_time": "5h 41m 21s"}
|
| 8 |
+
{"loss": 0.61988533, "token_acc": 0.80090435, "grad_norm": 0.13725959, "learning_rate": 9.66e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.010552, "epoch": 0.03424291, "global_step/max_steps": "8/233", "percentage": "3.43%", "elapsed_time": "12m 11s", "remaining_time": "5h 42m 47s"}
|
| 9 |
+
{"loss": 0.58300537, "token_acc": 0.81416453, "grad_norm": 0.19543347, "learning_rate": 9.61e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.01057, "epoch": 0.03852327, "global_step/max_steps": "9/233", "percentage": "3.86%", "elapsed_time": "13m 44s", "remaining_time": "5h 42m 3s"}
|
| 10 |
+
{"loss": 0.58969021, "token_acc": 0.82695811, "grad_norm": 0.23631856, "learning_rate": 9.57e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.0102, "epoch": 0.04280364, "global_step/max_steps": "10/233", "percentage": "4.29%", "elapsed_time": "15m 53s", "remaining_time": "5h 54m 24s"}
|
| 11 |
+
{"eval_loss": 0.59994543, "eval_token_acc": 0.81003124, "eval_runtime": 233.3714, "eval_samples_per_second": 1.98, "eval_steps_per_second": 0.249, "epoch": 0.04280364, "global_step/max_steps": "10/233", "percentage": "4.29%", "elapsed_time": "19m 46s", "remaining_time": "7h 21m 8s"}
|
| 12 |
+
{"loss": 0.64508778, "token_acc": 0.81136651, "grad_norm": 0.15494038, "learning_rate": 9.53e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007619, "epoch": 0.047084, "global_step/max_steps": "11/233", "percentage": "4.72%", "elapsed_time": "23m 36s", "remaining_time": "7h 56m 37s"}
|
| 13 |
+
{"loss": 0.58474982, "token_acc": 0.81496302, "grad_norm": 0.15614207, "learning_rate": 9.48e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007909, "epoch": 0.05136437, "global_step/max_steps": "12/233", "percentage": "5.15%", "elapsed_time": "24m 50s", "remaining_time": "7h 37m 29s"}
|
| 14 |
+
{"loss": 0.62716937, "token_acc": 0.82339642, "grad_norm": 0.16823439, "learning_rate": 9.44e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008011, "epoch": 0.05564473, "global_step/max_steps": "13/233", "percentage": "5.58%", "elapsed_time": "26m 35s", "remaining_time": "7h 30m 7s"}
|
| 15 |
+
{"loss": 0.59660435, "token_acc": 0.84487376, "grad_norm": 0.1322335, "learning_rate": 9.4e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007891, "epoch": 0.05992509, "global_step/max_steps": "14/233", "percentage": "6.01%", "elapsed_time": "29m 7s", "remaining_time": "7h 35m 34s"}
|
| 16 |
+
{"loss": 0.56707138, "token_acc": 0.85295479, "grad_norm": 0.14563887, "learning_rate": 9.36e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008169, "epoch": 0.06420546, "global_step/max_steps": "15/233", "percentage": "6.44%", "elapsed_time": "30m 9s", "remaining_time": "7h 18m 15s"}
|
| 17 |
+
{"loss": 0.57920754, "token_acc": 0.83316378, "grad_norm": 0.14242059, "learning_rate": 9.31e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008288, "epoch": 0.06848582, "global_step/max_steps": "16/233", "percentage": "6.87%", "elapsed_time": "31m 43s", "remaining_time": "7h 10m 18s"}
|
| 18 |
+
{"loss": 0.60998094, "token_acc": 0.84979702, "grad_norm": 0.15288945, "learning_rate": 9.27e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008223, "epoch": 0.07276619, "global_step/max_steps": "17/233", "percentage": "7.30%", "elapsed_time": "34m 0s", "remaining_time": "7h 12m 5s"}
|
| 19 |
+
{"loss": 0.55885768, "token_acc": 0.79892252, "grad_norm": 0.18015395, "learning_rate": 9.23e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.00823, "epoch": 0.07704655, "global_step/max_steps": "18/233", "percentage": "7.73%", "elapsed_time": "36m 0s", "remaining_time": "7h 10m 2s"}
|
| 20 |
+
{"loss": 0.57408869, "token_acc": 0.82806532, "grad_norm": 0.16707274, "learning_rate": 9.18e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008308, "epoch": 0.08132691, "global_step/max_steps": "19/233", "percentage": "8.15%", "elapsed_time": "37m 40s", "remaining_time": "7h 4m 16s"}
|
| 21 |
+
{"loss": 0.63770878, "token_acc": 0.81977924, "grad_norm": 0.15583961, "learning_rate": 9.14e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007956, "epoch": 0.08560728, "global_step/max_steps": "20/233", "percentage": "8.58%", "elapsed_time": "41m 27s", "remaining_time": "7h 21m 27s"}
|
| 22 |
+
{"eval_loss": 0.57062572, "eval_token_acc": 0.8170977, "eval_runtime": 232.3774, "eval_samples_per_second": 1.988, "eval_steps_per_second": 0.25, "epoch": 0.08560728, "global_step/max_steps": "20/233", "percentage": "8.58%", "elapsed_time": "45m 19s", "remaining_time": "8h 2m 42s"}
|
| 23 |
+
{"loss": 0.6081934, "token_acc": 0.82475915, "grad_norm": 0.13550253, "learning_rate": 9.1e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007128, "epoch": 0.08988764, "global_step/max_steps": "21/233", "percentage": "9.01%", "elapsed_time": "48m 39s", "remaining_time": "8h 11m 10s"}
|
| 24 |
+
{"loss": 0.53626227, "token_acc": 0.81810307, "grad_norm": 0.12228937, "learning_rate": 9.06e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007297, "epoch": 0.094168, "global_step/max_steps": "22/233", "percentage": "9.44%", "elapsed_time": "49m 48s", "remaining_time": "7h 57m 38s"}
|
| 25 |
+
{"loss": 0.58173132, "token_acc": 0.81788485, "grad_norm": 0.13067725, "learning_rate": 9.01e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.00744, "epoch": 0.09844837, "global_step/max_steps": "23/233", "percentage": "9.87%", "elapsed_time": "51m 4s", "remaining_time": "7h 46m 22s"}
|
| 26 |
+
{"loss": 0.59200102, "token_acc": 0.82601231, "grad_norm": 0.13315721, "learning_rate": 8.97e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.00751, "epoch": 0.10272873, "global_step/max_steps": "24/233", "percentage": "10.30%", "elapsed_time": "52m 48s", "remaining_time": "7h 39m 56s"}
|
| 27 |
+
{"loss": 0.53160775, "token_acc": 0.84765362, "grad_norm": 0.13677236, "learning_rate": 8.93e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.0076, "epoch": 0.1070091, "global_step/max_steps": "25/233", "percentage": "10.73%", "elapsed_time": "54m 22s", "remaining_time": "7h 32m 26s"}
|
| 28 |
+
{"loss": 0.55127698, "token_acc": 0.85220042, "grad_norm": 0.20182617, "learning_rate": 8.88e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007722, "epoch": 0.11128946, "global_step/max_steps": "26/233", "percentage": "11.16%", "elapsed_time": "55m 40s", "remaining_time": "7h 23m 12s"}
|
| 29 |
+
{"loss": 0.57834888, "token_acc": 0.82124202, "grad_norm": 0.43602487, "learning_rate": 8.84e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007816, "epoch": 0.11556982, "global_step/max_steps": "27/233", "percentage": "11.59%", "elapsed_time": "57m 7s", "remaining_time": "7h 15m 50s"}
|
| 30 |
+
{"loss": 0.58652842, "token_acc": 0.8147252, "grad_norm": 0.2080746, "learning_rate": 8.8e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007916, "epoch": 0.11985019, "global_step/max_steps": "28/233", "percentage": "12.02%", "elapsed_time": "58m 30s", "remaining_time": "7h 8m 19s"}
|
| 31 |
+
{"loss": 0.57351971, "token_acc": 0.83113969, "grad_norm": 0.11781906, "learning_rate": 8.76e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007964, "epoch": 0.12413055, "global_step/max_steps": "29/233", "percentage": "12.45%", "elapsed_time": "1h 0m 14s", "remaining_time": "7h 3m 46s"}
|
| 32 |
+
{"loss": 0.57332683, "token_acc": 0.82596983, "grad_norm": 0.14135404, "learning_rate": 8.71e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.008097, "epoch": 0.12841091, "global_step/max_steps": "30/233", "percentage": "12.88%", "elapsed_time": "1h 1m 18s", "remaining_time": "6h 54m 48s"}
|
| 33 |
+
{"eval_loss": 0.56049991, "eval_token_acc": 0.82029019, "eval_runtime": 232.6365, "eval_samples_per_second": 1.986, "eval_steps_per_second": 0.249, "epoch": 0.12841091, "global_step/max_steps": "30/233", "percentage": "12.88%", "elapsed_time": "1h 5m 10s", "remaining_time": "7h 21m 2s"}
|
| 34 |
+
{"loss": 0.59806776, "token_acc": 0.8235928, "grad_norm": 0.25007367, "learning_rate": 8.67e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007572, "epoch": 0.13269128, "global_step/max_steps": "31/233", "percentage": "13.30%", "elapsed_time": "1h 7m 46s", "remaining_time": "7h 21m 40s"}
|
| 35 |
+
{"loss": 0.56375688, "token_acc": 0.82393812, "grad_norm": 0.12149891, "learning_rate": 8.63e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007518, "epoch": 0.13697164, "global_step/max_steps": "32/233", "percentage": "13.73%", "elapsed_time": "1h 10m 29s", "remaining_time": "7h 22m 45s"}
|
| 36 |
+
{"loss": 0.56226754, "token_acc": 0.82827688, "grad_norm": 0.12714922, "learning_rate": 8.58e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007548, "epoch": 0.14125201, "global_step/max_steps": "33/233", "percentage": "14.16%", "elapsed_time": "1h 12m 25s", "remaining_time": "7h 18m 54s"}
|
| 37 |
+
{"loss": 0.53598464, "token_acc": 0.82137048, "grad_norm": 0.18553115, "learning_rate": 8.54e-06, "memory(GiB)": 127.66, "train_speed(iter/s)": 0.007631, "epoch": 0.14553237, "global_step/max_steps": "34/233", "percentage": "14.59%", "elapsed_time": "1h 13m 48s", "remaining_time": "7h 11m 59s"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
special_tokens_map.json
CHANGED
|
@@ -1,23 +1,17 @@
|
|
| 1 |
{
|
| 2 |
"bos_token": {
|
| 3 |
-
"content": "<
|
| 4 |
"lstrip": false,
|
| 5 |
"normalized": false,
|
| 6 |
"rstrip": false,
|
| 7 |
"single_word": false
|
| 8 |
},
|
| 9 |
"eos_token": {
|
| 10 |
-
"content": "<
|
| 11 |
"lstrip": false,
|
| 12 |
"normalized": false,
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
| 16 |
-
"pad_token":
|
| 17 |
-
"content": "<|end▁of▁sentence|>",
|
| 18 |
-
"lstrip": false,
|
| 19 |
-
"normalized": false,
|
| 20 |
-
"rstrip": false,
|
| 21 |
-
"single_word": false
|
| 22 |
-
}
|
| 23 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"bos_token": {
|
| 3 |
+
"content": "<|begin_of_text|>",
|
| 4 |
"lstrip": false,
|
| 5 |
"normalized": false,
|
| 6 |
"rstrip": false,
|
| 7 |
"single_word": false
|
| 8 |
},
|
| 9 |
"eos_token": {
|
| 10 |
+
"content": "<|eot_id|>",
|
| 11 |
"lstrip": false,
|
| 12 |
"normalized": false,
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
| 16 |
+
"pad_token": "<|eot_id|>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
}
|
tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
|
| 3 |
+
size 17209920
|
tokenizer_config.json
CHANGED
|
@@ -1,10 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"add_bos_token": true,
|
| 3 |
-
"add_eos_token": false,
|
| 4 |
-
"add_prefix_space": null,
|
| 5 |
"added_tokens_decoder": {
|
| 6 |
"128000": {
|
| 7 |
-
"content": "<
|
| 8 |
"lstrip": false,
|
| 9 |
"normalized": false,
|
| 10 |
"rstrip": false,
|
|
@@ -12,7 +9,7 @@
|
|
| 12 |
"special": true
|
| 13 |
},
|
| 14 |
"128001": {
|
| 15 |
-
"content": "<
|
| 16 |
"lstrip": false,
|
| 17 |
"normalized": false,
|
| 18 |
"rstrip": false,
|
|
@@ -92,39 +89,39 @@
|
|
| 92 |
"special": true
|
| 93 |
},
|
| 94 |
"128011": {
|
| 95 |
-
"content": "<
|
| 96 |
"lstrip": false,
|
| 97 |
"normalized": false,
|
| 98 |
"rstrip": false,
|
| 99 |
"single_word": false,
|
| 100 |
-
"special":
|
| 101 |
},
|
| 102 |
"128012": {
|
| 103 |
-
"content": "<
|
| 104 |
"lstrip": false,
|
| 105 |
"normalized": false,
|
| 106 |
"rstrip": false,
|
| 107 |
"single_word": false,
|
| 108 |
-
"special":
|
| 109 |
},
|
| 110 |
"128013": {
|
| 111 |
-
"content": "<
|
| 112 |
"lstrip": false,
|
| 113 |
"normalized": false,
|
| 114 |
"rstrip": false,
|
| 115 |
"single_word": false,
|
| 116 |
-
"special":
|
| 117 |
},
|
| 118 |
"128014": {
|
| 119 |
-
"content": "<
|
| 120 |
"lstrip": false,
|
| 121 |
"normalized": false,
|
| 122 |
"rstrip": false,
|
| 123 |
"single_word": false,
|
| 124 |
-
"special":
|
| 125 |
},
|
| 126 |
"128015": {
|
| 127 |
-
"content": "<
|
| 128 |
"lstrip": false,
|
| 129 |
"normalized": false,
|
| 130 |
"rstrip": false,
|
|
@@ -2052,16 +2049,16 @@
|
|
| 2052 |
"special": true
|
| 2053 |
}
|
| 2054 |
},
|
| 2055 |
-
"bos_token": "<
|
| 2056 |
-
"chat_template": "{% if not
|
| 2057 |
-
"clean_up_tokenization_spaces":
|
| 2058 |
-
"eos_token": "<
|
| 2059 |
"extra_special_tokens": {},
|
| 2060 |
-
"
|
| 2061 |
-
|
| 2062 |
-
|
| 2063 |
-
|
| 2064 |
-
"
|
| 2065 |
-
"
|
| 2066 |
-
"
|
| 2067 |
}
|
|
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
| 2 |
"added_tokens_decoder": {
|
| 3 |
"128000": {
|
| 4 |
+
"content": "<|begin_of_text|>",
|
| 5 |
"lstrip": false,
|
| 6 |
"normalized": false,
|
| 7 |
"rstrip": false,
|
|
|
|
| 9 |
"special": true
|
| 10 |
},
|
| 11 |
"128001": {
|
| 12 |
+
"content": "<|end_of_text|>",
|
| 13 |
"lstrip": false,
|
| 14 |
"normalized": false,
|
| 15 |
"rstrip": false,
|
|
|
|
| 89 |
"special": true
|
| 90 |
},
|
| 91 |
"128011": {
|
| 92 |
+
"content": "<|reserved_special_token_3|>",
|
| 93 |
"lstrip": false,
|
| 94 |
"normalized": false,
|
| 95 |
"rstrip": false,
|
| 96 |
"single_word": false,
|
| 97 |
+
"special": true
|
| 98 |
},
|
| 99 |
"128012": {
|
| 100 |
+
"content": "<|reserved_special_token_4|>",
|
| 101 |
"lstrip": false,
|
| 102 |
"normalized": false,
|
| 103 |
"rstrip": false,
|
| 104 |
"single_word": false,
|
| 105 |
+
"special": true
|
| 106 |
},
|
| 107 |
"128013": {
|
| 108 |
+
"content": "<|reserved_special_token_5|>",
|
| 109 |
"lstrip": false,
|
| 110 |
"normalized": false,
|
| 111 |
"rstrip": false,
|
| 112 |
"single_word": false,
|
| 113 |
+
"special": true
|
| 114 |
},
|
| 115 |
"128014": {
|
| 116 |
+
"content": "<|reserved_special_token_6|>",
|
| 117 |
"lstrip": false,
|
| 118 |
"normalized": false,
|
| 119 |
"rstrip": false,
|
| 120 |
"single_word": false,
|
| 121 |
+
"special": true
|
| 122 |
},
|
| 123 |
"128015": {
|
| 124 |
+
"content": "<|reserved_special_token_7|>",
|
| 125 |
"lstrip": false,
|
| 126 |
"normalized": false,
|
| 127 |
"rstrip": false,
|
|
|
|
| 2049 |
"special": true
|
| 2050 |
}
|
| 2051 |
},
|
| 2052 |
+
"bos_token": "<|begin_of_text|>",
|
| 2053 |
+
"chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
|
| 2054 |
+
"clean_up_tokenization_spaces": true,
|
| 2055 |
+
"eos_token": "<|eot_id|>",
|
| 2056 |
"extra_special_tokens": {},
|
| 2057 |
+
"model_input_names": [
|
| 2058 |
+
"input_ids",
|
| 2059 |
+
"attention_mask"
|
| 2060 |
+
],
|
| 2061 |
+
"model_max_length": 131072,
|
| 2062 |
+
"pad_token": "<|eot_id|>",
|
| 2063 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
| 2064 |
}
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 8184
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7552206885bc82600583417a2960985b56f483e5e7b34bbb31b1c0f283fd7fb
|
| 3 |
size 8184
|