minhnam commited on
Commit
8ac1129
·
verified ·
1 Parent(s): 48122d7

Training in progress, step 20

Browse files
adapter_config.json CHANGED
@@ -28,7 +28,7 @@
28
  "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
- "target_modules": "^(model.language_model.*\\.(gate_proj|k_proj|o_proj|v_proj|up_proj|down_proj|q_proj))$",
32
  "target_parameters": null,
33
  "task_type": "CAUSAL_LM",
34
  "trainable_token_indices": null,
 
28
  "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
+ "target_modules": "^(model.language_model.*\\.(o_proj|gate_proj|v_proj|up_proj|down_proj|q_proj|k_proj))$",
32
  "target_parameters": null,
33
  "task_type": "CAUSAL_LM",
34
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b68405c13c36d3dadd1b34b3c6f10522b58473fec299474890e4fd40b1006d2
3
  size 87368144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54c0f0f0d8fd2cf688985df1e8ddc39fc1d4a2ff0d5860a078863e952fd2f230
3
  size 87368144
args.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "output_dir": "/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v1-20260307-233613",
3
  "overwrite_output_dir": false,
4
  "do_train": false,
5
  "do_eval": false,
@@ -29,7 +29,7 @@
29
  "log_level": "passive",
30
  "log_level_replica": "warning",
31
  "log_on_each_node": true,
32
- "logging_dir": "/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v1-20260307-233613/runs",
33
  "logging_strategy": "steps",
34
  "logging_first_step": true,
35
  "logging_steps": 10,
@@ -64,7 +64,7 @@
64
  "dataloader_num_workers": 8,
65
  "dataloader_prefetch_factor": null,
66
  "past_index": -1,
67
- "run_name": "/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v1-20260307-233613",
68
  "disable_tqdm": null,
69
  "remove_unused_columns": true,
70
  "label_names": null,
@@ -372,5 +372,5 @@
372
  ],
373
  "hub": "<class 'swift.hub.hub.HFHub'>",
374
  "evaluation_strategy": "steps",
375
- "training_args": "Seq2SeqTrainingArguments(output_dir='/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v1-20260307-233613', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v1-20260307-233613/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=8, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v1-20260307-233613', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard', 'wandb'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id='minhnam/8b_improve', hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=True, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='lora', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)"
376
  }
 
1
  {
2
+ "output_dir": "/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v2-20260308-015654",
3
  "overwrite_output_dir": false,
4
  "do_train": false,
5
  "do_eval": false,
 
29
  "log_level": "passive",
30
  "log_level_replica": "warning",
31
  "log_on_each_node": true,
32
+ "logging_dir": "/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v2-20260308-015654/runs",
33
  "logging_strategy": "steps",
34
  "logging_first_step": true,
35
  "logging_steps": 10,
 
64
  "dataloader_num_workers": 8,
65
  "dataloader_prefetch_factor": null,
66
  "past_index": -1,
67
+ "run_name": "/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v2-20260308-015654",
68
  "disable_tqdm": null,
69
  "remove_unused_columns": true,
70
  "label_names": null,
 
372
  ],
373
  "hub": "<class 'swift.hub.hub.HFHub'>",
374
  "evaluation_strategy": "steps",
375
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v2-20260308-015654', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v2-20260308-015654/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=8, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/tran555/workspace/MPS-for-LG/outputs/8b_improve/v2-20260308-015654', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard', 'wandb'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id='minhnam/8b_improve', hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=True, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='lora', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)"
376
  }
logging.jsonl CHANGED
@@ -1,35 +1,4 @@
1
- {"eval_loss": 4.60694456, "eval_runtime": 133.9528, "eval_samples_per_second": 2.023, "eval_steps_per_second": 0.508, "eval_token_acc": 0.73034986, "epoch": 0, "global_step/max_steps": "0/981", "percentage": "0.00%", "elapsed_time": "2m 13s", "memory(GiB)": 22.08, "train_speed(iter/s)": 0.0}
2
- {"loss": 1.16878664, "grad_norm": 0.5406512, "learning_rate": 2e-06, "token_acc": 0.72249161, "epoch": 0.00306396, "global_step/max_steps": "1/981", "percentage": "0.10%", "elapsed_time": "2m 38s", "remaining_time": "1d 19h 1m 26s", "memory(GiB)": 26.53, "train_speed(iter/s)": 0.006327}
3
- {"loss": 1.1736908, "grad_norm": 0.58365995, "learning_rate": 2e-05, "token_acc": 0.72564066, "epoch": 0.0306396, "global_step/max_steps": "10/981", "percentage": "1.02%", "elapsed_time": "6m 14s", "remaining_time": "10h 6m 12s", "memory(GiB)": 30.81, "train_speed(iter/s)": 0.026696}
4
- {"loss": 1.10928001, "grad_norm": 0.30432767, "learning_rate": 4e-05, "token_acc": 0.72841056, "epoch": 0.0612792, "global_step/max_steps": "20/981", "percentage": "2.04%", "elapsed_time": "10m 13s", "remaining_time": "8h 11m 1s", "memory(GiB)": 30.83, "train_speed(iter/s)": 0.032619}
5
- {"loss": 1.01687355, "grad_norm": 0.32448515, "learning_rate": 6e-05, "token_acc": 0.73686372, "epoch": 0.09191881, "global_step/max_steps": "30/981", "percentage": "3.06%", "elapsed_time": "14m 18s", "remaining_time": "7h 33m 48s", "memory(GiB)": 31.14, "train_speed(iter/s)": 0.034927}
6
- {"loss": 0.96155195, "grad_norm": 0.2220816, "learning_rate": 8e-05, "token_acc": 0.74544827, "epoch": 0.12255841, "global_step/max_steps": "40/981", "percentage": "4.08%", "elapsed_time": "18m 18s", "remaining_time": "7h 10m 35s", "memory(GiB)": 31.14, "train_speed(iter/s)": 0.036422}
7
- {"loss": 0.90479593, "grad_norm": 0.20830688, "learning_rate": 0.0001, "token_acc": 0.75520697, "epoch": 0.15319801, "global_step/max_steps": "50/981", "percentage": "5.10%", "elapsed_time": "22m 13s", "remaining_time": "6h 53m 54s", "memory(GiB)": 31.14, "train_speed(iter/s)": 0.037488}
8
- {"eval_loss": 3.45873928, "eval_runtime": 127.2661, "eval_samples_per_second": 2.129, "eval_steps_per_second": 0.534, "eval_token_acc": 0.76083446, "epoch": 0.15319801, "global_step/max_steps": "50/981", "percentage": "5.10%", "elapsed_time": "24m 21s", "remaining_time": "7h 33m 24s", "memory(GiB)": 31.14, "train_speed(iter/s)": 0.034222}
9
- {"loss": 0.86441708, "grad_norm": 0.22690073, "learning_rate": 9.997e-05, "token_acc": 0.76218481, "epoch": 0.18383761, "global_step/max_steps": "60/981", "percentage": "6.12%", "elapsed_time": "28m 13s", "remaining_time": "7h 13m 8s", "memory(GiB)": 31.43, "train_speed(iter/s)": 0.035439}
10
- {"loss": 0.84347658, "grad_norm": 0.21746078, "learning_rate": 9.989e-05, "token_acc": 0.76587901, "epoch": 0.21447721, "global_step/max_steps": "70/981", "percentage": "7.14%", "elapsed_time": "32m 11s", "remaining_time": "6h 58m 57s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.036241}
11
- {"loss": 0.82026901, "grad_norm": 0.25000939, "learning_rate": 9.974e-05, "token_acc": 0.76941109, "epoch": 0.24511681, "global_step/max_steps": "80/981", "percentage": "8.15%", "elapsed_time": "36m 5s", "remaining_time": "6h 46m 31s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.036939}
12
- {"loss": 0.8162899, "grad_norm": 0.27177727, "learning_rate": 9.955e-05, "token_acc": 0.76922509, "epoch": 0.27575642, "global_step/max_steps": "90/981", "percentage": "9.17%", "elapsed_time": "40m 1s", "remaining_time": "6h 36m 16s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.037475}
13
- {"loss": 0.79723577, "grad_norm": 0.27377754, "learning_rate": 9.929e-05, "token_acc": 0.77333168, "epoch": 0.30639602, "global_step/max_steps": "100/981", "percentage": "10.19%", "elapsed_time": "43m 58s", "remaining_time": "6h 27m 25s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.0379}
14
- {"eval_loss": 3.05908155, "eval_runtime": 127.3528, "eval_samples_per_second": 2.128, "eval_steps_per_second": 0.534, "eval_token_acc": 0.78018345, "epoch": 0.30639602, "global_step/max_steps": "100/981", "percentage": "10.19%", "elapsed_time": "46m 5s", "remaining_time": "6h 46m 7s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.036155}
15
- {"loss": 0.77615604, "grad_norm": 0.28501898, "learning_rate": 9.898e-05, "token_acc": 0.77767518, "epoch": 0.33703562, "global_step/max_steps": "110/981", "percentage": "11.21%", "elapsed_time": "50m 3s", "remaining_time": "6h 36m 19s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.036627}
16
- {"loss": 0.75500541, "grad_norm": 0.31821913, "learning_rate": 9.861e-05, "token_acc": 0.78205977, "epoch": 0.36767522, "global_step/max_steps": "120/981", "percentage": "12.23%", "elapsed_time": "54m 6s", "remaining_time": "6h 28m 12s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.036965}
17
- {"loss": 0.75957093, "grad_norm": 0.30940121, "learning_rate": 9.819e-05, "token_acc": 0.78205525, "epoch": 0.39831482, "global_step/max_steps": "130/981", "percentage": "13.25%", "elapsed_time": "57m 57s", "remaining_time": "6h 19m 26s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.03738}
18
- {"loss": 0.75672827, "grad_norm": 0.30645499, "learning_rate": 9.771e-05, "token_acc": 0.78172702, "epoch": 0.42895442, "global_step/max_steps": "140/981", "percentage": "14.27%", "elapsed_time": "1h 1m 55s", "remaining_time": "6h 11m 59s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.03768}
19
- {"loss": 0.76083798, "grad_norm": 0.32120869, "learning_rate": 9.718e-05, "token_acc": 0.77918948, "epoch": 0.45959403, "global_step/max_steps": "150/981", "percentage": "15.29%", "elapsed_time": "1h 5m 51s", "remaining_time": "6h 4m 53s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.037956}
20
- {"eval_loss": 2.91496015, "eval_runtime": 127.6281, "eval_samples_per_second": 2.123, "eval_steps_per_second": 0.533, "eval_token_acc": 0.78703836, "epoch": 0.45959403, "global_step/max_steps": "150/981", "percentage": "15.29%", "elapsed_time": "1h 7m 59s", "remaining_time": "6h 16m 40s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.036768}
21
- {"loss": 0.74864783, "grad_norm": 0.32117596, "learning_rate": 9.659e-05, "token_acc": 0.7828482, "epoch": 0.49023363, "global_step/max_steps": "160/981", "percentage": "16.31%", "elapsed_time": "1h 12m 0s", "remaining_time": "6h 9m 29s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.037033}
22
- {"loss": 0.75316892, "grad_norm": 0.43682739, "learning_rate": 9.596e-05, "token_acc": 0.78050173, "epoch": 0.52087323, "global_step/max_steps": "170/981", "percentage": "17.33%", "elapsed_time": "1h 15m 56s", "remaining_time": "6h 2m 16s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.037311}
23
- {"loss": 0.73518486, "grad_norm": 0.34708086, "learning_rate": 9.527e-05, "token_acc": 0.78610289, "epoch": 0.55151283, "global_step/max_steps": "180/981", "percentage": "18.35%", "elapsed_time": "1h 19m 52s", "remaining_time": "5h 55m 27s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.037557}
24
- {"loss": 0.71814475, "grad_norm": 0.35068321, "learning_rate": 9.452e-05, "token_acc": 0.79092584, "epoch": 0.58215243, "global_step/max_steps": "190/981", "percentage": "19.37%", "elapsed_time": "1h 23m 48s", "remaining_time": "5h 48m 54s", "memory(GiB)": 31.44, "train_speed(iter/s)": 0.037785}
25
- {"loss": 0.71195493, "grad_norm": 0.35233057, "learning_rate": 9.373e-05, "token_acc": 0.79211222, "epoch": 0.61279203, "global_step/max_steps": "200/981", "percentage": "20.39%", "elapsed_time": "1h 27m 51s", "remaining_time": "5h 43m 5s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.03794}
26
- {"eval_loss": 2.84019732, "eval_runtime": 127.5929, "eval_samples_per_second": 2.124, "eval_steps_per_second": 0.533, "eval_token_acc": 0.79085061, "epoch": 0.61279203, "global_step/max_steps": "200/981", "percentage": "20.39%", "elapsed_time": "1h 29m 59s", "remaining_time": "5h 51m 23s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037043}
27
- {"loss": 0.7260323, "grad_norm": 0.36426014, "learning_rate": 9.289e-05, "token_acc": 0.78744558, "epoch": 0.64343164, "global_step/max_steps": "210/981", "percentage": "21.41%", "elapsed_time": "1h 33m 54s", "remaining_time": "5h 44m 46s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037271}
28
- {"loss": 0.72307291, "grad_norm": 0.33783168, "learning_rate": 9.2e-05, "token_acc": 0.78724322, "epoch": 0.67407124, "global_step/max_steps": "220/981", "percentage": "22.43%", "elapsed_time": "1h 37m 50s", "remaining_time": "5h 38m 27s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037473}
29
- {"loss": 0.72595448, "grad_norm": 0.36016241, "learning_rate": 9.106e-05, "token_acc": 0.78829838, "epoch": 0.70471084, "global_step/max_steps": "230/981", "percentage": "23.45%", "elapsed_time": "1h 41m 44s", "remaining_time": "5h 32m 13s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037676}
30
- {"loss": 0.71018023, "grad_norm": 0.35668209, "learning_rate": 9.007e-05, "token_acc": 0.7899977, "epoch": 0.73535044, "global_step/max_steps": "240/981", "percentage": "24.46%", "elapsed_time": "1h 45m 37s", "remaining_time": "5h 26m 6s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037872}
31
- {"loss": 0.72569895, "grad_norm": 0.35717034, "learning_rate": 8.904e-05, "token_acc": 0.78774392, "epoch": 0.76599004, "global_step/max_steps": "250/981", "percentage": "25.48%", "elapsed_time": "1h 49m 30s", "remaining_time": "5h 20m 10s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.038051}
32
- {"eval_loss": 2.78667426, "eval_runtime": 127.4136, "eval_samples_per_second": 2.127, "eval_steps_per_second": 0.534, "eval_token_acc": 0.79334898, "epoch": 0.76599004, "global_step/max_steps": "250/981", "percentage": "25.48%", "elapsed_time": "1h 51m 37s", "remaining_time": "5h 26m 23s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037327}
33
- {"loss": 0.72528644, "grad_norm": 0.36592126, "learning_rate": 8.796e-05, "token_acc": 0.78691859, "epoch": 0.79662964, "global_step/max_steps": "260/981", "percentage": "26.50%", "elapsed_time": "1h 55m 38s", "remaining_time": "5h 20m 39s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037475}
34
- {"loss": 0.69857063, "grad_norm": 0.37026086, "learning_rate": 8.684e-05, "token_acc": 0.79334664, "epoch": 0.82726925, "global_step/max_steps": "270/981", "percentage": "27.52%", "elapsed_time": "1h 59m 32s", "remaining_time": "5h 14m 47s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.037644}
35
- {"loss": 0.70383086, "grad_norm": 0.37280416, "learning_rate": 8.568e-05, "token_acc": 0.79312025, "epoch": 0.85790885, "global_step/max_steps": "280/981", "percentage": "28.54%", "elapsed_time": "2h 3m 25s", "remaining_time": "5h 9m 0s", "memory(GiB)": 32.39, "train_speed(iter/s)": 0.03781}
 
1
+ {"eval_loss": 4.60694456, "eval_runtime": 132.1394, "eval_samples_per_second": 2.051, "eval_steps_per_second": 0.515, "eval_token_acc": 0.73034986, "epoch": 0, "global_step/max_steps": "0/981", "percentage": "0.00%", "elapsed_time": "2m 12s", "memory(GiB)": 22.08, "train_speed(iter/s)": 0.0}
2
+ {"loss": 1.16878664, "grad_norm": 0.5406909, "learning_rate": 2e-06, "token_acc": 0.72249161, "epoch": 0.00306396, "global_step/max_steps": "1/981", "percentage": "0.10%", "elapsed_time": "2m 35s", "remaining_time": "1d 18h 27m 54s", "memory(GiB)": 26.53, "train_speed(iter/s)": 0.00641}
3
+ {"loss": 1.17375967, "grad_norm": 0.58584249, "learning_rate": 2e-05, "token_acc": 0.72573775, "epoch": 0.0306396, "global_step/max_steps": "10/981", "percentage": "1.02%", "elapsed_time": "6m 13s", "remaining_time": "10h 4m 42s", "memory(GiB)": 30.81, "train_speed(iter/s)": 0.026762}
4
+ {"loss": 1.10936718, "grad_norm": 0.30921611, "learning_rate": 4e-05, "token_acc": 0.72818226, "epoch": 0.0612792, "global_step/max_steps": "20/981", "percentage": "2.04%", "elapsed_time": "10m 9s", "remaining_time": "8h 7m 44s", "memory(GiB)": 30.83, "train_speed(iter/s)": 0.032838}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
runs/events.out.tfevents.1772953118.h012.gautschi.rcac.purdue.edu.2974893.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87c3beb6fa4a4e2af68161b32c83510ccb2a1e97df929061d6f1fbbac534ec7b
3
+ size 8687
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71bdfd0fc9daa882a6ef8a4448464ebcbb2d7ca79e2ed78b60eb195888e23930
3
  size 7313
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f265fb913594b016142565a017a8043f0370b23ce88c91169f3acb5c67d1b6
3
  size 7313