diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..50689e8388dbc0f9d3327f2f96fac6ad3876057d --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8bb1d20f3daedd8a56bbd6c071a5682ef9652455 --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e89447616aaca469bb6106ebcb8ceb874e823d65a63eb3b53c8db20befadc78e +size 16823434 diff --git a/checkpoint-10/README.md b/checkpoint-10/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-10/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-10/adapter_config.json b/checkpoint-10/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-10/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-10/adapter_model.safetensors b/checkpoint-10/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..68b3323de555f9ef8e4b39e5284f02e45ee4ef15 --- /dev/null +++ b/checkpoint-10/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ca7d05227d9acba4c7e6cfbf34f504664cb1204f380206708787481b36f515d +size 16794200 diff --git a/checkpoint-10/optimizer.pt b/checkpoint-10/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5892c53c2181acd443e676401773526450a9183d --- /dev/null +++ b/checkpoint-10/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2e4ca09b0a6c6516a6e0065447618b3db4cd6de700dc91ddb551e63b925136 +size 33663866 diff --git a/checkpoint-10/rng_state.pth b/checkpoint-10/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0282722eb4aff0de4d7a5b0219b6e11f18941117 --- /dev/null +++ b/checkpoint-10/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfbd7c894a8329427aaddfb3f58ef2ae223d564384e6736a16a6c8f82f40c07a +size 14244 diff --git a/checkpoint-10/scheduler.pt b/checkpoint-10/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..795f58fc12649cf2a2773df821e661e88f28a061 --- /dev/null +++ b/checkpoint-10/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be4a312bb2bd4504a249badcb489acaa3a989d6be708dfc5ab7e5fbf10ccf35e +size 1064 diff --git a/checkpoint-10/trainer_state.json b/checkpoint-10/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..03746bfa6976ded6427837ea1266b505aba283c1 --- /dev/null +++ b/checkpoint-10/trainer_state.json @@ -0,0 +1,47 @@ +{ + "best_metric": 2.228980779647827, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-10", + "epoch": 0.7142857142857143, + "eval_steps": 5, + "global_step": 10, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.36, + "learning_rate": 6.428571428571429e-05, + "loss": 2.3035, + "step": 5 + }, + { + "epoch": 0.36, + "eval_loss": 2.2832000255584717, + "eval_runtime": 2.2848, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.438, + "step": 5 + }, + { + "epoch": 0.71, + "learning_rate": 2.857142857142857e-05, + "loss": 2.201, + "step": 10 + }, + { + "epoch": 0.71, + "eval_loss": 2.228980779647827, + "eval_runtime": 2.2845, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.438, + "step": 10 + } + ], + "logging_steps": 5, + "max_steps": 14, + "num_train_epochs": 1, + "save_steps": 10, + "total_flos": 3249703118438400.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10/training_args.bin b/checkpoint-10/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d1a3256e456b171e039cc4dd303842f2117c4480 --- /dev/null +++ b/checkpoint-10/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e31c0035e9b22c0b3219696704e35bc756922ae5383b862d4d36c885235386d +size 4600 diff --git a/checkpoint-740/README.md b/checkpoint-740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-740/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-740/adapter_config.json b/checkpoint-740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-740/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-740/adapter_model.safetensors b/checkpoint-740/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6ed35709aeef4948992fad13c8dfbb02773e6c48 --- /dev/null +++ b/checkpoint-740/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03e1c534886cae902eb7a95808425f743c4cc3a1c59b1e07887784083ecba1d +size 16794200 diff --git a/checkpoint-740/optimizer.pt b/checkpoint-740/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..91fe8bed09ab5fa20aaba58a2e58f5d508dd09e7 --- /dev/null +++ b/checkpoint-740/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d299e76fecfc33dfb4e3257e588b1f1b71b75702404d67c6e7b40b83877416 +size 33663866 diff --git a/checkpoint-740/rng_state.pth b/checkpoint-740/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..00ed38ceee966aabc6b21f0a80b053c17f9f69f0 --- /dev/null +++ b/checkpoint-740/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f126ecca3177183e3c620be89ec9369ae206954d236b75754682aeb21d0ba2 +size 14244 diff --git a/checkpoint-740/scheduler.pt b/checkpoint-740/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..853123783f1b73c1215627880826262629746f12 --- /dev/null +++ b/checkpoint-740/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6234655278ff34dff7cdc63b30f02513595625a8f6adbe76c717d04fbf89be9f +size 1064 diff --git a/checkpoint-740/trainer_state.json b/checkpoint-740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f77f653505768bb2ee077d825b49e49997ac9694 --- /dev/null +++ b/checkpoint-740/trainer_state.json @@ -0,0 +1,2091 @@ +{ + "best_metric": 1.6580705642700195, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-700", + "epoch": 1.9023136246786634, + "eval_steps": 5, + "global_step": 740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.93573264781491e-05, + "loss": 1.9824, + "step": 5 + }, + { + "epoch": 0.01, + "eval_loss": 1.954728126525879, + "eval_runtime": 31.6803, + "eval_samples_per_second": 2.651, + "eval_steps_per_second": 0.347, + "step": 5 + }, + { + "epoch": 0.03, + "learning_rate": 9.87146529562982e-05, + "loss": 1.9249, + "step": 10 + }, + { + "epoch": 0.03, + "eval_loss": 1.8934264183044434, + "eval_runtime": 31.7798, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 9.80719794344473e-05, + "loss": 1.8609, + "step": 15 + }, + { + "epoch": 0.04, + "eval_loss": 1.8421852588653564, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 9.742930591259641e-05, + "loss": 1.8268, + "step": 20 + }, + { + "epoch": 0.05, + "eval_loss": 1.8279638290405273, + "eval_runtime": 31.8095, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 20 + }, + { + "epoch": 0.06, + "learning_rate": 9.67866323907455e-05, + "loss": 1.8349, + "step": 25 + }, + { + "epoch": 0.06, + "eval_loss": 1.813267469406128, + "eval_runtime": 31.8291, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 9.61439588688946e-05, + "loss": 1.8239, + "step": 30 + }, + { + "epoch": 0.08, + "eval_loss": 1.8008779287338257, + "eval_runtime": 31.82, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 30 + }, + { + "epoch": 0.09, + "learning_rate": 9.550128534704372e-05, + "loss": 1.7177, + "step": 35 + }, + { + "epoch": 0.09, + "eval_loss": 1.789797067642212, + "eval_runtime": 31.8423, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 35 + }, + { + "epoch": 0.1, + "learning_rate": 9.485861182519281e-05, + "loss": 1.7962, + "step": 40 + }, + { + "epoch": 0.1, + "eval_loss": 1.7808287143707275, + "eval_runtime": 31.8387, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 40 + }, + { + "epoch": 0.12, + "learning_rate": 9.421593830334192e-05, + "loss": 1.715, + "step": 45 + }, + { + "epoch": 0.12, + "eval_loss": 1.771236777305603, + "eval_runtime": 31.8526, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 45 + }, + { + "epoch": 0.13, + "learning_rate": 9.357326478149101e-05, + "loss": 1.7577, + "step": 50 + }, + { + "epoch": 0.13, + "eval_loss": 1.7619521617889404, + "eval_runtime": 31.828, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 50 + }, + { + "epoch": 0.14, + "learning_rate": 9.29305912596401e-05, + "loss": 1.7323, + "step": 55 + }, + { + "epoch": 0.14, + "eval_loss": 1.7440986633300781, + "eval_runtime": 31.8301, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 55 + }, + { + "epoch": 0.15, + "learning_rate": 9.228791773778921e-05, + "loss": 1.7122, + "step": 60 + }, + { + "epoch": 0.15, + "eval_loss": 1.712996006011963, + "eval_runtime": 31.8185, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 60 + }, + { + "epoch": 0.17, + "learning_rate": 9.16452442159383e-05, + "loss": 1.7042, + "step": 65 + }, + { + "epoch": 0.17, + "eval_loss": 1.7101707458496094, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 65 + }, + { + "epoch": 0.18, + "learning_rate": 9.100257069408741e-05, + "loss": 1.7242, + "step": 70 + }, + { + "epoch": 0.18, + "eval_loss": 1.7038702964782715, + "eval_runtime": 31.8191, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 70 + }, + { + "epoch": 0.19, + "learning_rate": 9.03598971722365e-05, + "loss": 1.7033, + "step": 75 + }, + { + "epoch": 0.19, + "eval_loss": 1.7004183530807495, + "eval_runtime": 31.811, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 75 + }, + { + "epoch": 0.21, + "learning_rate": 8.97172236503856e-05, + "loss": 1.6934, + "step": 80 + }, + { + "epoch": 0.21, + "eval_loss": 1.6985355615615845, + "eval_runtime": 31.8221, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 80 + }, + { + "epoch": 0.22, + "learning_rate": 8.907455012853471e-05, + "loss": 1.6443, + "step": 85 + }, + { + "epoch": 0.22, + "eval_loss": 1.696852684020996, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 85 + }, + { + "epoch": 0.23, + "learning_rate": 8.84318766066838e-05, + "loss": 1.7008, + "step": 90 + }, + { + "epoch": 0.23, + "eval_loss": 1.695594072341919, + "eval_runtime": 31.8205, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 90 + }, + { + "epoch": 0.24, + "learning_rate": 8.778920308483291e-05, + "loss": 1.6873, + "step": 95 + }, + { + "epoch": 0.24, + "eval_loss": 1.6928850412368774, + "eval_runtime": 31.814, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 95 + }, + { + "epoch": 0.26, + "learning_rate": 8.7146529562982e-05, + "loss": 1.6721, + "step": 100 + }, + { + "epoch": 0.26, + "eval_loss": 1.6919386386871338, + "eval_runtime": 31.8127, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 100 + }, + { + "epoch": 0.27, + "learning_rate": 8.650385604113111e-05, + "loss": 1.6669, + "step": 105 + }, + { + "epoch": 0.27, + "eval_loss": 1.6905940771102905, + "eval_runtime": 31.8874, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 0.345, + "step": 105 + }, + { + "epoch": 0.28, + "learning_rate": 8.586118251928022e-05, + "loss": 1.7009, + "step": 110 + }, + { + "epoch": 0.28, + "eval_loss": 1.689422607421875, + "eval_runtime": 31.8489, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 110 + }, + { + "epoch": 0.3, + "learning_rate": 8.521850899742931e-05, + "loss": 1.7151, + "step": 115 + }, + { + "epoch": 0.3, + "eval_loss": 1.688527226448059, + "eval_runtime": 31.8318, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 115 + }, + { + "epoch": 0.31, + "learning_rate": 8.457583547557842e-05, + "loss": 1.7165, + "step": 120 + }, + { + "epoch": 0.31, + "eval_loss": 1.686748743057251, + "eval_runtime": 31.835, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 120 + }, + { + "epoch": 0.32, + "learning_rate": 8.393316195372751e-05, + "loss": 1.7015, + "step": 125 + }, + { + "epoch": 0.32, + "eval_loss": 1.6855812072753906, + "eval_runtime": 31.8269, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 125 + }, + { + "epoch": 0.33, + "learning_rate": 8.32904884318766e-05, + "loss": 1.6818, + "step": 130 + }, + { + "epoch": 0.33, + "eval_loss": 1.6846078634262085, + "eval_runtime": 31.8617, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 130 + }, + { + "epoch": 0.35, + "learning_rate": 8.264781491002571e-05, + "loss": 1.7151, + "step": 135 + }, + { + "epoch": 0.35, + "eval_loss": 1.6845166683197021, + "eval_runtime": 31.8631, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 135 + }, + { + "epoch": 0.36, + "learning_rate": 8.200514138817481e-05, + "loss": 1.6667, + "step": 140 + }, + { + "epoch": 0.36, + "eval_loss": 1.6832246780395508, + "eval_runtime": 31.8276, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 140 + }, + { + "epoch": 0.37, + "learning_rate": 8.136246786632391e-05, + "loss": 1.6586, + "step": 145 + }, + { + "epoch": 0.37, + "eval_loss": 1.6824573278427124, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 145 + }, + { + "epoch": 0.39, + "learning_rate": 8.071979434447301e-05, + "loss": 1.6999, + "step": 150 + }, + { + "epoch": 0.39, + "eval_loss": 1.681317687034607, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 150 + }, + { + "epoch": 0.4, + "learning_rate": 8.007712082262212e-05, + "loss": 1.6894, + "step": 155 + }, + { + "epoch": 0.4, + "eval_loss": 1.6812355518341064, + "eval_runtime": 31.8039, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 155 + }, + { + "epoch": 0.41, + "learning_rate": 7.943444730077121e-05, + "loss": 1.6669, + "step": 160 + }, + { + "epoch": 0.41, + "eval_loss": 1.679802656173706, + "eval_runtime": 31.8021, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 160 + }, + { + "epoch": 0.42, + "learning_rate": 7.87917737789203e-05, + "loss": 1.6481, + "step": 165 + }, + { + "epoch": 0.42, + "eval_loss": 1.679994821548462, + "eval_runtime": 31.8201, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 165 + }, + { + "epoch": 0.44, + "learning_rate": 7.814910025706941e-05, + "loss": 1.7042, + "step": 170 + }, + { + "epoch": 0.44, + "eval_loss": 1.6786224842071533, + "eval_runtime": 31.8184, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 170 + }, + { + "epoch": 0.45, + "learning_rate": 7.750642673521852e-05, + "loss": 1.6564, + "step": 175 + }, + { + "epoch": 0.45, + "eval_loss": 1.6783201694488525, + "eval_runtime": 31.8137, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 175 + }, + { + "epoch": 0.46, + "learning_rate": 7.686375321336761e-05, + "loss": 1.6714, + "step": 180 + }, + { + "epoch": 0.46, + "eval_loss": 1.677714228630066, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 180 + }, + { + "epoch": 0.48, + "learning_rate": 7.622107969151672e-05, + "loss": 1.6705, + "step": 185 + }, + { + "epoch": 0.48, + "eval_loss": 1.6772773265838623, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 185 + }, + { + "epoch": 0.49, + "learning_rate": 7.557840616966581e-05, + "loss": 1.6624, + "step": 190 + }, + { + "epoch": 0.49, + "eval_loss": 1.6766114234924316, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 190 + }, + { + "epoch": 0.5, + "learning_rate": 7.493573264781492e-05, + "loss": 1.6415, + "step": 195 + }, + { + "epoch": 0.5, + "eval_loss": 1.675940752029419, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 195 + }, + { + "epoch": 0.51, + "learning_rate": 7.429305912596401e-05, + "loss": 1.6633, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 1.6751865148544312, + "eval_runtime": 31.8123, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 200 + }, + { + "epoch": 0.53, + "learning_rate": 7.365038560411311e-05, + "loss": 1.6142, + "step": 205 + }, + { + "epoch": 0.53, + "eval_loss": 1.6750575304031372, + "eval_runtime": 31.8049, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 205 + }, + { + "epoch": 0.54, + "learning_rate": 7.300771208226222e-05, + "loss": 1.6736, + "step": 210 + }, + { + "epoch": 0.54, + "eval_loss": 1.6746423244476318, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 210 + }, + { + "epoch": 0.55, + "learning_rate": 7.236503856041131e-05, + "loss": 1.6414, + "step": 215 + }, + { + "epoch": 0.55, + "eval_loss": 1.6740491390228271, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 215 + }, + { + "epoch": 0.57, + "learning_rate": 7.172236503856042e-05, + "loss": 1.6725, + "step": 220 + }, + { + "epoch": 0.57, + "eval_loss": 1.673694372177124, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 220 + }, + { + "epoch": 0.58, + "learning_rate": 7.107969151670951e-05, + "loss": 1.679, + "step": 225 + }, + { + "epoch": 0.58, + "eval_loss": 1.6733818054199219, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 225 + }, + { + "epoch": 0.59, + "learning_rate": 7.043701799485862e-05, + "loss": 1.7204, + "step": 230 + }, + { + "epoch": 0.59, + "eval_loss": 1.6725033521652222, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 230 + }, + { + "epoch": 0.6, + "learning_rate": 6.979434447300771e-05, + "loss": 1.6816, + "step": 235 + }, + { + "epoch": 0.6, + "eval_loss": 1.67205011844635, + "eval_runtime": 31.809, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 235 + }, + { + "epoch": 0.62, + "learning_rate": 6.91516709511568e-05, + "loss": 1.634, + "step": 240 + }, + { + "epoch": 0.62, + "eval_loss": 1.6717994213104248, + "eval_runtime": 31.8075, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 240 + }, + { + "epoch": 0.63, + "learning_rate": 6.850899742930593e-05, + "loss": 1.6761, + "step": 245 + }, + { + "epoch": 0.63, + "eval_loss": 1.6714757680892944, + "eval_runtime": 31.8052, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 245 + }, + { + "epoch": 0.64, + "learning_rate": 6.786632390745502e-05, + "loss": 1.6996, + "step": 250 + }, + { + "epoch": 0.64, + "eval_loss": 1.671559453010559, + "eval_runtime": 31.8089, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 250 + }, + { + "epoch": 0.66, + "learning_rate": 6.722365038560411e-05, + "loss": 1.6302, + "step": 255 + }, + { + "epoch": 0.66, + "eval_loss": 1.6711652278900146, + "eval_runtime": 31.8105, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 255 + }, + { + "epoch": 0.67, + "learning_rate": 6.658097686375322e-05, + "loss": 1.6611, + "step": 260 + }, + { + "epoch": 0.67, + "eval_loss": 1.6704061031341553, + "eval_runtime": 31.8135, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 260 + }, + { + "epoch": 0.68, + "learning_rate": 6.593830334190231e-05, + "loss": 1.6586, + "step": 265 + }, + { + "epoch": 0.68, + "eval_loss": 1.6705658435821533, + "eval_runtime": 31.7998, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 265 + }, + { + "epoch": 0.69, + "learning_rate": 6.529562982005142e-05, + "loss": 1.6838, + "step": 270 + }, + { + "epoch": 0.69, + "eval_loss": 1.6697953939437866, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 270 + }, + { + "epoch": 0.71, + "learning_rate": 6.465295629820052e-05, + "loss": 1.6499, + "step": 275 + }, + { + "epoch": 0.71, + "eval_loss": 1.669343113899231, + "eval_runtime": 31.8147, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 275 + }, + { + "epoch": 0.72, + "learning_rate": 6.401028277634962e-05, + "loss": 1.6424, + "step": 280 + }, + { + "epoch": 0.72, + "eval_loss": 1.6693596839904785, + "eval_runtime": 31.8155, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 280 + }, + { + "epoch": 0.73, + "learning_rate": 6.336760925449872e-05, + "loss": 1.6238, + "step": 285 + }, + { + "epoch": 0.73, + "eval_loss": 1.669128179550171, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 285 + }, + { + "epoch": 0.75, + "learning_rate": 6.272493573264781e-05, + "loss": 1.6538, + "step": 290 + }, + { + "epoch": 0.75, + "eval_loss": 1.6691263914108276, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 290 + }, + { + "epoch": 0.76, + "learning_rate": 6.208226221079692e-05, + "loss": 1.656, + "step": 295 + }, + { + "epoch": 0.76, + "eval_loss": 1.66847825050354, + "eval_runtime": 31.7918, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 295 + }, + { + "epoch": 0.77, + "learning_rate": 6.143958868894601e-05, + "loss": 1.6407, + "step": 300 + }, + { + "epoch": 0.77, + "eval_loss": 1.6683858633041382, + "eval_runtime": 31.8114, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 300 + }, + { + "epoch": 0.78, + "learning_rate": 6.079691516709511e-05, + "loss": 1.6468, + "step": 305 + }, + { + "epoch": 0.78, + "eval_loss": 1.6678352355957031, + "eval_runtime": 31.7975, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 305 + }, + { + "epoch": 0.8, + "learning_rate": 6.015424164524421e-05, + "loss": 1.6579, + "step": 310 + }, + { + "epoch": 0.8, + "eval_loss": 1.6675294637680054, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 310 + }, + { + "epoch": 0.81, + "learning_rate": 5.951156812339333e-05, + "loss": 1.6331, + "step": 315 + }, + { + "epoch": 0.81, + "eval_loss": 1.6670397520065308, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 315 + }, + { + "epoch": 0.82, + "learning_rate": 5.886889460154242e-05, + "loss": 1.6634, + "step": 320 + }, + { + "epoch": 0.82, + "eval_loss": 1.6668330430984497, + "eval_runtime": 31.807, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 320 + }, + { + "epoch": 0.84, + "learning_rate": 5.822622107969152e-05, + "loss": 1.6406, + "step": 325 + }, + { + "epoch": 0.84, + "eval_loss": 1.6668546199798584, + "eval_runtime": 31.8004, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 325 + }, + { + "epoch": 0.85, + "learning_rate": 5.758354755784062e-05, + "loss": 1.6614, + "step": 330 + }, + { + "epoch": 0.85, + "eval_loss": 1.666812539100647, + "eval_runtime": 31.8023, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 330 + }, + { + "epoch": 0.86, + "learning_rate": 5.694087403598972e-05, + "loss": 1.6598, + "step": 335 + }, + { + "epoch": 0.86, + "eval_loss": 1.666398048400879, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 335 + }, + { + "epoch": 0.87, + "learning_rate": 5.6298200514138824e-05, + "loss": 1.6698, + "step": 340 + }, + { + "epoch": 0.87, + "eval_loss": 1.6660391092300415, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 340 + }, + { + "epoch": 0.89, + "learning_rate": 5.5655526992287924e-05, + "loss": 1.6346, + "step": 345 + }, + { + "epoch": 0.89, + "eval_loss": 1.665901780128479, + "eval_runtime": 31.7878, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 345 + }, + { + "epoch": 0.9, + "learning_rate": 5.501285347043702e-05, + "loss": 1.6617, + "step": 350 + }, + { + "epoch": 0.9, + "eval_loss": 1.666027307510376, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 350 + }, + { + "epoch": 0.91, + "learning_rate": 5.437017994858612e-05, + "loss": 1.6623, + "step": 355 + }, + { + "epoch": 0.91, + "eval_loss": 1.6658612489700317, + "eval_runtime": 31.8128, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 355 + }, + { + "epoch": 0.93, + "learning_rate": 5.372750642673522e-05, + "loss": 1.6737, + "step": 360 + }, + { + "epoch": 0.93, + "eval_loss": 1.6656837463378906, + "eval_runtime": 31.8094, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 360 + }, + { + "epoch": 0.94, + "learning_rate": 5.308483290488432e-05, + "loss": 1.6981, + "step": 365 + }, + { + "epoch": 0.94, + "eval_loss": 1.6653661727905273, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 365 + }, + { + "epoch": 0.95, + "learning_rate": 5.244215938303342e-05, + "loss": 1.6832, + "step": 370 + }, + { + "epoch": 0.95, + "eval_loss": 1.6652257442474365, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 370 + }, + { + "epoch": 0.96, + "learning_rate": 5.1799485861182514e-05, + "loss": 1.6925, + "step": 375 + }, + { + "epoch": 0.96, + "eval_loss": 1.664839267730713, + "eval_runtime": 31.8082, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 375 + }, + { + "epoch": 0.98, + "learning_rate": 5.1156812339331615e-05, + "loss": 1.6721, + "step": 380 + }, + { + "epoch": 0.98, + "eval_loss": 1.6642415523529053, + "eval_runtime": 31.7927, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 380 + }, + { + "epoch": 0.99, + "learning_rate": 5.051413881748073e-05, + "loss": 1.6208, + "step": 385 + }, + { + "epoch": 0.99, + "eval_loss": 1.6641273498535156, + "eval_runtime": 31.8006, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 385 + }, + { + "epoch": 1.0, + "learning_rate": 4.987146529562982e-05, + "loss": 1.6519, + "step": 390 + }, + { + "epoch": 1.0, + "eval_loss": 1.6645455360412598, + "eval_runtime": 31.8026, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 390 + }, + { + "epoch": 1.02, + "learning_rate": 4.922879177377892e-05, + "loss": 1.6581, + "step": 395 + }, + { + "epoch": 1.02, + "eval_loss": 1.6641591787338257, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 395 + }, + { + "epoch": 1.03, + "learning_rate": 4.8586118251928024e-05, + "loss": 1.6768, + "step": 400 + }, + { + "epoch": 1.03, + "eval_loss": 1.6637850999832153, + "eval_runtime": 31.806, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 400 + }, + { + "epoch": 1.04, + "learning_rate": 4.7943444730077124e-05, + "loss": 1.6257, + "step": 405 + }, + { + "epoch": 1.04, + "eval_loss": 1.6636526584625244, + "eval_runtime": 31.8116, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 405 + }, + { + "epoch": 1.05, + "learning_rate": 4.7300771208226225e-05, + "loss": 1.6522, + "step": 410 + }, + { + "epoch": 1.05, + "eval_loss": 1.6634690761566162, + "eval_runtime": 31.8063, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 410 + }, + { + "epoch": 1.07, + "learning_rate": 4.6658097686375325e-05, + "loss": 1.615, + "step": 415 + }, + { + "epoch": 1.07, + "eval_loss": 1.6637413501739502, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 415 + }, + { + "epoch": 1.08, + "learning_rate": 4.6015424164524426e-05, + "loss": 1.5967, + "step": 420 + }, + { + "epoch": 1.08, + "eval_loss": 1.6633188724517822, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 420 + }, + { + "epoch": 1.09, + "learning_rate": 4.537275064267352e-05, + "loss": 1.6708, + "step": 425 + }, + { + "epoch": 1.09, + "eval_loss": 1.6634286642074585, + "eval_runtime": 31.8074, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 425 + }, + { + "epoch": 1.11, + "learning_rate": 4.473007712082262e-05, + "loss": 1.6696, + "step": 430 + }, + { + "epoch": 1.11, + "eval_loss": 1.663163661956787, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 430 + }, + { + "epoch": 1.12, + "learning_rate": 4.408740359897173e-05, + "loss": 1.6344, + "step": 435 + }, + { + "epoch": 1.12, + "eval_loss": 1.6632493734359741, + "eval_runtime": 31.8032, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 435 + }, + { + "epoch": 1.13, + "learning_rate": 4.344473007712083e-05, + "loss": 1.5922, + "step": 440 + }, + { + "epoch": 1.13, + "eval_loss": 1.6627737283706665, + "eval_runtime": 31.8064, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 440 + }, + { + "epoch": 1.14, + "learning_rate": 4.280205655526993e-05, + "loss": 1.6541, + "step": 445 + }, + { + "epoch": 1.14, + "eval_loss": 1.6630815267562866, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 445 + }, + { + "epoch": 1.16, + "learning_rate": 4.215938303341902e-05, + "loss": 1.6348, + "step": 450 + }, + { + "epoch": 1.16, + "eval_loss": 1.662713885307312, + "eval_runtime": 31.7977, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 450 + }, + { + "epoch": 1.17, + "learning_rate": 4.151670951156812e-05, + "loss": 1.6444, + "step": 455 + }, + { + "epoch": 1.17, + "eval_loss": 1.6629937887191772, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 455 + }, + { + "epoch": 1.18, + "learning_rate": 4.0874035989717224e-05, + "loss": 1.6629, + "step": 460 + }, + { + "epoch": 1.18, + "eval_loss": 1.6626592874526978, + "eval_runtime": 31.8136, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 460 + }, + { + "epoch": 1.2, + "learning_rate": 4.0231362467866324e-05, + "loss": 1.6327, + "step": 465 + }, + { + "epoch": 1.2, + "eval_loss": 1.6625391244888306, + "eval_runtime": 31.802, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 465 + }, + { + "epoch": 1.21, + "learning_rate": 3.958868894601543e-05, + "loss": 1.6236, + "step": 470 + }, + { + "epoch": 1.21, + "eval_loss": 1.6622934341430664, + "eval_runtime": 31.8055, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 470 + }, + { + "epoch": 1.22, + "learning_rate": 3.8946015424164526e-05, + "loss": 1.6232, + "step": 475 + }, + { + "epoch": 1.22, + "eval_loss": 1.661791443824768, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 475 + }, + { + "epoch": 1.23, + "learning_rate": 3.8303341902313626e-05, + "loss": 1.6374, + "step": 480 + }, + { + "epoch": 1.23, + "eval_loss": 1.6618887186050415, + "eval_runtime": 31.7892, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 480 + }, + { + "epoch": 1.25, + "learning_rate": 3.766066838046273e-05, + "loss": 1.6422, + "step": 485 + }, + { + "epoch": 1.25, + "eval_loss": 1.6620476245880127, + "eval_runtime": 31.7974, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 485 + }, + { + "epoch": 1.26, + "learning_rate": 3.701799485861183e-05, + "loss": 1.631, + "step": 490 + }, + { + "epoch": 1.26, + "eval_loss": 1.6622774600982666, + "eval_runtime": 31.7978, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 490 + }, + { + "epoch": 1.27, + "learning_rate": 3.637532133676093e-05, + "loss": 1.609, + "step": 495 + }, + { + "epoch": 1.27, + "eval_loss": 1.661568284034729, + "eval_runtime": 31.8034, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 495 + }, + { + "epoch": 1.29, + "learning_rate": 3.573264781491003e-05, + "loss": 1.6444, + "step": 500 + }, + { + "epoch": 1.29, + "eval_loss": 1.6610949039459229, + "eval_runtime": 31.8005, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 500 + }, + { + "epoch": 1.3, + "learning_rate": 3.508997429305913e-05, + "loss": 1.6387, + "step": 505 + }, + { + "epoch": 1.3, + "eval_loss": 1.6611486673355103, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 505 + }, + { + "epoch": 1.31, + "learning_rate": 3.444730077120823e-05, + "loss": 1.6507, + "step": 510 + }, + { + "epoch": 1.31, + "eval_loss": 1.661084532737732, + "eval_runtime": 31.8028, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 510 + }, + { + "epoch": 1.32, + "learning_rate": 3.380462724935733e-05, + "loss": 1.6377, + "step": 515 + }, + { + "epoch": 1.32, + "eval_loss": 1.6610103845596313, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 515 + }, + { + "epoch": 1.34, + "learning_rate": 3.316195372750643e-05, + "loss": 1.6351, + "step": 520 + }, + { + "epoch": 1.34, + "eval_loss": 1.6608690023422241, + "eval_runtime": 31.7986, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 520 + }, + { + "epoch": 1.35, + "learning_rate": 3.251928020565553e-05, + "loss": 1.6196, + "step": 525 + }, + { + "epoch": 1.35, + "eval_loss": 1.6606587171554565, + "eval_runtime": 31.8015, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 525 + }, + { + "epoch": 1.36, + "learning_rate": 3.1876606683804625e-05, + "loss": 1.646, + "step": 530 + }, + { + "epoch": 1.36, + "eval_loss": 1.6605886220932007, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 530 + }, + { + "epoch": 1.38, + "learning_rate": 3.1233933161953726e-05, + "loss": 1.6824, + "step": 535 + }, + { + "epoch": 1.38, + "eval_loss": 1.6603879928588867, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 535 + }, + { + "epoch": 1.39, + "learning_rate": 3.059125964010283e-05, + "loss": 1.6115, + "step": 540 + }, + { + "epoch": 1.39, + "eval_loss": 1.660337209701538, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 540 + }, + { + "epoch": 1.4, + "learning_rate": 2.994858611825193e-05, + "loss": 1.6243, + "step": 545 + }, + { + "epoch": 1.4, + "eval_loss": 1.6606225967407227, + "eval_runtime": 31.808, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 545 + }, + { + "epoch": 1.41, + "learning_rate": 2.930591259640103e-05, + "loss": 1.6486, + "step": 550 + }, + { + "epoch": 1.41, + "eval_loss": 1.6604374647140503, + "eval_runtime": 31.8072, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 550 + }, + { + "epoch": 1.43, + "learning_rate": 2.866323907455013e-05, + "loss": 1.6865, + "step": 555 + }, + { + "epoch": 1.43, + "eval_loss": 1.6601544618606567, + "eval_runtime": 31.8069, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 555 + }, + { + "epoch": 1.44, + "learning_rate": 2.802056555269923e-05, + "loss": 1.6616, + "step": 560 + }, + { + "epoch": 1.44, + "eval_loss": 1.660386562347412, + "eval_runtime": 31.803, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 560 + }, + { + "epoch": 1.45, + "learning_rate": 2.737789203084833e-05, + "loss": 1.6336, + "step": 565 + }, + { + "epoch": 1.45, + "eval_loss": 1.660247802734375, + "eval_runtime": 31.8084, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 565 + }, + { + "epoch": 1.47, + "learning_rate": 2.673521850899743e-05, + "loss": 1.6699, + "step": 570 + }, + { + "epoch": 1.47, + "eval_loss": 1.660292148590088, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 570 + }, + { + "epoch": 1.48, + "learning_rate": 2.6092544987146534e-05, + "loss": 1.6472, + "step": 575 + }, + { + "epoch": 1.48, + "eval_loss": 1.6598683595657349, + "eval_runtime": 31.8181, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 575 + }, + { + "epoch": 1.49, + "learning_rate": 2.5449871465295634e-05, + "loss": 1.6723, + "step": 580 + }, + { + "epoch": 1.49, + "eval_loss": 1.6599065065383911, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 580 + }, + { + "epoch": 1.5, + "learning_rate": 2.480719794344473e-05, + "loss": 1.663, + "step": 585 + }, + { + "epoch": 1.5, + "eval_loss": 1.6599762439727783, + "eval_runtime": 31.8175, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 585 + }, + { + "epoch": 1.52, + "learning_rate": 2.4164524421593832e-05, + "loss": 1.6519, + "step": 590 + }, + { + "epoch": 1.52, + "eval_loss": 1.6599925756454468, + "eval_runtime": 31.8209, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 590 + }, + { + "epoch": 1.53, + "learning_rate": 2.3521850899742933e-05, + "loss": 1.6345, + "step": 595 + }, + { + "epoch": 1.53, + "eval_loss": 1.6600078344345093, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 595 + }, + { + "epoch": 1.54, + "learning_rate": 2.2879177377892033e-05, + "loss": 1.654, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 1.6599884033203125, + "eval_runtime": 31.8281, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 600 + }, + { + "epoch": 1.56, + "learning_rate": 2.2236503856041134e-05, + "loss": 1.691, + "step": 605 + }, + { + "epoch": 1.56, + "eval_loss": 1.6599520444869995, + "eval_runtime": 31.8168, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 605 + }, + { + "epoch": 1.57, + "learning_rate": 2.159383033419023e-05, + "loss": 1.6714, + "step": 610 + }, + { + "epoch": 1.57, + "eval_loss": 1.659857988357544, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 610 + }, + { + "epoch": 1.58, + "learning_rate": 2.095115681233933e-05, + "loss": 1.5977, + "step": 615 + }, + { + "epoch": 1.58, + "eval_loss": 1.6596992015838623, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 615 + }, + { + "epoch": 1.59, + "learning_rate": 2.0308483290488432e-05, + "loss": 1.6429, + "step": 620 + }, + { + "epoch": 1.59, + "eval_loss": 1.6595497131347656, + "eval_runtime": 31.8151, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 620 + }, + { + "epoch": 1.61, + "learning_rate": 1.9665809768637533e-05, + "loss": 1.6091, + "step": 625 + }, + { + "epoch": 1.61, + "eval_loss": 1.6597559452056885, + "eval_runtime": 31.8076, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 625 + }, + { + "epoch": 1.62, + "learning_rate": 1.9023136246786633e-05, + "loss": 1.5956, + "step": 630 + }, + { + "epoch": 1.62, + "eval_loss": 1.6593599319458008, + "eval_runtime": 31.8058, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 630 + }, + { + "epoch": 1.63, + "learning_rate": 1.8380462724935734e-05, + "loss": 1.6697, + "step": 635 + }, + { + "epoch": 1.63, + "eval_loss": 1.6592466831207275, + "eval_runtime": 31.8589, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 635 + }, + { + "epoch": 1.65, + "learning_rate": 1.7737789203084834e-05, + "loss": 1.6535, + "step": 640 + }, + { + "epoch": 1.65, + "eval_loss": 1.6592165231704712, + "eval_runtime": 31.8225, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 640 + }, + { + "epoch": 1.66, + "learning_rate": 1.7095115681233935e-05, + "loss": 1.6722, + "step": 645 + }, + { + "epoch": 1.66, + "eval_loss": 1.659324049949646, + "eval_runtime": 31.8388, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 645 + }, + { + "epoch": 1.67, + "learning_rate": 1.6452442159383032e-05, + "loss": 1.6347, + "step": 650 + }, + { + "epoch": 1.67, + "eval_loss": 1.6590396165847778, + "eval_runtime": 31.8174, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 650 + }, + { + "epoch": 1.68, + "learning_rate": 1.5809768637532136e-05, + "loss": 1.6223, + "step": 655 + }, + { + "epoch": 1.68, + "eval_loss": 1.658969521522522, + "eval_runtime": 31.8275, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 655 + }, + { + "epoch": 1.7, + "learning_rate": 1.5167095115681235e-05, + "loss": 1.6309, + "step": 660 + }, + { + "epoch": 1.7, + "eval_loss": 1.6587274074554443, + "eval_runtime": 31.8146, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 660 + }, + { + "epoch": 1.71, + "learning_rate": 1.4524421593830334e-05, + "loss": 1.6499, + "step": 665 + }, + { + "epoch": 1.71, + "eval_loss": 1.65865957736969, + "eval_runtime": 31.8091, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 665 + }, + { + "epoch": 1.72, + "learning_rate": 1.3881748071979436e-05, + "loss": 1.6713, + "step": 670 + }, + { + "epoch": 1.72, + "eval_loss": 1.6586016416549683, + "eval_runtime": 31.8229, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 670 + }, + { + "epoch": 1.74, + "learning_rate": 1.3239074550128535e-05, + "loss": 1.6348, + "step": 675 + }, + { + "epoch": 1.74, + "eval_loss": 1.6584962606430054, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 675 + }, + { + "epoch": 1.75, + "learning_rate": 1.2596401028277636e-05, + "loss": 1.6447, + "step": 680 + }, + { + "epoch": 1.75, + "eval_loss": 1.6585614681243896, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 680 + }, + { + "epoch": 1.76, + "learning_rate": 1.1953727506426736e-05, + "loss": 1.6511, + "step": 685 + }, + { + "epoch": 1.76, + "eval_loss": 1.6583601236343384, + "eval_runtime": 31.8125, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 685 + }, + { + "epoch": 1.77, + "learning_rate": 1.1311053984575835e-05, + "loss": 1.6496, + "step": 690 + }, + { + "epoch": 1.77, + "eval_loss": 1.6583328247070312, + "eval_runtime": 31.8179, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 690 + }, + { + "epoch": 1.79, + "learning_rate": 1.0668380462724936e-05, + "loss": 1.6421, + "step": 695 + }, + { + "epoch": 1.79, + "eval_loss": 1.6583188772201538, + "eval_runtime": 31.8165, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 695 + }, + { + "epoch": 1.8, + "learning_rate": 1.0025706940874038e-05, + "loss": 1.6126, + "step": 700 + }, + { + "epoch": 1.8, + "eval_loss": 1.6580705642700195, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 700 + }, + { + "epoch": 1.81, + "learning_rate": 9.383033419023137e-06, + "loss": 1.6226, + "step": 705 + }, + { + "epoch": 1.81, + "eval_loss": 1.6584465503692627, + "eval_runtime": 31.816, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 705 + }, + { + "epoch": 1.83, + "learning_rate": 8.740359897172237e-06, + "loss": 1.6923, + "step": 710 + }, + { + "epoch": 1.83, + "eval_loss": 1.658342957496643, + "eval_runtime": 31.8215, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 710 + }, + { + "epoch": 1.84, + "learning_rate": 8.097686375321336e-06, + "loss": 1.6224, + "step": 715 + }, + { + "epoch": 1.84, + "eval_loss": 1.6582502126693726, + "eval_runtime": 31.812, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 715 + }, + { + "epoch": 1.85, + "learning_rate": 7.4550128534704376e-06, + "loss": 1.6587, + "step": 720 + }, + { + "epoch": 1.85, + "eval_loss": 1.658232569694519, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 720 + }, + { + "epoch": 1.86, + "learning_rate": 6.812339331619537e-06, + "loss": 1.6641, + "step": 725 + }, + { + "epoch": 1.86, + "eval_loss": 1.6582419872283936, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 725 + }, + { + "epoch": 1.88, + "learning_rate": 6.169665809768638e-06, + "loss": 1.6419, + "step": 730 + }, + { + "epoch": 1.88, + "eval_loss": 1.6582107543945312, + "eval_runtime": 31.833, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 730 + }, + { + "epoch": 1.89, + "learning_rate": 5.526992287917738e-06, + "loss": 1.652, + "step": 735 + }, + { + "epoch": 1.89, + "eval_loss": 1.6582270860671997, + "eval_runtime": 31.8801, + "eval_samples_per_second": 2.635, + "eval_steps_per_second": 0.345, + "step": 735 + }, + { + "epoch": 1.9, + "learning_rate": 4.884318766066838e-06, + "loss": 1.6487, + "step": 740 + }, + { + "epoch": 1.9, + "eval_loss": 1.6582915782928467, + "eval_runtime": 31.8265, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 740 + } + ], + "logging_steps": 5, + "max_steps": 778, + "num_train_epochs": 2, + "save_steps": 10, + "total_flos": 2.4039678818648064e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-740/training_args.bin b/checkpoint-740/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48088b6a3851b1352bfa758dd2b3ad558168087f --- /dev/null +++ b/checkpoint-740/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc6a8542abd91027de58ebfccafdf919108680707834fe188d91cae077ef8e9 +size 4600 diff --git a/checkpoint-750/README.md b/checkpoint-750/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-750/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-750/adapter_config.json b/checkpoint-750/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-750/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-750/adapter_model.safetensors b/checkpoint-750/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..143440317ac9047a7e8542ba944d29a3cc4cf075 --- /dev/null +++ b/checkpoint-750/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0db055afa4e779064330cfa471192b67fc333a5cae786bb3fa230f8be5a897f +size 16794200 diff --git a/checkpoint-750/optimizer.pt b/checkpoint-750/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..00a4770a81a11c57a6ce6d1e89a2a3bd4730e080 --- /dev/null +++ b/checkpoint-750/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07c6460edcba33b54b6ac46970a9eb988c702b55e3f0bc05da88f3f21dbc55b5 +size 33663866 diff --git a/checkpoint-750/rng_state.pth b/checkpoint-750/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..51c56c2426708b8edf000d5db85d40b860820885 --- /dev/null +++ b/checkpoint-750/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ce38b2aa4d6ac4ebbc000d2ae5e7ed24241a918687e62a058926935f4a4446 +size 14244 diff --git a/checkpoint-750/scheduler.pt b/checkpoint-750/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..002e00fa616f4cf5f6e5044cb49dfaad24a6e685 --- /dev/null +++ b/checkpoint-750/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4aa7fa9070b6f1e0780c2b6ec486cdbc883e6f06e8f42ae16069e6d9cd1a7c +size 1064 diff --git a/checkpoint-750/trainer_state.json b/checkpoint-750/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e6b76c10ec8b4745cf28906499921621890d393 --- /dev/null +++ b/checkpoint-750/trainer_state.json @@ -0,0 +1,2119 @@ +{ + "best_metric": 1.6578683853149414, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-750", + "epoch": 1.9280205655526992, + "eval_steps": 5, + "global_step": 750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.93573264781491e-05, + "loss": 1.9824, + "step": 5 + }, + { + "epoch": 0.01, + "eval_loss": 1.954728126525879, + "eval_runtime": 31.6803, + "eval_samples_per_second": 2.651, + "eval_steps_per_second": 0.347, + "step": 5 + }, + { + "epoch": 0.03, + "learning_rate": 9.87146529562982e-05, + "loss": 1.9249, + "step": 10 + }, + { + "epoch": 0.03, + "eval_loss": 1.8934264183044434, + "eval_runtime": 31.7798, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 9.80719794344473e-05, + "loss": 1.8609, + "step": 15 + }, + { + "epoch": 0.04, + "eval_loss": 1.8421852588653564, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 9.742930591259641e-05, + "loss": 1.8268, + "step": 20 + }, + { + "epoch": 0.05, + "eval_loss": 1.8279638290405273, + "eval_runtime": 31.8095, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 20 + }, + { + "epoch": 0.06, + "learning_rate": 9.67866323907455e-05, + "loss": 1.8349, + "step": 25 + }, + { + "epoch": 0.06, + "eval_loss": 1.813267469406128, + "eval_runtime": 31.8291, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 9.61439588688946e-05, + "loss": 1.8239, + "step": 30 + }, + { + "epoch": 0.08, + "eval_loss": 1.8008779287338257, + "eval_runtime": 31.82, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 30 + }, + { + "epoch": 0.09, + "learning_rate": 9.550128534704372e-05, + "loss": 1.7177, + "step": 35 + }, + { + "epoch": 0.09, + "eval_loss": 1.789797067642212, + "eval_runtime": 31.8423, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 35 + }, + { + "epoch": 0.1, + "learning_rate": 9.485861182519281e-05, + "loss": 1.7962, + "step": 40 + }, + { + "epoch": 0.1, + "eval_loss": 1.7808287143707275, + "eval_runtime": 31.8387, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 40 + }, + { + "epoch": 0.12, + "learning_rate": 9.421593830334192e-05, + "loss": 1.715, + "step": 45 + }, + { + "epoch": 0.12, + "eval_loss": 1.771236777305603, + "eval_runtime": 31.8526, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 45 + }, + { + "epoch": 0.13, + "learning_rate": 9.357326478149101e-05, + "loss": 1.7577, + "step": 50 + }, + { + "epoch": 0.13, + "eval_loss": 1.7619521617889404, + "eval_runtime": 31.828, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 50 + }, + { + "epoch": 0.14, + "learning_rate": 9.29305912596401e-05, + "loss": 1.7323, + "step": 55 + }, + { + "epoch": 0.14, + "eval_loss": 1.7440986633300781, + "eval_runtime": 31.8301, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 55 + }, + { + "epoch": 0.15, + "learning_rate": 9.228791773778921e-05, + "loss": 1.7122, + "step": 60 + }, + { + "epoch": 0.15, + "eval_loss": 1.712996006011963, + "eval_runtime": 31.8185, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 60 + }, + { + "epoch": 0.17, + "learning_rate": 9.16452442159383e-05, + "loss": 1.7042, + "step": 65 + }, + { + "epoch": 0.17, + "eval_loss": 1.7101707458496094, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 65 + }, + { + "epoch": 0.18, + "learning_rate": 9.100257069408741e-05, + "loss": 1.7242, + "step": 70 + }, + { + "epoch": 0.18, + "eval_loss": 1.7038702964782715, + "eval_runtime": 31.8191, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 70 + }, + { + "epoch": 0.19, + "learning_rate": 9.03598971722365e-05, + "loss": 1.7033, + "step": 75 + }, + { + "epoch": 0.19, + "eval_loss": 1.7004183530807495, + "eval_runtime": 31.811, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 75 + }, + { + "epoch": 0.21, + "learning_rate": 8.97172236503856e-05, + "loss": 1.6934, + "step": 80 + }, + { + "epoch": 0.21, + "eval_loss": 1.6985355615615845, + "eval_runtime": 31.8221, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 80 + }, + { + "epoch": 0.22, + "learning_rate": 8.907455012853471e-05, + "loss": 1.6443, + "step": 85 + }, + { + "epoch": 0.22, + "eval_loss": 1.696852684020996, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 85 + }, + { + "epoch": 0.23, + "learning_rate": 8.84318766066838e-05, + "loss": 1.7008, + "step": 90 + }, + { + "epoch": 0.23, + "eval_loss": 1.695594072341919, + "eval_runtime": 31.8205, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 90 + }, + { + "epoch": 0.24, + "learning_rate": 8.778920308483291e-05, + "loss": 1.6873, + "step": 95 + }, + { + "epoch": 0.24, + "eval_loss": 1.6928850412368774, + "eval_runtime": 31.814, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 95 + }, + { + "epoch": 0.26, + "learning_rate": 8.7146529562982e-05, + "loss": 1.6721, + "step": 100 + }, + { + "epoch": 0.26, + "eval_loss": 1.6919386386871338, + "eval_runtime": 31.8127, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 100 + }, + { + "epoch": 0.27, + "learning_rate": 8.650385604113111e-05, + "loss": 1.6669, + "step": 105 + }, + { + "epoch": 0.27, + "eval_loss": 1.6905940771102905, + "eval_runtime": 31.8874, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 0.345, + "step": 105 + }, + { + "epoch": 0.28, + "learning_rate": 8.586118251928022e-05, + "loss": 1.7009, + "step": 110 + }, + { + "epoch": 0.28, + "eval_loss": 1.689422607421875, + "eval_runtime": 31.8489, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 110 + }, + { + "epoch": 0.3, + "learning_rate": 8.521850899742931e-05, + "loss": 1.7151, + "step": 115 + }, + { + "epoch": 0.3, + "eval_loss": 1.688527226448059, + "eval_runtime": 31.8318, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 115 + }, + { + "epoch": 0.31, + "learning_rate": 8.457583547557842e-05, + "loss": 1.7165, + "step": 120 + }, + { + "epoch": 0.31, + "eval_loss": 1.686748743057251, + "eval_runtime": 31.835, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 120 + }, + { + "epoch": 0.32, + "learning_rate": 8.393316195372751e-05, + "loss": 1.7015, + "step": 125 + }, + { + "epoch": 0.32, + "eval_loss": 1.6855812072753906, + "eval_runtime": 31.8269, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 125 + }, + { + "epoch": 0.33, + "learning_rate": 8.32904884318766e-05, + "loss": 1.6818, + "step": 130 + }, + { + "epoch": 0.33, + "eval_loss": 1.6846078634262085, + "eval_runtime": 31.8617, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 130 + }, + { + "epoch": 0.35, + "learning_rate": 8.264781491002571e-05, + "loss": 1.7151, + "step": 135 + }, + { + "epoch": 0.35, + "eval_loss": 1.6845166683197021, + "eval_runtime": 31.8631, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 135 + }, + { + "epoch": 0.36, + "learning_rate": 8.200514138817481e-05, + "loss": 1.6667, + "step": 140 + }, + { + "epoch": 0.36, + "eval_loss": 1.6832246780395508, + "eval_runtime": 31.8276, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 140 + }, + { + "epoch": 0.37, + "learning_rate": 8.136246786632391e-05, + "loss": 1.6586, + "step": 145 + }, + { + "epoch": 0.37, + "eval_loss": 1.6824573278427124, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 145 + }, + { + "epoch": 0.39, + "learning_rate": 8.071979434447301e-05, + "loss": 1.6999, + "step": 150 + }, + { + "epoch": 0.39, + "eval_loss": 1.681317687034607, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 150 + }, + { + "epoch": 0.4, + "learning_rate": 8.007712082262212e-05, + "loss": 1.6894, + "step": 155 + }, + { + "epoch": 0.4, + "eval_loss": 1.6812355518341064, + "eval_runtime": 31.8039, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 155 + }, + { + "epoch": 0.41, + "learning_rate": 7.943444730077121e-05, + "loss": 1.6669, + "step": 160 + }, + { + "epoch": 0.41, + "eval_loss": 1.679802656173706, + "eval_runtime": 31.8021, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 160 + }, + { + "epoch": 0.42, + "learning_rate": 7.87917737789203e-05, + "loss": 1.6481, + "step": 165 + }, + { + "epoch": 0.42, + "eval_loss": 1.679994821548462, + "eval_runtime": 31.8201, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 165 + }, + { + "epoch": 0.44, + "learning_rate": 7.814910025706941e-05, + "loss": 1.7042, + "step": 170 + }, + { + "epoch": 0.44, + "eval_loss": 1.6786224842071533, + "eval_runtime": 31.8184, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 170 + }, + { + "epoch": 0.45, + "learning_rate": 7.750642673521852e-05, + "loss": 1.6564, + "step": 175 + }, + { + "epoch": 0.45, + "eval_loss": 1.6783201694488525, + "eval_runtime": 31.8137, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 175 + }, + { + "epoch": 0.46, + "learning_rate": 7.686375321336761e-05, + "loss": 1.6714, + "step": 180 + }, + { + "epoch": 0.46, + "eval_loss": 1.677714228630066, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 180 + }, + { + "epoch": 0.48, + "learning_rate": 7.622107969151672e-05, + "loss": 1.6705, + "step": 185 + }, + { + "epoch": 0.48, + "eval_loss": 1.6772773265838623, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 185 + }, + { + "epoch": 0.49, + "learning_rate": 7.557840616966581e-05, + "loss": 1.6624, + "step": 190 + }, + { + "epoch": 0.49, + "eval_loss": 1.6766114234924316, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 190 + }, + { + "epoch": 0.5, + "learning_rate": 7.493573264781492e-05, + "loss": 1.6415, + "step": 195 + }, + { + "epoch": 0.5, + "eval_loss": 1.675940752029419, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 195 + }, + { + "epoch": 0.51, + "learning_rate": 7.429305912596401e-05, + "loss": 1.6633, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 1.6751865148544312, + "eval_runtime": 31.8123, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 200 + }, + { + "epoch": 0.53, + "learning_rate": 7.365038560411311e-05, + "loss": 1.6142, + "step": 205 + }, + { + "epoch": 0.53, + "eval_loss": 1.6750575304031372, + "eval_runtime": 31.8049, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 205 + }, + { + "epoch": 0.54, + "learning_rate": 7.300771208226222e-05, + "loss": 1.6736, + "step": 210 + }, + { + "epoch": 0.54, + "eval_loss": 1.6746423244476318, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 210 + }, + { + "epoch": 0.55, + "learning_rate": 7.236503856041131e-05, + "loss": 1.6414, + "step": 215 + }, + { + "epoch": 0.55, + "eval_loss": 1.6740491390228271, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 215 + }, + { + "epoch": 0.57, + "learning_rate": 7.172236503856042e-05, + "loss": 1.6725, + "step": 220 + }, + { + "epoch": 0.57, + "eval_loss": 1.673694372177124, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 220 + }, + { + "epoch": 0.58, + "learning_rate": 7.107969151670951e-05, + "loss": 1.679, + "step": 225 + }, + { + "epoch": 0.58, + "eval_loss": 1.6733818054199219, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 225 + }, + { + "epoch": 0.59, + "learning_rate": 7.043701799485862e-05, + "loss": 1.7204, + "step": 230 + }, + { + "epoch": 0.59, + "eval_loss": 1.6725033521652222, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 230 + }, + { + "epoch": 0.6, + "learning_rate": 6.979434447300771e-05, + "loss": 1.6816, + "step": 235 + }, + { + "epoch": 0.6, + "eval_loss": 1.67205011844635, + "eval_runtime": 31.809, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 235 + }, + { + "epoch": 0.62, + "learning_rate": 6.91516709511568e-05, + "loss": 1.634, + "step": 240 + }, + { + "epoch": 0.62, + "eval_loss": 1.6717994213104248, + "eval_runtime": 31.8075, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 240 + }, + { + "epoch": 0.63, + "learning_rate": 6.850899742930593e-05, + "loss": 1.6761, + "step": 245 + }, + { + "epoch": 0.63, + "eval_loss": 1.6714757680892944, + "eval_runtime": 31.8052, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 245 + }, + { + "epoch": 0.64, + "learning_rate": 6.786632390745502e-05, + "loss": 1.6996, + "step": 250 + }, + { + "epoch": 0.64, + "eval_loss": 1.671559453010559, + "eval_runtime": 31.8089, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 250 + }, + { + "epoch": 0.66, + "learning_rate": 6.722365038560411e-05, + "loss": 1.6302, + "step": 255 + }, + { + "epoch": 0.66, + "eval_loss": 1.6711652278900146, + "eval_runtime": 31.8105, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 255 + }, + { + "epoch": 0.67, + "learning_rate": 6.658097686375322e-05, + "loss": 1.6611, + "step": 260 + }, + { + "epoch": 0.67, + "eval_loss": 1.6704061031341553, + "eval_runtime": 31.8135, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 260 + }, + { + "epoch": 0.68, + "learning_rate": 6.593830334190231e-05, + "loss": 1.6586, + "step": 265 + }, + { + "epoch": 0.68, + "eval_loss": 1.6705658435821533, + "eval_runtime": 31.7998, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 265 + }, + { + "epoch": 0.69, + "learning_rate": 6.529562982005142e-05, + "loss": 1.6838, + "step": 270 + }, + { + "epoch": 0.69, + "eval_loss": 1.6697953939437866, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 270 + }, + { + "epoch": 0.71, + "learning_rate": 6.465295629820052e-05, + "loss": 1.6499, + "step": 275 + }, + { + "epoch": 0.71, + "eval_loss": 1.669343113899231, + "eval_runtime": 31.8147, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 275 + }, + { + "epoch": 0.72, + "learning_rate": 6.401028277634962e-05, + "loss": 1.6424, + "step": 280 + }, + { + "epoch": 0.72, + "eval_loss": 1.6693596839904785, + "eval_runtime": 31.8155, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 280 + }, + { + "epoch": 0.73, + "learning_rate": 6.336760925449872e-05, + "loss": 1.6238, + "step": 285 + }, + { + "epoch": 0.73, + "eval_loss": 1.669128179550171, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 285 + }, + { + "epoch": 0.75, + "learning_rate": 6.272493573264781e-05, + "loss": 1.6538, + "step": 290 + }, + { + "epoch": 0.75, + "eval_loss": 1.6691263914108276, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 290 + }, + { + "epoch": 0.76, + "learning_rate": 6.208226221079692e-05, + "loss": 1.656, + "step": 295 + }, + { + "epoch": 0.76, + "eval_loss": 1.66847825050354, + "eval_runtime": 31.7918, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 295 + }, + { + "epoch": 0.77, + "learning_rate": 6.143958868894601e-05, + "loss": 1.6407, + "step": 300 + }, + { + "epoch": 0.77, + "eval_loss": 1.6683858633041382, + "eval_runtime": 31.8114, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 300 + }, + { + "epoch": 0.78, + "learning_rate": 6.079691516709511e-05, + "loss": 1.6468, + "step": 305 + }, + { + "epoch": 0.78, + "eval_loss": 1.6678352355957031, + "eval_runtime": 31.7975, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 305 + }, + { + "epoch": 0.8, + "learning_rate": 6.015424164524421e-05, + "loss": 1.6579, + "step": 310 + }, + { + "epoch": 0.8, + "eval_loss": 1.6675294637680054, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 310 + }, + { + "epoch": 0.81, + "learning_rate": 5.951156812339333e-05, + "loss": 1.6331, + "step": 315 + }, + { + "epoch": 0.81, + "eval_loss": 1.6670397520065308, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 315 + }, + { + "epoch": 0.82, + "learning_rate": 5.886889460154242e-05, + "loss": 1.6634, + "step": 320 + }, + { + "epoch": 0.82, + "eval_loss": 1.6668330430984497, + "eval_runtime": 31.807, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 320 + }, + { + "epoch": 0.84, + "learning_rate": 5.822622107969152e-05, + "loss": 1.6406, + "step": 325 + }, + { + "epoch": 0.84, + "eval_loss": 1.6668546199798584, + "eval_runtime": 31.8004, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 325 + }, + { + "epoch": 0.85, + "learning_rate": 5.758354755784062e-05, + "loss": 1.6614, + "step": 330 + }, + { + "epoch": 0.85, + "eval_loss": 1.666812539100647, + "eval_runtime": 31.8023, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 330 + }, + { + "epoch": 0.86, + "learning_rate": 5.694087403598972e-05, + "loss": 1.6598, + "step": 335 + }, + { + "epoch": 0.86, + "eval_loss": 1.666398048400879, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 335 + }, + { + "epoch": 0.87, + "learning_rate": 5.6298200514138824e-05, + "loss": 1.6698, + "step": 340 + }, + { + "epoch": 0.87, + "eval_loss": 1.6660391092300415, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 340 + }, + { + "epoch": 0.89, + "learning_rate": 5.5655526992287924e-05, + "loss": 1.6346, + "step": 345 + }, + { + "epoch": 0.89, + "eval_loss": 1.665901780128479, + "eval_runtime": 31.7878, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 345 + }, + { + "epoch": 0.9, + "learning_rate": 5.501285347043702e-05, + "loss": 1.6617, + "step": 350 + }, + { + "epoch": 0.9, + "eval_loss": 1.666027307510376, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 350 + }, + { + "epoch": 0.91, + "learning_rate": 5.437017994858612e-05, + "loss": 1.6623, + "step": 355 + }, + { + "epoch": 0.91, + "eval_loss": 1.6658612489700317, + "eval_runtime": 31.8128, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 355 + }, + { + "epoch": 0.93, + "learning_rate": 5.372750642673522e-05, + "loss": 1.6737, + "step": 360 + }, + { + "epoch": 0.93, + "eval_loss": 1.6656837463378906, + "eval_runtime": 31.8094, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 360 + }, + { + "epoch": 0.94, + "learning_rate": 5.308483290488432e-05, + "loss": 1.6981, + "step": 365 + }, + { + "epoch": 0.94, + "eval_loss": 1.6653661727905273, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 365 + }, + { + "epoch": 0.95, + "learning_rate": 5.244215938303342e-05, + "loss": 1.6832, + "step": 370 + }, + { + "epoch": 0.95, + "eval_loss": 1.6652257442474365, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 370 + }, + { + "epoch": 0.96, + "learning_rate": 5.1799485861182514e-05, + "loss": 1.6925, + "step": 375 + }, + { + "epoch": 0.96, + "eval_loss": 1.664839267730713, + "eval_runtime": 31.8082, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 375 + }, + { + "epoch": 0.98, + "learning_rate": 5.1156812339331615e-05, + "loss": 1.6721, + "step": 380 + }, + { + "epoch": 0.98, + "eval_loss": 1.6642415523529053, + "eval_runtime": 31.7927, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 380 + }, + { + "epoch": 0.99, + "learning_rate": 5.051413881748073e-05, + "loss": 1.6208, + "step": 385 + }, + { + "epoch": 0.99, + "eval_loss": 1.6641273498535156, + "eval_runtime": 31.8006, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 385 + }, + { + "epoch": 1.0, + "learning_rate": 4.987146529562982e-05, + "loss": 1.6519, + "step": 390 + }, + { + "epoch": 1.0, + "eval_loss": 1.6645455360412598, + "eval_runtime": 31.8026, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 390 + }, + { + "epoch": 1.02, + "learning_rate": 4.922879177377892e-05, + "loss": 1.6581, + "step": 395 + }, + { + "epoch": 1.02, + "eval_loss": 1.6641591787338257, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 395 + }, + { + "epoch": 1.03, + "learning_rate": 4.8586118251928024e-05, + "loss": 1.6768, + "step": 400 + }, + { + "epoch": 1.03, + "eval_loss": 1.6637850999832153, + "eval_runtime": 31.806, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 400 + }, + { + "epoch": 1.04, + "learning_rate": 4.7943444730077124e-05, + "loss": 1.6257, + "step": 405 + }, + { + "epoch": 1.04, + "eval_loss": 1.6636526584625244, + "eval_runtime": 31.8116, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 405 + }, + { + "epoch": 1.05, + "learning_rate": 4.7300771208226225e-05, + "loss": 1.6522, + "step": 410 + }, + { + "epoch": 1.05, + "eval_loss": 1.6634690761566162, + "eval_runtime": 31.8063, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 410 + }, + { + "epoch": 1.07, + "learning_rate": 4.6658097686375325e-05, + "loss": 1.615, + "step": 415 + }, + { + "epoch": 1.07, + "eval_loss": 1.6637413501739502, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 415 + }, + { + "epoch": 1.08, + "learning_rate": 4.6015424164524426e-05, + "loss": 1.5967, + "step": 420 + }, + { + "epoch": 1.08, + "eval_loss": 1.6633188724517822, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 420 + }, + { + "epoch": 1.09, + "learning_rate": 4.537275064267352e-05, + "loss": 1.6708, + "step": 425 + }, + { + "epoch": 1.09, + "eval_loss": 1.6634286642074585, + "eval_runtime": 31.8074, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 425 + }, + { + "epoch": 1.11, + "learning_rate": 4.473007712082262e-05, + "loss": 1.6696, + "step": 430 + }, + { + "epoch": 1.11, + "eval_loss": 1.663163661956787, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 430 + }, + { + "epoch": 1.12, + "learning_rate": 4.408740359897173e-05, + "loss": 1.6344, + "step": 435 + }, + { + "epoch": 1.12, + "eval_loss": 1.6632493734359741, + "eval_runtime": 31.8032, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 435 + }, + { + "epoch": 1.13, + "learning_rate": 4.344473007712083e-05, + "loss": 1.5922, + "step": 440 + }, + { + "epoch": 1.13, + "eval_loss": 1.6627737283706665, + "eval_runtime": 31.8064, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 440 + }, + { + "epoch": 1.14, + "learning_rate": 4.280205655526993e-05, + "loss": 1.6541, + "step": 445 + }, + { + "epoch": 1.14, + "eval_loss": 1.6630815267562866, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 445 + }, + { + "epoch": 1.16, + "learning_rate": 4.215938303341902e-05, + "loss": 1.6348, + "step": 450 + }, + { + "epoch": 1.16, + "eval_loss": 1.662713885307312, + "eval_runtime": 31.7977, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 450 + }, + { + "epoch": 1.17, + "learning_rate": 4.151670951156812e-05, + "loss": 1.6444, + "step": 455 + }, + { + "epoch": 1.17, + "eval_loss": 1.6629937887191772, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 455 + }, + { + "epoch": 1.18, + "learning_rate": 4.0874035989717224e-05, + "loss": 1.6629, + "step": 460 + }, + { + "epoch": 1.18, + "eval_loss": 1.6626592874526978, + "eval_runtime": 31.8136, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 460 + }, + { + "epoch": 1.2, + "learning_rate": 4.0231362467866324e-05, + "loss": 1.6327, + "step": 465 + }, + { + "epoch": 1.2, + "eval_loss": 1.6625391244888306, + "eval_runtime": 31.802, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 465 + }, + { + "epoch": 1.21, + "learning_rate": 3.958868894601543e-05, + "loss": 1.6236, + "step": 470 + }, + { + "epoch": 1.21, + "eval_loss": 1.6622934341430664, + "eval_runtime": 31.8055, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 470 + }, + { + "epoch": 1.22, + "learning_rate": 3.8946015424164526e-05, + "loss": 1.6232, + "step": 475 + }, + { + "epoch": 1.22, + "eval_loss": 1.661791443824768, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 475 + }, + { + "epoch": 1.23, + "learning_rate": 3.8303341902313626e-05, + "loss": 1.6374, + "step": 480 + }, + { + "epoch": 1.23, + "eval_loss": 1.6618887186050415, + "eval_runtime": 31.7892, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 480 + }, + { + "epoch": 1.25, + "learning_rate": 3.766066838046273e-05, + "loss": 1.6422, + "step": 485 + }, + { + "epoch": 1.25, + "eval_loss": 1.6620476245880127, + "eval_runtime": 31.7974, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 485 + }, + { + "epoch": 1.26, + "learning_rate": 3.701799485861183e-05, + "loss": 1.631, + "step": 490 + }, + { + "epoch": 1.26, + "eval_loss": 1.6622774600982666, + "eval_runtime": 31.7978, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 490 + }, + { + "epoch": 1.27, + "learning_rate": 3.637532133676093e-05, + "loss": 1.609, + "step": 495 + }, + { + "epoch": 1.27, + "eval_loss": 1.661568284034729, + "eval_runtime": 31.8034, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 495 + }, + { + "epoch": 1.29, + "learning_rate": 3.573264781491003e-05, + "loss": 1.6444, + "step": 500 + }, + { + "epoch": 1.29, + "eval_loss": 1.6610949039459229, + "eval_runtime": 31.8005, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 500 + }, + { + "epoch": 1.3, + "learning_rate": 3.508997429305913e-05, + "loss": 1.6387, + "step": 505 + }, + { + "epoch": 1.3, + "eval_loss": 1.6611486673355103, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 505 + }, + { + "epoch": 1.31, + "learning_rate": 3.444730077120823e-05, + "loss": 1.6507, + "step": 510 + }, + { + "epoch": 1.31, + "eval_loss": 1.661084532737732, + "eval_runtime": 31.8028, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 510 + }, + { + "epoch": 1.32, + "learning_rate": 3.380462724935733e-05, + "loss": 1.6377, + "step": 515 + }, + { + "epoch": 1.32, + "eval_loss": 1.6610103845596313, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 515 + }, + { + "epoch": 1.34, + "learning_rate": 3.316195372750643e-05, + "loss": 1.6351, + "step": 520 + }, + { + "epoch": 1.34, + "eval_loss": 1.6608690023422241, + "eval_runtime": 31.7986, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 520 + }, + { + "epoch": 1.35, + "learning_rate": 3.251928020565553e-05, + "loss": 1.6196, + "step": 525 + }, + { + "epoch": 1.35, + "eval_loss": 1.6606587171554565, + "eval_runtime": 31.8015, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 525 + }, + { + "epoch": 1.36, + "learning_rate": 3.1876606683804625e-05, + "loss": 1.646, + "step": 530 + }, + { + "epoch": 1.36, + "eval_loss": 1.6605886220932007, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 530 + }, + { + "epoch": 1.38, + "learning_rate": 3.1233933161953726e-05, + "loss": 1.6824, + "step": 535 + }, + { + "epoch": 1.38, + "eval_loss": 1.6603879928588867, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 535 + }, + { + "epoch": 1.39, + "learning_rate": 3.059125964010283e-05, + "loss": 1.6115, + "step": 540 + }, + { + "epoch": 1.39, + "eval_loss": 1.660337209701538, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 540 + }, + { + "epoch": 1.4, + "learning_rate": 2.994858611825193e-05, + "loss": 1.6243, + "step": 545 + }, + { + "epoch": 1.4, + "eval_loss": 1.6606225967407227, + "eval_runtime": 31.808, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 545 + }, + { + "epoch": 1.41, + "learning_rate": 2.930591259640103e-05, + "loss": 1.6486, + "step": 550 + }, + { + "epoch": 1.41, + "eval_loss": 1.6604374647140503, + "eval_runtime": 31.8072, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 550 + }, + { + "epoch": 1.43, + "learning_rate": 2.866323907455013e-05, + "loss": 1.6865, + "step": 555 + }, + { + "epoch": 1.43, + "eval_loss": 1.6601544618606567, + "eval_runtime": 31.8069, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 555 + }, + { + "epoch": 1.44, + "learning_rate": 2.802056555269923e-05, + "loss": 1.6616, + "step": 560 + }, + { + "epoch": 1.44, + "eval_loss": 1.660386562347412, + "eval_runtime": 31.803, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 560 + }, + { + "epoch": 1.45, + "learning_rate": 2.737789203084833e-05, + "loss": 1.6336, + "step": 565 + }, + { + "epoch": 1.45, + "eval_loss": 1.660247802734375, + "eval_runtime": 31.8084, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 565 + }, + { + "epoch": 1.47, + "learning_rate": 2.673521850899743e-05, + "loss": 1.6699, + "step": 570 + }, + { + "epoch": 1.47, + "eval_loss": 1.660292148590088, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 570 + }, + { + "epoch": 1.48, + "learning_rate": 2.6092544987146534e-05, + "loss": 1.6472, + "step": 575 + }, + { + "epoch": 1.48, + "eval_loss": 1.6598683595657349, + "eval_runtime": 31.8181, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 575 + }, + { + "epoch": 1.49, + "learning_rate": 2.5449871465295634e-05, + "loss": 1.6723, + "step": 580 + }, + { + "epoch": 1.49, + "eval_loss": 1.6599065065383911, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 580 + }, + { + "epoch": 1.5, + "learning_rate": 2.480719794344473e-05, + "loss": 1.663, + "step": 585 + }, + { + "epoch": 1.5, + "eval_loss": 1.6599762439727783, + "eval_runtime": 31.8175, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 585 + }, + { + "epoch": 1.52, + "learning_rate": 2.4164524421593832e-05, + "loss": 1.6519, + "step": 590 + }, + { + "epoch": 1.52, + "eval_loss": 1.6599925756454468, + "eval_runtime": 31.8209, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 590 + }, + { + "epoch": 1.53, + "learning_rate": 2.3521850899742933e-05, + "loss": 1.6345, + "step": 595 + }, + { + "epoch": 1.53, + "eval_loss": 1.6600078344345093, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 595 + }, + { + "epoch": 1.54, + "learning_rate": 2.2879177377892033e-05, + "loss": 1.654, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 1.6599884033203125, + "eval_runtime": 31.8281, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 600 + }, + { + "epoch": 1.56, + "learning_rate": 2.2236503856041134e-05, + "loss": 1.691, + "step": 605 + }, + { + "epoch": 1.56, + "eval_loss": 1.6599520444869995, + "eval_runtime": 31.8168, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 605 + }, + { + "epoch": 1.57, + "learning_rate": 2.159383033419023e-05, + "loss": 1.6714, + "step": 610 + }, + { + "epoch": 1.57, + "eval_loss": 1.659857988357544, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 610 + }, + { + "epoch": 1.58, + "learning_rate": 2.095115681233933e-05, + "loss": 1.5977, + "step": 615 + }, + { + "epoch": 1.58, + "eval_loss": 1.6596992015838623, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 615 + }, + { + "epoch": 1.59, + "learning_rate": 2.0308483290488432e-05, + "loss": 1.6429, + "step": 620 + }, + { + "epoch": 1.59, + "eval_loss": 1.6595497131347656, + "eval_runtime": 31.8151, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 620 + }, + { + "epoch": 1.61, + "learning_rate": 1.9665809768637533e-05, + "loss": 1.6091, + "step": 625 + }, + { + "epoch": 1.61, + "eval_loss": 1.6597559452056885, + "eval_runtime": 31.8076, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 625 + }, + { + "epoch": 1.62, + "learning_rate": 1.9023136246786633e-05, + "loss": 1.5956, + "step": 630 + }, + { + "epoch": 1.62, + "eval_loss": 1.6593599319458008, + "eval_runtime": 31.8058, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 630 + }, + { + "epoch": 1.63, + "learning_rate": 1.8380462724935734e-05, + "loss": 1.6697, + "step": 635 + }, + { + "epoch": 1.63, + "eval_loss": 1.6592466831207275, + "eval_runtime": 31.8589, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 635 + }, + { + "epoch": 1.65, + "learning_rate": 1.7737789203084834e-05, + "loss": 1.6535, + "step": 640 + }, + { + "epoch": 1.65, + "eval_loss": 1.6592165231704712, + "eval_runtime": 31.8225, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 640 + }, + { + "epoch": 1.66, + "learning_rate": 1.7095115681233935e-05, + "loss": 1.6722, + "step": 645 + }, + { + "epoch": 1.66, + "eval_loss": 1.659324049949646, + "eval_runtime": 31.8388, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 645 + }, + { + "epoch": 1.67, + "learning_rate": 1.6452442159383032e-05, + "loss": 1.6347, + "step": 650 + }, + { + "epoch": 1.67, + "eval_loss": 1.6590396165847778, + "eval_runtime": 31.8174, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 650 + }, + { + "epoch": 1.68, + "learning_rate": 1.5809768637532136e-05, + "loss": 1.6223, + "step": 655 + }, + { + "epoch": 1.68, + "eval_loss": 1.658969521522522, + "eval_runtime": 31.8275, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 655 + }, + { + "epoch": 1.7, + "learning_rate": 1.5167095115681235e-05, + "loss": 1.6309, + "step": 660 + }, + { + "epoch": 1.7, + "eval_loss": 1.6587274074554443, + "eval_runtime": 31.8146, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 660 + }, + { + "epoch": 1.71, + "learning_rate": 1.4524421593830334e-05, + "loss": 1.6499, + "step": 665 + }, + { + "epoch": 1.71, + "eval_loss": 1.65865957736969, + "eval_runtime": 31.8091, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 665 + }, + { + "epoch": 1.72, + "learning_rate": 1.3881748071979436e-05, + "loss": 1.6713, + "step": 670 + }, + { + "epoch": 1.72, + "eval_loss": 1.6586016416549683, + "eval_runtime": 31.8229, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 670 + }, + { + "epoch": 1.74, + "learning_rate": 1.3239074550128535e-05, + "loss": 1.6348, + "step": 675 + }, + { + "epoch": 1.74, + "eval_loss": 1.6584962606430054, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 675 + }, + { + "epoch": 1.75, + "learning_rate": 1.2596401028277636e-05, + "loss": 1.6447, + "step": 680 + }, + { + "epoch": 1.75, + "eval_loss": 1.6585614681243896, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 680 + }, + { + "epoch": 1.76, + "learning_rate": 1.1953727506426736e-05, + "loss": 1.6511, + "step": 685 + }, + { + "epoch": 1.76, + "eval_loss": 1.6583601236343384, + "eval_runtime": 31.8125, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 685 + }, + { + "epoch": 1.77, + "learning_rate": 1.1311053984575835e-05, + "loss": 1.6496, + "step": 690 + }, + { + "epoch": 1.77, + "eval_loss": 1.6583328247070312, + "eval_runtime": 31.8179, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 690 + }, + { + "epoch": 1.79, + "learning_rate": 1.0668380462724936e-05, + "loss": 1.6421, + "step": 695 + }, + { + "epoch": 1.79, + "eval_loss": 1.6583188772201538, + "eval_runtime": 31.8165, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 695 + }, + { + "epoch": 1.8, + "learning_rate": 1.0025706940874038e-05, + "loss": 1.6126, + "step": 700 + }, + { + "epoch": 1.8, + "eval_loss": 1.6580705642700195, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 700 + }, + { + "epoch": 1.81, + "learning_rate": 9.383033419023137e-06, + "loss": 1.6226, + "step": 705 + }, + { + "epoch": 1.81, + "eval_loss": 1.6584465503692627, + "eval_runtime": 31.816, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 705 + }, + { + "epoch": 1.83, + "learning_rate": 8.740359897172237e-06, + "loss": 1.6923, + "step": 710 + }, + { + "epoch": 1.83, + "eval_loss": 1.658342957496643, + "eval_runtime": 31.8215, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 710 + }, + { + "epoch": 1.84, + "learning_rate": 8.097686375321336e-06, + "loss": 1.6224, + "step": 715 + }, + { + "epoch": 1.84, + "eval_loss": 1.6582502126693726, + "eval_runtime": 31.812, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 715 + }, + { + "epoch": 1.85, + "learning_rate": 7.4550128534704376e-06, + "loss": 1.6587, + "step": 720 + }, + { + "epoch": 1.85, + "eval_loss": 1.658232569694519, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 720 + }, + { + "epoch": 1.86, + "learning_rate": 6.812339331619537e-06, + "loss": 1.6641, + "step": 725 + }, + { + "epoch": 1.86, + "eval_loss": 1.6582419872283936, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 725 + }, + { + "epoch": 1.88, + "learning_rate": 6.169665809768638e-06, + "loss": 1.6419, + "step": 730 + }, + { + "epoch": 1.88, + "eval_loss": 1.6582107543945312, + "eval_runtime": 31.833, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 730 + }, + { + "epoch": 1.89, + "learning_rate": 5.526992287917738e-06, + "loss": 1.652, + "step": 735 + }, + { + "epoch": 1.89, + "eval_loss": 1.6582270860671997, + "eval_runtime": 31.8801, + "eval_samples_per_second": 2.635, + "eval_steps_per_second": 0.345, + "step": 735 + }, + { + "epoch": 1.9, + "learning_rate": 4.884318766066838e-06, + "loss": 1.6487, + "step": 740 + }, + { + "epoch": 1.9, + "eval_loss": 1.6582915782928467, + "eval_runtime": 31.8265, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 740 + }, + { + "epoch": 1.92, + "learning_rate": 4.241645244215939e-06, + "loss": 1.6245, + "step": 745 + }, + { + "epoch": 1.92, + "eval_loss": 1.658047080039978, + "eval_runtime": 31.8109, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 745 + }, + { + "epoch": 1.93, + "learning_rate": 3.598971722365039e-06, + "loss": 1.6553, + "step": 750 + }, + { + "epoch": 1.93, + "eval_loss": 1.6578683853149414, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 750 + } + ], + "logging_steps": 5, + "max_steps": 778, + "num_train_epochs": 2, + "save_steps": 10, + "total_flos": 2.4364649130491904e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-750/training_args.bin b/checkpoint-750/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48088b6a3851b1352bfa758dd2b3ad558168087f --- /dev/null +++ b/checkpoint-750/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc6a8542abd91027de58ebfccafdf919108680707834fe188d91cae077ef8e9 +size 4600 diff --git a/checkpoint-760/README.md b/checkpoint-760/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-760/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-760/adapter_config.json b/checkpoint-760/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-760/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-760/adapter_model.safetensors b/checkpoint-760/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec3ff1ff24a2d65fbd38637eea3d979bc06c6ca8 --- /dev/null +++ b/checkpoint-760/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36e33012a5495bd07e00b9e59e90574c650664430061690402da200b176d8494 +size 16794200 diff --git a/checkpoint-760/optimizer.pt b/checkpoint-760/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d4a5be0f8a7e810e6ed0500727d4c7729fca773 --- /dev/null +++ b/checkpoint-760/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d459429a5d6f3262e6fbe255a2ee4932b6410a1a80593cdce61cd73243aef6 +size 33663866 diff --git a/checkpoint-760/rng_state.pth b/checkpoint-760/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d579f0c187efedb51372635068d524dd688cf10c --- /dev/null +++ b/checkpoint-760/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6973cbf09e3463c2ccbdf4a6bd0f0551c3955052f339950757f1b93f508f887a +size 14244 diff --git a/checkpoint-760/scheduler.pt b/checkpoint-760/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c66a0293f5ec8f3ca095b7bd6f1bdfe96fd76669 --- /dev/null +++ b/checkpoint-760/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf90aa5789bb0cee3a7c1f3d247be3d2ae5fb54b24e3bbd48be306d070a5211 +size 1064 diff --git a/checkpoint-760/trainer_state.json b/checkpoint-760/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e7f8564f2e28235ad8c24fff50f61d6a05714565 --- /dev/null +++ b/checkpoint-760/trainer_state.json @@ -0,0 +1,2147 @@ +{ + "best_metric": 1.6578683853149414, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-750", + "epoch": 1.9537275064267352, + "eval_steps": 5, + "global_step": 760, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.93573264781491e-05, + "loss": 1.9824, + "step": 5 + }, + { + "epoch": 0.01, + "eval_loss": 1.954728126525879, + "eval_runtime": 31.6803, + "eval_samples_per_second": 2.651, + "eval_steps_per_second": 0.347, + "step": 5 + }, + { + "epoch": 0.03, + "learning_rate": 9.87146529562982e-05, + "loss": 1.9249, + "step": 10 + }, + { + "epoch": 0.03, + "eval_loss": 1.8934264183044434, + "eval_runtime": 31.7798, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 9.80719794344473e-05, + "loss": 1.8609, + "step": 15 + }, + { + "epoch": 0.04, + "eval_loss": 1.8421852588653564, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 9.742930591259641e-05, + "loss": 1.8268, + "step": 20 + }, + { + "epoch": 0.05, + "eval_loss": 1.8279638290405273, + "eval_runtime": 31.8095, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 20 + }, + { + "epoch": 0.06, + "learning_rate": 9.67866323907455e-05, + "loss": 1.8349, + "step": 25 + }, + { + "epoch": 0.06, + "eval_loss": 1.813267469406128, + "eval_runtime": 31.8291, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 9.61439588688946e-05, + "loss": 1.8239, + "step": 30 + }, + { + "epoch": 0.08, + "eval_loss": 1.8008779287338257, + "eval_runtime": 31.82, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 30 + }, + { + "epoch": 0.09, + "learning_rate": 9.550128534704372e-05, + "loss": 1.7177, + "step": 35 + }, + { + "epoch": 0.09, + "eval_loss": 1.789797067642212, + "eval_runtime": 31.8423, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 35 + }, + { + "epoch": 0.1, + "learning_rate": 9.485861182519281e-05, + "loss": 1.7962, + "step": 40 + }, + { + "epoch": 0.1, + "eval_loss": 1.7808287143707275, + "eval_runtime": 31.8387, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 40 + }, + { + "epoch": 0.12, + "learning_rate": 9.421593830334192e-05, + "loss": 1.715, + "step": 45 + }, + { + "epoch": 0.12, + "eval_loss": 1.771236777305603, + "eval_runtime": 31.8526, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 45 + }, + { + "epoch": 0.13, + "learning_rate": 9.357326478149101e-05, + "loss": 1.7577, + "step": 50 + }, + { + "epoch": 0.13, + "eval_loss": 1.7619521617889404, + "eval_runtime": 31.828, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 50 + }, + { + "epoch": 0.14, + "learning_rate": 9.29305912596401e-05, + "loss": 1.7323, + "step": 55 + }, + { + "epoch": 0.14, + "eval_loss": 1.7440986633300781, + "eval_runtime": 31.8301, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 55 + }, + { + "epoch": 0.15, + "learning_rate": 9.228791773778921e-05, + "loss": 1.7122, + "step": 60 + }, + { + "epoch": 0.15, + "eval_loss": 1.712996006011963, + "eval_runtime": 31.8185, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 60 + }, + { + "epoch": 0.17, + "learning_rate": 9.16452442159383e-05, + "loss": 1.7042, + "step": 65 + }, + { + "epoch": 0.17, + "eval_loss": 1.7101707458496094, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 65 + }, + { + "epoch": 0.18, + "learning_rate": 9.100257069408741e-05, + "loss": 1.7242, + "step": 70 + }, + { + "epoch": 0.18, + "eval_loss": 1.7038702964782715, + "eval_runtime": 31.8191, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 70 + }, + { + "epoch": 0.19, + "learning_rate": 9.03598971722365e-05, + "loss": 1.7033, + "step": 75 + }, + { + "epoch": 0.19, + "eval_loss": 1.7004183530807495, + "eval_runtime": 31.811, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 75 + }, + { + "epoch": 0.21, + "learning_rate": 8.97172236503856e-05, + "loss": 1.6934, + "step": 80 + }, + { + "epoch": 0.21, + "eval_loss": 1.6985355615615845, + "eval_runtime": 31.8221, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 80 + }, + { + "epoch": 0.22, + "learning_rate": 8.907455012853471e-05, + "loss": 1.6443, + "step": 85 + }, + { + "epoch": 0.22, + "eval_loss": 1.696852684020996, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 85 + }, + { + "epoch": 0.23, + "learning_rate": 8.84318766066838e-05, + "loss": 1.7008, + "step": 90 + }, + { + "epoch": 0.23, + "eval_loss": 1.695594072341919, + "eval_runtime": 31.8205, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 90 + }, + { + "epoch": 0.24, + "learning_rate": 8.778920308483291e-05, + "loss": 1.6873, + "step": 95 + }, + { + "epoch": 0.24, + "eval_loss": 1.6928850412368774, + "eval_runtime": 31.814, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 95 + }, + { + "epoch": 0.26, + "learning_rate": 8.7146529562982e-05, + "loss": 1.6721, + "step": 100 + }, + { + "epoch": 0.26, + "eval_loss": 1.6919386386871338, + "eval_runtime": 31.8127, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 100 + }, + { + "epoch": 0.27, + "learning_rate": 8.650385604113111e-05, + "loss": 1.6669, + "step": 105 + }, + { + "epoch": 0.27, + "eval_loss": 1.6905940771102905, + "eval_runtime": 31.8874, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 0.345, + "step": 105 + }, + { + "epoch": 0.28, + "learning_rate": 8.586118251928022e-05, + "loss": 1.7009, + "step": 110 + }, + { + "epoch": 0.28, + "eval_loss": 1.689422607421875, + "eval_runtime": 31.8489, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 110 + }, + { + "epoch": 0.3, + "learning_rate": 8.521850899742931e-05, + "loss": 1.7151, + "step": 115 + }, + { + "epoch": 0.3, + "eval_loss": 1.688527226448059, + "eval_runtime": 31.8318, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 115 + }, + { + "epoch": 0.31, + "learning_rate": 8.457583547557842e-05, + "loss": 1.7165, + "step": 120 + }, + { + "epoch": 0.31, + "eval_loss": 1.686748743057251, + "eval_runtime": 31.835, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 120 + }, + { + "epoch": 0.32, + "learning_rate": 8.393316195372751e-05, + "loss": 1.7015, + "step": 125 + }, + { + "epoch": 0.32, + "eval_loss": 1.6855812072753906, + "eval_runtime": 31.8269, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 125 + }, + { + "epoch": 0.33, + "learning_rate": 8.32904884318766e-05, + "loss": 1.6818, + "step": 130 + }, + { + "epoch": 0.33, + "eval_loss": 1.6846078634262085, + "eval_runtime": 31.8617, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 130 + }, + { + "epoch": 0.35, + "learning_rate": 8.264781491002571e-05, + "loss": 1.7151, + "step": 135 + }, + { + "epoch": 0.35, + "eval_loss": 1.6845166683197021, + "eval_runtime": 31.8631, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 135 + }, + { + "epoch": 0.36, + "learning_rate": 8.200514138817481e-05, + "loss": 1.6667, + "step": 140 + }, + { + "epoch": 0.36, + "eval_loss": 1.6832246780395508, + "eval_runtime": 31.8276, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 140 + }, + { + "epoch": 0.37, + "learning_rate": 8.136246786632391e-05, + "loss": 1.6586, + "step": 145 + }, + { + "epoch": 0.37, + "eval_loss": 1.6824573278427124, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 145 + }, + { + "epoch": 0.39, + "learning_rate": 8.071979434447301e-05, + "loss": 1.6999, + "step": 150 + }, + { + "epoch": 0.39, + "eval_loss": 1.681317687034607, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 150 + }, + { + "epoch": 0.4, + "learning_rate": 8.007712082262212e-05, + "loss": 1.6894, + "step": 155 + }, + { + "epoch": 0.4, + "eval_loss": 1.6812355518341064, + "eval_runtime": 31.8039, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 155 + }, + { + "epoch": 0.41, + "learning_rate": 7.943444730077121e-05, + "loss": 1.6669, + "step": 160 + }, + { + "epoch": 0.41, + "eval_loss": 1.679802656173706, + "eval_runtime": 31.8021, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 160 + }, + { + "epoch": 0.42, + "learning_rate": 7.87917737789203e-05, + "loss": 1.6481, + "step": 165 + }, + { + "epoch": 0.42, + "eval_loss": 1.679994821548462, + "eval_runtime": 31.8201, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 165 + }, + { + "epoch": 0.44, + "learning_rate": 7.814910025706941e-05, + "loss": 1.7042, + "step": 170 + }, + { + "epoch": 0.44, + "eval_loss": 1.6786224842071533, + "eval_runtime": 31.8184, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 170 + }, + { + "epoch": 0.45, + "learning_rate": 7.750642673521852e-05, + "loss": 1.6564, + "step": 175 + }, + { + "epoch": 0.45, + "eval_loss": 1.6783201694488525, + "eval_runtime": 31.8137, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 175 + }, + { + "epoch": 0.46, + "learning_rate": 7.686375321336761e-05, + "loss": 1.6714, + "step": 180 + }, + { + "epoch": 0.46, + "eval_loss": 1.677714228630066, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 180 + }, + { + "epoch": 0.48, + "learning_rate": 7.622107969151672e-05, + "loss": 1.6705, + "step": 185 + }, + { + "epoch": 0.48, + "eval_loss": 1.6772773265838623, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 185 + }, + { + "epoch": 0.49, + "learning_rate": 7.557840616966581e-05, + "loss": 1.6624, + "step": 190 + }, + { + "epoch": 0.49, + "eval_loss": 1.6766114234924316, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 190 + }, + { + "epoch": 0.5, + "learning_rate": 7.493573264781492e-05, + "loss": 1.6415, + "step": 195 + }, + { + "epoch": 0.5, + "eval_loss": 1.675940752029419, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 195 + }, + { + "epoch": 0.51, + "learning_rate": 7.429305912596401e-05, + "loss": 1.6633, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 1.6751865148544312, + "eval_runtime": 31.8123, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 200 + }, + { + "epoch": 0.53, + "learning_rate": 7.365038560411311e-05, + "loss": 1.6142, + "step": 205 + }, + { + "epoch": 0.53, + "eval_loss": 1.6750575304031372, + "eval_runtime": 31.8049, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 205 + }, + { + "epoch": 0.54, + "learning_rate": 7.300771208226222e-05, + "loss": 1.6736, + "step": 210 + }, + { + "epoch": 0.54, + "eval_loss": 1.6746423244476318, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 210 + }, + { + "epoch": 0.55, + "learning_rate": 7.236503856041131e-05, + "loss": 1.6414, + "step": 215 + }, + { + "epoch": 0.55, + "eval_loss": 1.6740491390228271, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 215 + }, + { + "epoch": 0.57, + "learning_rate": 7.172236503856042e-05, + "loss": 1.6725, + "step": 220 + }, + { + "epoch": 0.57, + "eval_loss": 1.673694372177124, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 220 + }, + { + "epoch": 0.58, + "learning_rate": 7.107969151670951e-05, + "loss": 1.679, + "step": 225 + }, + { + "epoch": 0.58, + "eval_loss": 1.6733818054199219, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 225 + }, + { + "epoch": 0.59, + "learning_rate": 7.043701799485862e-05, + "loss": 1.7204, + "step": 230 + }, + { + "epoch": 0.59, + "eval_loss": 1.6725033521652222, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 230 + }, + { + "epoch": 0.6, + "learning_rate": 6.979434447300771e-05, + "loss": 1.6816, + "step": 235 + }, + { + "epoch": 0.6, + "eval_loss": 1.67205011844635, + "eval_runtime": 31.809, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 235 + }, + { + "epoch": 0.62, + "learning_rate": 6.91516709511568e-05, + "loss": 1.634, + "step": 240 + }, + { + "epoch": 0.62, + "eval_loss": 1.6717994213104248, + "eval_runtime": 31.8075, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 240 + }, + { + "epoch": 0.63, + "learning_rate": 6.850899742930593e-05, + "loss": 1.6761, + "step": 245 + }, + { + "epoch": 0.63, + "eval_loss": 1.6714757680892944, + "eval_runtime": 31.8052, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 245 + }, + { + "epoch": 0.64, + "learning_rate": 6.786632390745502e-05, + "loss": 1.6996, + "step": 250 + }, + { + "epoch": 0.64, + "eval_loss": 1.671559453010559, + "eval_runtime": 31.8089, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 250 + }, + { + "epoch": 0.66, + "learning_rate": 6.722365038560411e-05, + "loss": 1.6302, + "step": 255 + }, + { + "epoch": 0.66, + "eval_loss": 1.6711652278900146, + "eval_runtime": 31.8105, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 255 + }, + { + "epoch": 0.67, + "learning_rate": 6.658097686375322e-05, + "loss": 1.6611, + "step": 260 + }, + { + "epoch": 0.67, + "eval_loss": 1.6704061031341553, + "eval_runtime": 31.8135, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 260 + }, + { + "epoch": 0.68, + "learning_rate": 6.593830334190231e-05, + "loss": 1.6586, + "step": 265 + }, + { + "epoch": 0.68, + "eval_loss": 1.6705658435821533, + "eval_runtime": 31.7998, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 265 + }, + { + "epoch": 0.69, + "learning_rate": 6.529562982005142e-05, + "loss": 1.6838, + "step": 270 + }, + { + "epoch": 0.69, + "eval_loss": 1.6697953939437866, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 270 + }, + { + "epoch": 0.71, + "learning_rate": 6.465295629820052e-05, + "loss": 1.6499, + "step": 275 + }, + { + "epoch": 0.71, + "eval_loss": 1.669343113899231, + "eval_runtime": 31.8147, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 275 + }, + { + "epoch": 0.72, + "learning_rate": 6.401028277634962e-05, + "loss": 1.6424, + "step": 280 + }, + { + "epoch": 0.72, + "eval_loss": 1.6693596839904785, + "eval_runtime": 31.8155, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 280 + }, + { + "epoch": 0.73, + "learning_rate": 6.336760925449872e-05, + "loss": 1.6238, + "step": 285 + }, + { + "epoch": 0.73, + "eval_loss": 1.669128179550171, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 285 + }, + { + "epoch": 0.75, + "learning_rate": 6.272493573264781e-05, + "loss": 1.6538, + "step": 290 + }, + { + "epoch": 0.75, + "eval_loss": 1.6691263914108276, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 290 + }, + { + "epoch": 0.76, + "learning_rate": 6.208226221079692e-05, + "loss": 1.656, + "step": 295 + }, + { + "epoch": 0.76, + "eval_loss": 1.66847825050354, + "eval_runtime": 31.7918, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 295 + }, + { + "epoch": 0.77, + "learning_rate": 6.143958868894601e-05, + "loss": 1.6407, + "step": 300 + }, + { + "epoch": 0.77, + "eval_loss": 1.6683858633041382, + "eval_runtime": 31.8114, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 300 + }, + { + "epoch": 0.78, + "learning_rate": 6.079691516709511e-05, + "loss": 1.6468, + "step": 305 + }, + { + "epoch": 0.78, + "eval_loss": 1.6678352355957031, + "eval_runtime": 31.7975, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 305 + }, + { + "epoch": 0.8, + "learning_rate": 6.015424164524421e-05, + "loss": 1.6579, + "step": 310 + }, + { + "epoch": 0.8, + "eval_loss": 1.6675294637680054, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 310 + }, + { + "epoch": 0.81, + "learning_rate": 5.951156812339333e-05, + "loss": 1.6331, + "step": 315 + }, + { + "epoch": 0.81, + "eval_loss": 1.6670397520065308, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 315 + }, + { + "epoch": 0.82, + "learning_rate": 5.886889460154242e-05, + "loss": 1.6634, + "step": 320 + }, + { + "epoch": 0.82, + "eval_loss": 1.6668330430984497, + "eval_runtime": 31.807, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 320 + }, + { + "epoch": 0.84, + "learning_rate": 5.822622107969152e-05, + "loss": 1.6406, + "step": 325 + }, + { + "epoch": 0.84, + "eval_loss": 1.6668546199798584, + "eval_runtime": 31.8004, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 325 + }, + { + "epoch": 0.85, + "learning_rate": 5.758354755784062e-05, + "loss": 1.6614, + "step": 330 + }, + { + "epoch": 0.85, + "eval_loss": 1.666812539100647, + "eval_runtime": 31.8023, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 330 + }, + { + "epoch": 0.86, + "learning_rate": 5.694087403598972e-05, + "loss": 1.6598, + "step": 335 + }, + { + "epoch": 0.86, + "eval_loss": 1.666398048400879, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 335 + }, + { + "epoch": 0.87, + "learning_rate": 5.6298200514138824e-05, + "loss": 1.6698, + "step": 340 + }, + { + "epoch": 0.87, + "eval_loss": 1.6660391092300415, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 340 + }, + { + "epoch": 0.89, + "learning_rate": 5.5655526992287924e-05, + "loss": 1.6346, + "step": 345 + }, + { + "epoch": 0.89, + "eval_loss": 1.665901780128479, + "eval_runtime": 31.7878, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 345 + }, + { + "epoch": 0.9, + "learning_rate": 5.501285347043702e-05, + "loss": 1.6617, + "step": 350 + }, + { + "epoch": 0.9, + "eval_loss": 1.666027307510376, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 350 + }, + { + "epoch": 0.91, + "learning_rate": 5.437017994858612e-05, + "loss": 1.6623, + "step": 355 + }, + { + "epoch": 0.91, + "eval_loss": 1.6658612489700317, + "eval_runtime": 31.8128, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 355 + }, + { + "epoch": 0.93, + "learning_rate": 5.372750642673522e-05, + "loss": 1.6737, + "step": 360 + }, + { + "epoch": 0.93, + "eval_loss": 1.6656837463378906, + "eval_runtime": 31.8094, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 360 + }, + { + "epoch": 0.94, + "learning_rate": 5.308483290488432e-05, + "loss": 1.6981, + "step": 365 + }, + { + "epoch": 0.94, + "eval_loss": 1.6653661727905273, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 365 + }, + { + "epoch": 0.95, + "learning_rate": 5.244215938303342e-05, + "loss": 1.6832, + "step": 370 + }, + { + "epoch": 0.95, + "eval_loss": 1.6652257442474365, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 370 + }, + { + "epoch": 0.96, + "learning_rate": 5.1799485861182514e-05, + "loss": 1.6925, + "step": 375 + }, + { + "epoch": 0.96, + "eval_loss": 1.664839267730713, + "eval_runtime": 31.8082, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 375 + }, + { + "epoch": 0.98, + "learning_rate": 5.1156812339331615e-05, + "loss": 1.6721, + "step": 380 + }, + { + "epoch": 0.98, + "eval_loss": 1.6642415523529053, + "eval_runtime": 31.7927, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 380 + }, + { + "epoch": 0.99, + "learning_rate": 5.051413881748073e-05, + "loss": 1.6208, + "step": 385 + }, + { + "epoch": 0.99, + "eval_loss": 1.6641273498535156, + "eval_runtime": 31.8006, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 385 + }, + { + "epoch": 1.0, + "learning_rate": 4.987146529562982e-05, + "loss": 1.6519, + "step": 390 + }, + { + "epoch": 1.0, + "eval_loss": 1.6645455360412598, + "eval_runtime": 31.8026, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 390 + }, + { + "epoch": 1.02, + "learning_rate": 4.922879177377892e-05, + "loss": 1.6581, + "step": 395 + }, + { + "epoch": 1.02, + "eval_loss": 1.6641591787338257, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 395 + }, + { + "epoch": 1.03, + "learning_rate": 4.8586118251928024e-05, + "loss": 1.6768, + "step": 400 + }, + { + "epoch": 1.03, + "eval_loss": 1.6637850999832153, + "eval_runtime": 31.806, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 400 + }, + { + "epoch": 1.04, + "learning_rate": 4.7943444730077124e-05, + "loss": 1.6257, + "step": 405 + }, + { + "epoch": 1.04, + "eval_loss": 1.6636526584625244, + "eval_runtime": 31.8116, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 405 + }, + { + "epoch": 1.05, + "learning_rate": 4.7300771208226225e-05, + "loss": 1.6522, + "step": 410 + }, + { + "epoch": 1.05, + "eval_loss": 1.6634690761566162, + "eval_runtime": 31.8063, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 410 + }, + { + "epoch": 1.07, + "learning_rate": 4.6658097686375325e-05, + "loss": 1.615, + "step": 415 + }, + { + "epoch": 1.07, + "eval_loss": 1.6637413501739502, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 415 + }, + { + "epoch": 1.08, + "learning_rate": 4.6015424164524426e-05, + "loss": 1.5967, + "step": 420 + }, + { + "epoch": 1.08, + "eval_loss": 1.6633188724517822, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 420 + }, + { + "epoch": 1.09, + "learning_rate": 4.537275064267352e-05, + "loss": 1.6708, + "step": 425 + }, + { + "epoch": 1.09, + "eval_loss": 1.6634286642074585, + "eval_runtime": 31.8074, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 425 + }, + { + "epoch": 1.11, + "learning_rate": 4.473007712082262e-05, + "loss": 1.6696, + "step": 430 + }, + { + "epoch": 1.11, + "eval_loss": 1.663163661956787, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 430 + }, + { + "epoch": 1.12, + "learning_rate": 4.408740359897173e-05, + "loss": 1.6344, + "step": 435 + }, + { + "epoch": 1.12, + "eval_loss": 1.6632493734359741, + "eval_runtime": 31.8032, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 435 + }, + { + "epoch": 1.13, + "learning_rate": 4.344473007712083e-05, + "loss": 1.5922, + "step": 440 + }, + { + "epoch": 1.13, + "eval_loss": 1.6627737283706665, + "eval_runtime": 31.8064, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 440 + }, + { + "epoch": 1.14, + "learning_rate": 4.280205655526993e-05, + "loss": 1.6541, + "step": 445 + }, + { + "epoch": 1.14, + "eval_loss": 1.6630815267562866, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 445 + }, + { + "epoch": 1.16, + "learning_rate": 4.215938303341902e-05, + "loss": 1.6348, + "step": 450 + }, + { + "epoch": 1.16, + "eval_loss": 1.662713885307312, + "eval_runtime": 31.7977, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 450 + }, + { + "epoch": 1.17, + "learning_rate": 4.151670951156812e-05, + "loss": 1.6444, + "step": 455 + }, + { + "epoch": 1.17, + "eval_loss": 1.6629937887191772, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 455 + }, + { + "epoch": 1.18, + "learning_rate": 4.0874035989717224e-05, + "loss": 1.6629, + "step": 460 + }, + { + "epoch": 1.18, + "eval_loss": 1.6626592874526978, + "eval_runtime": 31.8136, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 460 + }, + { + "epoch": 1.2, + "learning_rate": 4.0231362467866324e-05, + "loss": 1.6327, + "step": 465 + }, + { + "epoch": 1.2, + "eval_loss": 1.6625391244888306, + "eval_runtime": 31.802, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 465 + }, + { + "epoch": 1.21, + "learning_rate": 3.958868894601543e-05, + "loss": 1.6236, + "step": 470 + }, + { + "epoch": 1.21, + "eval_loss": 1.6622934341430664, + "eval_runtime": 31.8055, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 470 + }, + { + "epoch": 1.22, + "learning_rate": 3.8946015424164526e-05, + "loss": 1.6232, + "step": 475 + }, + { + "epoch": 1.22, + "eval_loss": 1.661791443824768, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 475 + }, + { + "epoch": 1.23, + "learning_rate": 3.8303341902313626e-05, + "loss": 1.6374, + "step": 480 + }, + { + "epoch": 1.23, + "eval_loss": 1.6618887186050415, + "eval_runtime": 31.7892, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 480 + }, + { + "epoch": 1.25, + "learning_rate": 3.766066838046273e-05, + "loss": 1.6422, + "step": 485 + }, + { + "epoch": 1.25, + "eval_loss": 1.6620476245880127, + "eval_runtime": 31.7974, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 485 + }, + { + "epoch": 1.26, + "learning_rate": 3.701799485861183e-05, + "loss": 1.631, + "step": 490 + }, + { + "epoch": 1.26, + "eval_loss": 1.6622774600982666, + "eval_runtime": 31.7978, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 490 + }, + { + "epoch": 1.27, + "learning_rate": 3.637532133676093e-05, + "loss": 1.609, + "step": 495 + }, + { + "epoch": 1.27, + "eval_loss": 1.661568284034729, + "eval_runtime": 31.8034, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 495 + }, + { + "epoch": 1.29, + "learning_rate": 3.573264781491003e-05, + "loss": 1.6444, + "step": 500 + }, + { + "epoch": 1.29, + "eval_loss": 1.6610949039459229, + "eval_runtime": 31.8005, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 500 + }, + { + "epoch": 1.3, + "learning_rate": 3.508997429305913e-05, + "loss": 1.6387, + "step": 505 + }, + { + "epoch": 1.3, + "eval_loss": 1.6611486673355103, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 505 + }, + { + "epoch": 1.31, + "learning_rate": 3.444730077120823e-05, + "loss": 1.6507, + "step": 510 + }, + { + "epoch": 1.31, + "eval_loss": 1.661084532737732, + "eval_runtime": 31.8028, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 510 + }, + { + "epoch": 1.32, + "learning_rate": 3.380462724935733e-05, + "loss": 1.6377, + "step": 515 + }, + { + "epoch": 1.32, + "eval_loss": 1.6610103845596313, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 515 + }, + { + "epoch": 1.34, + "learning_rate": 3.316195372750643e-05, + "loss": 1.6351, + "step": 520 + }, + { + "epoch": 1.34, + "eval_loss": 1.6608690023422241, + "eval_runtime": 31.7986, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 520 + }, + { + "epoch": 1.35, + "learning_rate": 3.251928020565553e-05, + "loss": 1.6196, + "step": 525 + }, + { + "epoch": 1.35, + "eval_loss": 1.6606587171554565, + "eval_runtime": 31.8015, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 525 + }, + { + "epoch": 1.36, + "learning_rate": 3.1876606683804625e-05, + "loss": 1.646, + "step": 530 + }, + { + "epoch": 1.36, + "eval_loss": 1.6605886220932007, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 530 + }, + { + "epoch": 1.38, + "learning_rate": 3.1233933161953726e-05, + "loss": 1.6824, + "step": 535 + }, + { + "epoch": 1.38, + "eval_loss": 1.6603879928588867, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 535 + }, + { + "epoch": 1.39, + "learning_rate": 3.059125964010283e-05, + "loss": 1.6115, + "step": 540 + }, + { + "epoch": 1.39, + "eval_loss": 1.660337209701538, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 540 + }, + { + "epoch": 1.4, + "learning_rate": 2.994858611825193e-05, + "loss": 1.6243, + "step": 545 + }, + { + "epoch": 1.4, + "eval_loss": 1.6606225967407227, + "eval_runtime": 31.808, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 545 + }, + { + "epoch": 1.41, + "learning_rate": 2.930591259640103e-05, + "loss": 1.6486, + "step": 550 + }, + { + "epoch": 1.41, + "eval_loss": 1.6604374647140503, + "eval_runtime": 31.8072, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 550 + }, + { + "epoch": 1.43, + "learning_rate": 2.866323907455013e-05, + "loss": 1.6865, + "step": 555 + }, + { + "epoch": 1.43, + "eval_loss": 1.6601544618606567, + "eval_runtime": 31.8069, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 555 + }, + { + "epoch": 1.44, + "learning_rate": 2.802056555269923e-05, + "loss": 1.6616, + "step": 560 + }, + { + "epoch": 1.44, + "eval_loss": 1.660386562347412, + "eval_runtime": 31.803, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 560 + }, + { + "epoch": 1.45, + "learning_rate": 2.737789203084833e-05, + "loss": 1.6336, + "step": 565 + }, + { + "epoch": 1.45, + "eval_loss": 1.660247802734375, + "eval_runtime": 31.8084, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 565 + }, + { + "epoch": 1.47, + "learning_rate": 2.673521850899743e-05, + "loss": 1.6699, + "step": 570 + }, + { + "epoch": 1.47, + "eval_loss": 1.660292148590088, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 570 + }, + { + "epoch": 1.48, + "learning_rate": 2.6092544987146534e-05, + "loss": 1.6472, + "step": 575 + }, + { + "epoch": 1.48, + "eval_loss": 1.6598683595657349, + "eval_runtime": 31.8181, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 575 + }, + { + "epoch": 1.49, + "learning_rate": 2.5449871465295634e-05, + "loss": 1.6723, + "step": 580 + }, + { + "epoch": 1.49, + "eval_loss": 1.6599065065383911, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 580 + }, + { + "epoch": 1.5, + "learning_rate": 2.480719794344473e-05, + "loss": 1.663, + "step": 585 + }, + { + "epoch": 1.5, + "eval_loss": 1.6599762439727783, + "eval_runtime": 31.8175, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 585 + }, + { + "epoch": 1.52, + "learning_rate": 2.4164524421593832e-05, + "loss": 1.6519, + "step": 590 + }, + { + "epoch": 1.52, + "eval_loss": 1.6599925756454468, + "eval_runtime": 31.8209, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 590 + }, + { + "epoch": 1.53, + "learning_rate": 2.3521850899742933e-05, + "loss": 1.6345, + "step": 595 + }, + { + "epoch": 1.53, + "eval_loss": 1.6600078344345093, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 595 + }, + { + "epoch": 1.54, + "learning_rate": 2.2879177377892033e-05, + "loss": 1.654, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 1.6599884033203125, + "eval_runtime": 31.8281, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 600 + }, + { + "epoch": 1.56, + "learning_rate": 2.2236503856041134e-05, + "loss": 1.691, + "step": 605 + }, + { + "epoch": 1.56, + "eval_loss": 1.6599520444869995, + "eval_runtime": 31.8168, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 605 + }, + { + "epoch": 1.57, + "learning_rate": 2.159383033419023e-05, + "loss": 1.6714, + "step": 610 + }, + { + "epoch": 1.57, + "eval_loss": 1.659857988357544, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 610 + }, + { + "epoch": 1.58, + "learning_rate": 2.095115681233933e-05, + "loss": 1.5977, + "step": 615 + }, + { + "epoch": 1.58, + "eval_loss": 1.6596992015838623, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 615 + }, + { + "epoch": 1.59, + "learning_rate": 2.0308483290488432e-05, + "loss": 1.6429, + "step": 620 + }, + { + "epoch": 1.59, + "eval_loss": 1.6595497131347656, + "eval_runtime": 31.8151, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 620 + }, + { + "epoch": 1.61, + "learning_rate": 1.9665809768637533e-05, + "loss": 1.6091, + "step": 625 + }, + { + "epoch": 1.61, + "eval_loss": 1.6597559452056885, + "eval_runtime": 31.8076, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 625 + }, + { + "epoch": 1.62, + "learning_rate": 1.9023136246786633e-05, + "loss": 1.5956, + "step": 630 + }, + { + "epoch": 1.62, + "eval_loss": 1.6593599319458008, + "eval_runtime": 31.8058, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 630 + }, + { + "epoch": 1.63, + "learning_rate": 1.8380462724935734e-05, + "loss": 1.6697, + "step": 635 + }, + { + "epoch": 1.63, + "eval_loss": 1.6592466831207275, + "eval_runtime": 31.8589, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 635 + }, + { + "epoch": 1.65, + "learning_rate": 1.7737789203084834e-05, + "loss": 1.6535, + "step": 640 + }, + { + "epoch": 1.65, + "eval_loss": 1.6592165231704712, + "eval_runtime": 31.8225, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 640 + }, + { + "epoch": 1.66, + "learning_rate": 1.7095115681233935e-05, + "loss": 1.6722, + "step": 645 + }, + { + "epoch": 1.66, + "eval_loss": 1.659324049949646, + "eval_runtime": 31.8388, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 645 + }, + { + "epoch": 1.67, + "learning_rate": 1.6452442159383032e-05, + "loss": 1.6347, + "step": 650 + }, + { + "epoch": 1.67, + "eval_loss": 1.6590396165847778, + "eval_runtime": 31.8174, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 650 + }, + { + "epoch": 1.68, + "learning_rate": 1.5809768637532136e-05, + "loss": 1.6223, + "step": 655 + }, + { + "epoch": 1.68, + "eval_loss": 1.658969521522522, + "eval_runtime": 31.8275, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 655 + }, + { + "epoch": 1.7, + "learning_rate": 1.5167095115681235e-05, + "loss": 1.6309, + "step": 660 + }, + { + "epoch": 1.7, + "eval_loss": 1.6587274074554443, + "eval_runtime": 31.8146, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 660 + }, + { + "epoch": 1.71, + "learning_rate": 1.4524421593830334e-05, + "loss": 1.6499, + "step": 665 + }, + { + "epoch": 1.71, + "eval_loss": 1.65865957736969, + "eval_runtime": 31.8091, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 665 + }, + { + "epoch": 1.72, + "learning_rate": 1.3881748071979436e-05, + "loss": 1.6713, + "step": 670 + }, + { + "epoch": 1.72, + "eval_loss": 1.6586016416549683, + "eval_runtime": 31.8229, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 670 + }, + { + "epoch": 1.74, + "learning_rate": 1.3239074550128535e-05, + "loss": 1.6348, + "step": 675 + }, + { + "epoch": 1.74, + "eval_loss": 1.6584962606430054, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 675 + }, + { + "epoch": 1.75, + "learning_rate": 1.2596401028277636e-05, + "loss": 1.6447, + "step": 680 + }, + { + "epoch": 1.75, + "eval_loss": 1.6585614681243896, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 680 + }, + { + "epoch": 1.76, + "learning_rate": 1.1953727506426736e-05, + "loss": 1.6511, + "step": 685 + }, + { + "epoch": 1.76, + "eval_loss": 1.6583601236343384, + "eval_runtime": 31.8125, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 685 + }, + { + "epoch": 1.77, + "learning_rate": 1.1311053984575835e-05, + "loss": 1.6496, + "step": 690 + }, + { + "epoch": 1.77, + "eval_loss": 1.6583328247070312, + "eval_runtime": 31.8179, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 690 + }, + { + "epoch": 1.79, + "learning_rate": 1.0668380462724936e-05, + "loss": 1.6421, + "step": 695 + }, + { + "epoch": 1.79, + "eval_loss": 1.6583188772201538, + "eval_runtime": 31.8165, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 695 + }, + { + "epoch": 1.8, + "learning_rate": 1.0025706940874038e-05, + "loss": 1.6126, + "step": 700 + }, + { + "epoch": 1.8, + "eval_loss": 1.6580705642700195, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 700 + }, + { + "epoch": 1.81, + "learning_rate": 9.383033419023137e-06, + "loss": 1.6226, + "step": 705 + }, + { + "epoch": 1.81, + "eval_loss": 1.6584465503692627, + "eval_runtime": 31.816, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 705 + }, + { + "epoch": 1.83, + "learning_rate": 8.740359897172237e-06, + "loss": 1.6923, + "step": 710 + }, + { + "epoch": 1.83, + "eval_loss": 1.658342957496643, + "eval_runtime": 31.8215, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 710 + }, + { + "epoch": 1.84, + "learning_rate": 8.097686375321336e-06, + "loss": 1.6224, + "step": 715 + }, + { + "epoch": 1.84, + "eval_loss": 1.6582502126693726, + "eval_runtime": 31.812, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 715 + }, + { + "epoch": 1.85, + "learning_rate": 7.4550128534704376e-06, + "loss": 1.6587, + "step": 720 + }, + { + "epoch": 1.85, + "eval_loss": 1.658232569694519, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 720 + }, + { + "epoch": 1.86, + "learning_rate": 6.812339331619537e-06, + "loss": 1.6641, + "step": 725 + }, + { + "epoch": 1.86, + "eval_loss": 1.6582419872283936, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 725 + }, + { + "epoch": 1.88, + "learning_rate": 6.169665809768638e-06, + "loss": 1.6419, + "step": 730 + }, + { + "epoch": 1.88, + "eval_loss": 1.6582107543945312, + "eval_runtime": 31.833, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 730 + }, + { + "epoch": 1.89, + "learning_rate": 5.526992287917738e-06, + "loss": 1.652, + "step": 735 + }, + { + "epoch": 1.89, + "eval_loss": 1.6582270860671997, + "eval_runtime": 31.8801, + "eval_samples_per_second": 2.635, + "eval_steps_per_second": 0.345, + "step": 735 + }, + { + "epoch": 1.9, + "learning_rate": 4.884318766066838e-06, + "loss": 1.6487, + "step": 740 + }, + { + "epoch": 1.9, + "eval_loss": 1.6582915782928467, + "eval_runtime": 31.8265, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 740 + }, + { + "epoch": 1.92, + "learning_rate": 4.241645244215939e-06, + "loss": 1.6245, + "step": 745 + }, + { + "epoch": 1.92, + "eval_loss": 1.658047080039978, + "eval_runtime": 31.8109, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 745 + }, + { + "epoch": 1.93, + "learning_rate": 3.598971722365039e-06, + "loss": 1.6553, + "step": 750 + }, + { + "epoch": 1.93, + "eval_loss": 1.6578683853149414, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 750 + }, + { + "epoch": 1.94, + "learning_rate": 2.956298200514139e-06, + "loss": 1.653, + "step": 755 + }, + { + "epoch": 1.94, + "eval_loss": 1.6578799486160278, + "eval_runtime": 31.8005, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 755 + }, + { + "epoch": 1.95, + "learning_rate": 2.313624678663239e-06, + "loss": 1.6519, + "step": 760 + }, + { + "epoch": 1.95, + "eval_loss": 1.657995343208313, + "eval_runtime": 31.8085, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 760 + } + ], + "logging_steps": 5, + "max_steps": 778, + "num_train_epochs": 2, + "save_steps": 10, + "total_flos": 2.4689619442335744e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-760/training_args.bin b/checkpoint-760/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48088b6a3851b1352bfa758dd2b3ad558168087f --- /dev/null +++ b/checkpoint-760/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc6a8542abd91027de58ebfccafdf919108680707834fe188d91cae077ef8e9 +size 4600 diff --git a/checkpoint-770/README.md b/checkpoint-770/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-770/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-770/adapter_config.json b/checkpoint-770/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-770/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-770/adapter_model.safetensors b/checkpoint-770/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5eab878403d0a3c13825bdb7249276e3a1d607e --- /dev/null +++ b/checkpoint-770/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f53f206e0cb885454f819ae3409abc106eeba62514646529f6f9e4bbb61ba4bb +size 16794200 diff --git a/checkpoint-770/optimizer.pt b/checkpoint-770/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f250a5f28034c0081cc1d7892652401fd9b4e91f --- /dev/null +++ b/checkpoint-770/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79d8992ce3454e6f5b2392b746972be89665bcb4818ce6f2e23af544ed03c810 +size 33663866 diff --git a/checkpoint-770/rng_state.pth b/checkpoint-770/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..75a797d23d416b5044e9122b50b023e8ab5e119f --- /dev/null +++ b/checkpoint-770/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5f9ad2e286a1e70aa146089c04236927b587bd36b55c15839ea4c0e9f97abe +size 14244 diff --git a/checkpoint-770/scheduler.pt b/checkpoint-770/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..22125270212ab2e0935147bc4e9203010710ee4c --- /dev/null +++ b/checkpoint-770/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a2696e9318ad2bf716807b888007e1a89e6c3dfb2d5365a6d1bd45f962d7a5 +size 1064 diff --git a/checkpoint-770/trainer_state.json b/checkpoint-770/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e0455b11c5d35b56424a849adf58775172a9cd4f --- /dev/null +++ b/checkpoint-770/trainer_state.json @@ -0,0 +1,2175 @@ +{ + "best_metric": 1.6578683853149414, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-750", + "epoch": 1.9794344473007712, + "eval_steps": 5, + "global_step": 770, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.93573264781491e-05, + "loss": 1.9824, + "step": 5 + }, + { + "epoch": 0.01, + "eval_loss": 1.954728126525879, + "eval_runtime": 31.6803, + "eval_samples_per_second": 2.651, + "eval_steps_per_second": 0.347, + "step": 5 + }, + { + "epoch": 0.03, + "learning_rate": 9.87146529562982e-05, + "loss": 1.9249, + "step": 10 + }, + { + "epoch": 0.03, + "eval_loss": 1.8934264183044434, + "eval_runtime": 31.7798, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 9.80719794344473e-05, + "loss": 1.8609, + "step": 15 + }, + { + "epoch": 0.04, + "eval_loss": 1.8421852588653564, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 9.742930591259641e-05, + "loss": 1.8268, + "step": 20 + }, + { + "epoch": 0.05, + "eval_loss": 1.8279638290405273, + "eval_runtime": 31.8095, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 20 + }, + { + "epoch": 0.06, + "learning_rate": 9.67866323907455e-05, + "loss": 1.8349, + "step": 25 + }, + { + "epoch": 0.06, + "eval_loss": 1.813267469406128, + "eval_runtime": 31.8291, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 9.61439588688946e-05, + "loss": 1.8239, + "step": 30 + }, + { + "epoch": 0.08, + "eval_loss": 1.8008779287338257, + "eval_runtime": 31.82, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 30 + }, + { + "epoch": 0.09, + "learning_rate": 9.550128534704372e-05, + "loss": 1.7177, + "step": 35 + }, + { + "epoch": 0.09, + "eval_loss": 1.789797067642212, + "eval_runtime": 31.8423, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 35 + }, + { + "epoch": 0.1, + "learning_rate": 9.485861182519281e-05, + "loss": 1.7962, + "step": 40 + }, + { + "epoch": 0.1, + "eval_loss": 1.7808287143707275, + "eval_runtime": 31.8387, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 40 + }, + { + "epoch": 0.12, + "learning_rate": 9.421593830334192e-05, + "loss": 1.715, + "step": 45 + }, + { + "epoch": 0.12, + "eval_loss": 1.771236777305603, + "eval_runtime": 31.8526, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 45 + }, + { + "epoch": 0.13, + "learning_rate": 9.357326478149101e-05, + "loss": 1.7577, + "step": 50 + }, + { + "epoch": 0.13, + "eval_loss": 1.7619521617889404, + "eval_runtime": 31.828, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 50 + }, + { + "epoch": 0.14, + "learning_rate": 9.29305912596401e-05, + "loss": 1.7323, + "step": 55 + }, + { + "epoch": 0.14, + "eval_loss": 1.7440986633300781, + "eval_runtime": 31.8301, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 55 + }, + { + "epoch": 0.15, + "learning_rate": 9.228791773778921e-05, + "loss": 1.7122, + "step": 60 + }, + { + "epoch": 0.15, + "eval_loss": 1.712996006011963, + "eval_runtime": 31.8185, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 60 + }, + { + "epoch": 0.17, + "learning_rate": 9.16452442159383e-05, + "loss": 1.7042, + "step": 65 + }, + { + "epoch": 0.17, + "eval_loss": 1.7101707458496094, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 65 + }, + { + "epoch": 0.18, + "learning_rate": 9.100257069408741e-05, + "loss": 1.7242, + "step": 70 + }, + { + "epoch": 0.18, + "eval_loss": 1.7038702964782715, + "eval_runtime": 31.8191, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 70 + }, + { + "epoch": 0.19, + "learning_rate": 9.03598971722365e-05, + "loss": 1.7033, + "step": 75 + }, + { + "epoch": 0.19, + "eval_loss": 1.7004183530807495, + "eval_runtime": 31.811, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 75 + }, + { + "epoch": 0.21, + "learning_rate": 8.97172236503856e-05, + "loss": 1.6934, + "step": 80 + }, + { + "epoch": 0.21, + "eval_loss": 1.6985355615615845, + "eval_runtime": 31.8221, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 80 + }, + { + "epoch": 0.22, + "learning_rate": 8.907455012853471e-05, + "loss": 1.6443, + "step": 85 + }, + { + "epoch": 0.22, + "eval_loss": 1.696852684020996, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 85 + }, + { + "epoch": 0.23, + "learning_rate": 8.84318766066838e-05, + "loss": 1.7008, + "step": 90 + }, + { + "epoch": 0.23, + "eval_loss": 1.695594072341919, + "eval_runtime": 31.8205, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 90 + }, + { + "epoch": 0.24, + "learning_rate": 8.778920308483291e-05, + "loss": 1.6873, + "step": 95 + }, + { + "epoch": 0.24, + "eval_loss": 1.6928850412368774, + "eval_runtime": 31.814, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 95 + }, + { + "epoch": 0.26, + "learning_rate": 8.7146529562982e-05, + "loss": 1.6721, + "step": 100 + }, + { + "epoch": 0.26, + "eval_loss": 1.6919386386871338, + "eval_runtime": 31.8127, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 100 + }, + { + "epoch": 0.27, + "learning_rate": 8.650385604113111e-05, + "loss": 1.6669, + "step": 105 + }, + { + "epoch": 0.27, + "eval_loss": 1.6905940771102905, + "eval_runtime": 31.8874, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 0.345, + "step": 105 + }, + { + "epoch": 0.28, + "learning_rate": 8.586118251928022e-05, + "loss": 1.7009, + "step": 110 + }, + { + "epoch": 0.28, + "eval_loss": 1.689422607421875, + "eval_runtime": 31.8489, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 110 + }, + { + "epoch": 0.3, + "learning_rate": 8.521850899742931e-05, + "loss": 1.7151, + "step": 115 + }, + { + "epoch": 0.3, + "eval_loss": 1.688527226448059, + "eval_runtime": 31.8318, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 115 + }, + { + "epoch": 0.31, + "learning_rate": 8.457583547557842e-05, + "loss": 1.7165, + "step": 120 + }, + { + "epoch": 0.31, + "eval_loss": 1.686748743057251, + "eval_runtime": 31.835, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 120 + }, + { + "epoch": 0.32, + "learning_rate": 8.393316195372751e-05, + "loss": 1.7015, + "step": 125 + }, + { + "epoch": 0.32, + "eval_loss": 1.6855812072753906, + "eval_runtime": 31.8269, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 125 + }, + { + "epoch": 0.33, + "learning_rate": 8.32904884318766e-05, + "loss": 1.6818, + "step": 130 + }, + { + "epoch": 0.33, + "eval_loss": 1.6846078634262085, + "eval_runtime": 31.8617, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 130 + }, + { + "epoch": 0.35, + "learning_rate": 8.264781491002571e-05, + "loss": 1.7151, + "step": 135 + }, + { + "epoch": 0.35, + "eval_loss": 1.6845166683197021, + "eval_runtime": 31.8631, + "eval_samples_per_second": 2.636, + "eval_steps_per_second": 0.345, + "step": 135 + }, + { + "epoch": 0.36, + "learning_rate": 8.200514138817481e-05, + "loss": 1.6667, + "step": 140 + }, + { + "epoch": 0.36, + "eval_loss": 1.6832246780395508, + "eval_runtime": 31.8276, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 140 + }, + { + "epoch": 0.37, + "learning_rate": 8.136246786632391e-05, + "loss": 1.6586, + "step": 145 + }, + { + "epoch": 0.37, + "eval_loss": 1.6824573278427124, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 145 + }, + { + "epoch": 0.39, + "learning_rate": 8.071979434447301e-05, + "loss": 1.6999, + "step": 150 + }, + { + "epoch": 0.39, + "eval_loss": 1.681317687034607, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 150 + }, + { + "epoch": 0.4, + "learning_rate": 8.007712082262212e-05, + "loss": 1.6894, + "step": 155 + }, + { + "epoch": 0.4, + "eval_loss": 1.6812355518341064, + "eval_runtime": 31.8039, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 155 + }, + { + "epoch": 0.41, + "learning_rate": 7.943444730077121e-05, + "loss": 1.6669, + "step": 160 + }, + { + "epoch": 0.41, + "eval_loss": 1.679802656173706, + "eval_runtime": 31.8021, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 160 + }, + { + "epoch": 0.42, + "learning_rate": 7.87917737789203e-05, + "loss": 1.6481, + "step": 165 + }, + { + "epoch": 0.42, + "eval_loss": 1.679994821548462, + "eval_runtime": 31.8201, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 165 + }, + { + "epoch": 0.44, + "learning_rate": 7.814910025706941e-05, + "loss": 1.7042, + "step": 170 + }, + { + "epoch": 0.44, + "eval_loss": 1.6786224842071533, + "eval_runtime": 31.8184, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 170 + }, + { + "epoch": 0.45, + "learning_rate": 7.750642673521852e-05, + "loss": 1.6564, + "step": 175 + }, + { + "epoch": 0.45, + "eval_loss": 1.6783201694488525, + "eval_runtime": 31.8137, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 175 + }, + { + "epoch": 0.46, + "learning_rate": 7.686375321336761e-05, + "loss": 1.6714, + "step": 180 + }, + { + "epoch": 0.46, + "eval_loss": 1.677714228630066, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 180 + }, + { + "epoch": 0.48, + "learning_rate": 7.622107969151672e-05, + "loss": 1.6705, + "step": 185 + }, + { + "epoch": 0.48, + "eval_loss": 1.6772773265838623, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 185 + }, + { + "epoch": 0.49, + "learning_rate": 7.557840616966581e-05, + "loss": 1.6624, + "step": 190 + }, + { + "epoch": 0.49, + "eval_loss": 1.6766114234924316, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 190 + }, + { + "epoch": 0.5, + "learning_rate": 7.493573264781492e-05, + "loss": 1.6415, + "step": 195 + }, + { + "epoch": 0.5, + "eval_loss": 1.675940752029419, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 195 + }, + { + "epoch": 0.51, + "learning_rate": 7.429305912596401e-05, + "loss": 1.6633, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 1.6751865148544312, + "eval_runtime": 31.8123, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 200 + }, + { + "epoch": 0.53, + "learning_rate": 7.365038560411311e-05, + "loss": 1.6142, + "step": 205 + }, + { + "epoch": 0.53, + "eval_loss": 1.6750575304031372, + "eval_runtime": 31.8049, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 205 + }, + { + "epoch": 0.54, + "learning_rate": 7.300771208226222e-05, + "loss": 1.6736, + "step": 210 + }, + { + "epoch": 0.54, + "eval_loss": 1.6746423244476318, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 210 + }, + { + "epoch": 0.55, + "learning_rate": 7.236503856041131e-05, + "loss": 1.6414, + "step": 215 + }, + { + "epoch": 0.55, + "eval_loss": 1.6740491390228271, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 215 + }, + { + "epoch": 0.57, + "learning_rate": 7.172236503856042e-05, + "loss": 1.6725, + "step": 220 + }, + { + "epoch": 0.57, + "eval_loss": 1.673694372177124, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 220 + }, + { + "epoch": 0.58, + "learning_rate": 7.107969151670951e-05, + "loss": 1.679, + "step": 225 + }, + { + "epoch": 0.58, + "eval_loss": 1.6733818054199219, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 225 + }, + { + "epoch": 0.59, + "learning_rate": 7.043701799485862e-05, + "loss": 1.7204, + "step": 230 + }, + { + "epoch": 0.59, + "eval_loss": 1.6725033521652222, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 230 + }, + { + "epoch": 0.6, + "learning_rate": 6.979434447300771e-05, + "loss": 1.6816, + "step": 235 + }, + { + "epoch": 0.6, + "eval_loss": 1.67205011844635, + "eval_runtime": 31.809, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 235 + }, + { + "epoch": 0.62, + "learning_rate": 6.91516709511568e-05, + "loss": 1.634, + "step": 240 + }, + { + "epoch": 0.62, + "eval_loss": 1.6717994213104248, + "eval_runtime": 31.8075, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 240 + }, + { + "epoch": 0.63, + "learning_rate": 6.850899742930593e-05, + "loss": 1.6761, + "step": 245 + }, + { + "epoch": 0.63, + "eval_loss": 1.6714757680892944, + "eval_runtime": 31.8052, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 245 + }, + { + "epoch": 0.64, + "learning_rate": 6.786632390745502e-05, + "loss": 1.6996, + "step": 250 + }, + { + "epoch": 0.64, + "eval_loss": 1.671559453010559, + "eval_runtime": 31.8089, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 250 + }, + { + "epoch": 0.66, + "learning_rate": 6.722365038560411e-05, + "loss": 1.6302, + "step": 255 + }, + { + "epoch": 0.66, + "eval_loss": 1.6711652278900146, + "eval_runtime": 31.8105, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 255 + }, + { + "epoch": 0.67, + "learning_rate": 6.658097686375322e-05, + "loss": 1.6611, + "step": 260 + }, + { + "epoch": 0.67, + "eval_loss": 1.6704061031341553, + "eval_runtime": 31.8135, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 260 + }, + { + "epoch": 0.68, + "learning_rate": 6.593830334190231e-05, + "loss": 1.6586, + "step": 265 + }, + { + "epoch": 0.68, + "eval_loss": 1.6705658435821533, + "eval_runtime": 31.7998, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 265 + }, + { + "epoch": 0.69, + "learning_rate": 6.529562982005142e-05, + "loss": 1.6838, + "step": 270 + }, + { + "epoch": 0.69, + "eval_loss": 1.6697953939437866, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 270 + }, + { + "epoch": 0.71, + "learning_rate": 6.465295629820052e-05, + "loss": 1.6499, + "step": 275 + }, + { + "epoch": 0.71, + "eval_loss": 1.669343113899231, + "eval_runtime": 31.8147, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 275 + }, + { + "epoch": 0.72, + "learning_rate": 6.401028277634962e-05, + "loss": 1.6424, + "step": 280 + }, + { + "epoch": 0.72, + "eval_loss": 1.6693596839904785, + "eval_runtime": 31.8155, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 280 + }, + { + "epoch": 0.73, + "learning_rate": 6.336760925449872e-05, + "loss": 1.6238, + "step": 285 + }, + { + "epoch": 0.73, + "eval_loss": 1.669128179550171, + "eval_runtime": 31.8219, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 285 + }, + { + "epoch": 0.75, + "learning_rate": 6.272493573264781e-05, + "loss": 1.6538, + "step": 290 + }, + { + "epoch": 0.75, + "eval_loss": 1.6691263914108276, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 290 + }, + { + "epoch": 0.76, + "learning_rate": 6.208226221079692e-05, + "loss": 1.656, + "step": 295 + }, + { + "epoch": 0.76, + "eval_loss": 1.66847825050354, + "eval_runtime": 31.7918, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 295 + }, + { + "epoch": 0.77, + "learning_rate": 6.143958868894601e-05, + "loss": 1.6407, + "step": 300 + }, + { + "epoch": 0.77, + "eval_loss": 1.6683858633041382, + "eval_runtime": 31.8114, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 300 + }, + { + "epoch": 0.78, + "learning_rate": 6.079691516709511e-05, + "loss": 1.6468, + "step": 305 + }, + { + "epoch": 0.78, + "eval_loss": 1.6678352355957031, + "eval_runtime": 31.7975, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 305 + }, + { + "epoch": 0.8, + "learning_rate": 6.015424164524421e-05, + "loss": 1.6579, + "step": 310 + }, + { + "epoch": 0.8, + "eval_loss": 1.6675294637680054, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 310 + }, + { + "epoch": 0.81, + "learning_rate": 5.951156812339333e-05, + "loss": 1.6331, + "step": 315 + }, + { + "epoch": 0.81, + "eval_loss": 1.6670397520065308, + "eval_runtime": 31.7994, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 315 + }, + { + "epoch": 0.82, + "learning_rate": 5.886889460154242e-05, + "loss": 1.6634, + "step": 320 + }, + { + "epoch": 0.82, + "eval_loss": 1.6668330430984497, + "eval_runtime": 31.807, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 320 + }, + { + "epoch": 0.84, + "learning_rate": 5.822622107969152e-05, + "loss": 1.6406, + "step": 325 + }, + { + "epoch": 0.84, + "eval_loss": 1.6668546199798584, + "eval_runtime": 31.8004, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 325 + }, + { + "epoch": 0.85, + "learning_rate": 5.758354755784062e-05, + "loss": 1.6614, + "step": 330 + }, + { + "epoch": 0.85, + "eval_loss": 1.666812539100647, + "eval_runtime": 31.8023, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 330 + }, + { + "epoch": 0.86, + "learning_rate": 5.694087403598972e-05, + "loss": 1.6598, + "step": 335 + }, + { + "epoch": 0.86, + "eval_loss": 1.666398048400879, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 335 + }, + { + "epoch": 0.87, + "learning_rate": 5.6298200514138824e-05, + "loss": 1.6698, + "step": 340 + }, + { + "epoch": 0.87, + "eval_loss": 1.6660391092300415, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 340 + }, + { + "epoch": 0.89, + "learning_rate": 5.5655526992287924e-05, + "loss": 1.6346, + "step": 345 + }, + { + "epoch": 0.89, + "eval_loss": 1.665901780128479, + "eval_runtime": 31.7878, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 0.346, + "step": 345 + }, + { + "epoch": 0.9, + "learning_rate": 5.501285347043702e-05, + "loss": 1.6617, + "step": 350 + }, + { + "epoch": 0.9, + "eval_loss": 1.666027307510376, + "eval_runtime": 31.7999, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 350 + }, + { + "epoch": 0.91, + "learning_rate": 5.437017994858612e-05, + "loss": 1.6623, + "step": 355 + }, + { + "epoch": 0.91, + "eval_loss": 1.6658612489700317, + "eval_runtime": 31.8128, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 355 + }, + { + "epoch": 0.93, + "learning_rate": 5.372750642673522e-05, + "loss": 1.6737, + "step": 360 + }, + { + "epoch": 0.93, + "eval_loss": 1.6656837463378906, + "eval_runtime": 31.8094, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 360 + }, + { + "epoch": 0.94, + "learning_rate": 5.308483290488432e-05, + "loss": 1.6981, + "step": 365 + }, + { + "epoch": 0.94, + "eval_loss": 1.6653661727905273, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 365 + }, + { + "epoch": 0.95, + "learning_rate": 5.244215938303342e-05, + "loss": 1.6832, + "step": 370 + }, + { + "epoch": 0.95, + "eval_loss": 1.6652257442474365, + "eval_runtime": 31.8077, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 370 + }, + { + "epoch": 0.96, + "learning_rate": 5.1799485861182514e-05, + "loss": 1.6925, + "step": 375 + }, + { + "epoch": 0.96, + "eval_loss": 1.664839267730713, + "eval_runtime": 31.8082, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 375 + }, + { + "epoch": 0.98, + "learning_rate": 5.1156812339331615e-05, + "loss": 1.6721, + "step": 380 + }, + { + "epoch": 0.98, + "eval_loss": 1.6642415523529053, + "eval_runtime": 31.7927, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 380 + }, + { + "epoch": 0.99, + "learning_rate": 5.051413881748073e-05, + "loss": 1.6208, + "step": 385 + }, + { + "epoch": 0.99, + "eval_loss": 1.6641273498535156, + "eval_runtime": 31.8006, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 385 + }, + { + "epoch": 1.0, + "learning_rate": 4.987146529562982e-05, + "loss": 1.6519, + "step": 390 + }, + { + "epoch": 1.0, + "eval_loss": 1.6645455360412598, + "eval_runtime": 31.8026, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 390 + }, + { + "epoch": 1.02, + "learning_rate": 4.922879177377892e-05, + "loss": 1.6581, + "step": 395 + }, + { + "epoch": 1.02, + "eval_loss": 1.6641591787338257, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 395 + }, + { + "epoch": 1.03, + "learning_rate": 4.8586118251928024e-05, + "loss": 1.6768, + "step": 400 + }, + { + "epoch": 1.03, + "eval_loss": 1.6637850999832153, + "eval_runtime": 31.806, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 400 + }, + { + "epoch": 1.04, + "learning_rate": 4.7943444730077124e-05, + "loss": 1.6257, + "step": 405 + }, + { + "epoch": 1.04, + "eval_loss": 1.6636526584625244, + "eval_runtime": 31.8116, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 405 + }, + { + "epoch": 1.05, + "learning_rate": 4.7300771208226225e-05, + "loss": 1.6522, + "step": 410 + }, + { + "epoch": 1.05, + "eval_loss": 1.6634690761566162, + "eval_runtime": 31.8063, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 410 + }, + { + "epoch": 1.07, + "learning_rate": 4.6658097686375325e-05, + "loss": 1.615, + "step": 415 + }, + { + "epoch": 1.07, + "eval_loss": 1.6637413501739502, + "eval_runtime": 31.8115, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 415 + }, + { + "epoch": 1.08, + "learning_rate": 4.6015424164524426e-05, + "loss": 1.5967, + "step": 420 + }, + { + "epoch": 1.08, + "eval_loss": 1.6633188724517822, + "eval_runtime": 31.8016, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 420 + }, + { + "epoch": 1.09, + "learning_rate": 4.537275064267352e-05, + "loss": 1.6708, + "step": 425 + }, + { + "epoch": 1.09, + "eval_loss": 1.6634286642074585, + "eval_runtime": 31.8074, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 425 + }, + { + "epoch": 1.11, + "learning_rate": 4.473007712082262e-05, + "loss": 1.6696, + "step": 430 + }, + { + "epoch": 1.11, + "eval_loss": 1.663163661956787, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 430 + }, + { + "epoch": 1.12, + "learning_rate": 4.408740359897173e-05, + "loss": 1.6344, + "step": 435 + }, + { + "epoch": 1.12, + "eval_loss": 1.6632493734359741, + "eval_runtime": 31.8032, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 435 + }, + { + "epoch": 1.13, + "learning_rate": 4.344473007712083e-05, + "loss": 1.5922, + "step": 440 + }, + { + "epoch": 1.13, + "eval_loss": 1.6627737283706665, + "eval_runtime": 31.8064, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 440 + }, + { + "epoch": 1.14, + "learning_rate": 4.280205655526993e-05, + "loss": 1.6541, + "step": 445 + }, + { + "epoch": 1.14, + "eval_loss": 1.6630815267562866, + "eval_runtime": 31.8083, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 445 + }, + { + "epoch": 1.16, + "learning_rate": 4.215938303341902e-05, + "loss": 1.6348, + "step": 450 + }, + { + "epoch": 1.16, + "eval_loss": 1.662713885307312, + "eval_runtime": 31.7977, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 450 + }, + { + "epoch": 1.17, + "learning_rate": 4.151670951156812e-05, + "loss": 1.6444, + "step": 455 + }, + { + "epoch": 1.17, + "eval_loss": 1.6629937887191772, + "eval_runtime": 31.799, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 455 + }, + { + "epoch": 1.18, + "learning_rate": 4.0874035989717224e-05, + "loss": 1.6629, + "step": 460 + }, + { + "epoch": 1.18, + "eval_loss": 1.6626592874526978, + "eval_runtime": 31.8136, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 460 + }, + { + "epoch": 1.2, + "learning_rate": 4.0231362467866324e-05, + "loss": 1.6327, + "step": 465 + }, + { + "epoch": 1.2, + "eval_loss": 1.6625391244888306, + "eval_runtime": 31.802, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 465 + }, + { + "epoch": 1.21, + "learning_rate": 3.958868894601543e-05, + "loss": 1.6236, + "step": 470 + }, + { + "epoch": 1.21, + "eval_loss": 1.6622934341430664, + "eval_runtime": 31.8055, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 470 + }, + { + "epoch": 1.22, + "learning_rate": 3.8946015424164526e-05, + "loss": 1.6232, + "step": 475 + }, + { + "epoch": 1.22, + "eval_loss": 1.661791443824768, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 475 + }, + { + "epoch": 1.23, + "learning_rate": 3.8303341902313626e-05, + "loss": 1.6374, + "step": 480 + }, + { + "epoch": 1.23, + "eval_loss": 1.6618887186050415, + "eval_runtime": 31.7892, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 480 + }, + { + "epoch": 1.25, + "learning_rate": 3.766066838046273e-05, + "loss": 1.6422, + "step": 485 + }, + { + "epoch": 1.25, + "eval_loss": 1.6620476245880127, + "eval_runtime": 31.7974, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 485 + }, + { + "epoch": 1.26, + "learning_rate": 3.701799485861183e-05, + "loss": 1.631, + "step": 490 + }, + { + "epoch": 1.26, + "eval_loss": 1.6622774600982666, + "eval_runtime": 31.7978, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 490 + }, + { + "epoch": 1.27, + "learning_rate": 3.637532133676093e-05, + "loss": 1.609, + "step": 495 + }, + { + "epoch": 1.27, + "eval_loss": 1.661568284034729, + "eval_runtime": 31.8034, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 495 + }, + { + "epoch": 1.29, + "learning_rate": 3.573264781491003e-05, + "loss": 1.6444, + "step": 500 + }, + { + "epoch": 1.29, + "eval_loss": 1.6610949039459229, + "eval_runtime": 31.8005, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 500 + }, + { + "epoch": 1.3, + "learning_rate": 3.508997429305913e-05, + "loss": 1.6387, + "step": 505 + }, + { + "epoch": 1.3, + "eval_loss": 1.6611486673355103, + "eval_runtime": 31.8079, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 505 + }, + { + "epoch": 1.31, + "learning_rate": 3.444730077120823e-05, + "loss": 1.6507, + "step": 510 + }, + { + "epoch": 1.31, + "eval_loss": 1.661084532737732, + "eval_runtime": 31.8028, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 510 + }, + { + "epoch": 1.32, + "learning_rate": 3.380462724935733e-05, + "loss": 1.6377, + "step": 515 + }, + { + "epoch": 1.32, + "eval_loss": 1.6610103845596313, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 515 + }, + { + "epoch": 1.34, + "learning_rate": 3.316195372750643e-05, + "loss": 1.6351, + "step": 520 + }, + { + "epoch": 1.34, + "eval_loss": 1.6608690023422241, + "eval_runtime": 31.7986, + "eval_samples_per_second": 2.642, + "eval_steps_per_second": 0.346, + "step": 520 + }, + { + "epoch": 1.35, + "learning_rate": 3.251928020565553e-05, + "loss": 1.6196, + "step": 525 + }, + { + "epoch": 1.35, + "eval_loss": 1.6606587171554565, + "eval_runtime": 31.8015, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 525 + }, + { + "epoch": 1.36, + "learning_rate": 3.1876606683804625e-05, + "loss": 1.646, + "step": 530 + }, + { + "epoch": 1.36, + "eval_loss": 1.6605886220932007, + "eval_runtime": 31.804, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 530 + }, + { + "epoch": 1.38, + "learning_rate": 3.1233933161953726e-05, + "loss": 1.6824, + "step": 535 + }, + { + "epoch": 1.38, + "eval_loss": 1.6603879928588867, + "eval_runtime": 31.8111, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 535 + }, + { + "epoch": 1.39, + "learning_rate": 3.059125964010283e-05, + "loss": 1.6115, + "step": 540 + }, + { + "epoch": 1.39, + "eval_loss": 1.660337209701538, + "eval_runtime": 31.805, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 540 + }, + { + "epoch": 1.4, + "learning_rate": 2.994858611825193e-05, + "loss": 1.6243, + "step": 545 + }, + { + "epoch": 1.4, + "eval_loss": 1.6606225967407227, + "eval_runtime": 31.808, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 545 + }, + { + "epoch": 1.41, + "learning_rate": 2.930591259640103e-05, + "loss": 1.6486, + "step": 550 + }, + { + "epoch": 1.41, + "eval_loss": 1.6604374647140503, + "eval_runtime": 31.8072, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 550 + }, + { + "epoch": 1.43, + "learning_rate": 2.866323907455013e-05, + "loss": 1.6865, + "step": 555 + }, + { + "epoch": 1.43, + "eval_loss": 1.6601544618606567, + "eval_runtime": 31.8069, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 555 + }, + { + "epoch": 1.44, + "learning_rate": 2.802056555269923e-05, + "loss": 1.6616, + "step": 560 + }, + { + "epoch": 1.44, + "eval_loss": 1.660386562347412, + "eval_runtime": 31.803, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 560 + }, + { + "epoch": 1.45, + "learning_rate": 2.737789203084833e-05, + "loss": 1.6336, + "step": 565 + }, + { + "epoch": 1.45, + "eval_loss": 1.660247802734375, + "eval_runtime": 31.8084, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 565 + }, + { + "epoch": 1.47, + "learning_rate": 2.673521850899743e-05, + "loss": 1.6699, + "step": 570 + }, + { + "epoch": 1.47, + "eval_loss": 1.660292148590088, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 570 + }, + { + "epoch": 1.48, + "learning_rate": 2.6092544987146534e-05, + "loss": 1.6472, + "step": 575 + }, + { + "epoch": 1.48, + "eval_loss": 1.6598683595657349, + "eval_runtime": 31.8181, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 575 + }, + { + "epoch": 1.49, + "learning_rate": 2.5449871465295634e-05, + "loss": 1.6723, + "step": 580 + }, + { + "epoch": 1.49, + "eval_loss": 1.6599065065383911, + "eval_runtime": 31.8124, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 580 + }, + { + "epoch": 1.5, + "learning_rate": 2.480719794344473e-05, + "loss": 1.663, + "step": 585 + }, + { + "epoch": 1.5, + "eval_loss": 1.6599762439727783, + "eval_runtime": 31.8175, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 585 + }, + { + "epoch": 1.52, + "learning_rate": 2.4164524421593832e-05, + "loss": 1.6519, + "step": 590 + }, + { + "epoch": 1.52, + "eval_loss": 1.6599925756454468, + "eval_runtime": 31.8209, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 590 + }, + { + "epoch": 1.53, + "learning_rate": 2.3521850899742933e-05, + "loss": 1.6345, + "step": 595 + }, + { + "epoch": 1.53, + "eval_loss": 1.6600078344345093, + "eval_runtime": 31.8031, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 595 + }, + { + "epoch": 1.54, + "learning_rate": 2.2879177377892033e-05, + "loss": 1.654, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 1.6599884033203125, + "eval_runtime": 31.8281, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 600 + }, + { + "epoch": 1.56, + "learning_rate": 2.2236503856041134e-05, + "loss": 1.691, + "step": 605 + }, + { + "epoch": 1.56, + "eval_loss": 1.6599520444869995, + "eval_runtime": 31.8168, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 605 + }, + { + "epoch": 1.57, + "learning_rate": 2.159383033419023e-05, + "loss": 1.6714, + "step": 610 + }, + { + "epoch": 1.57, + "eval_loss": 1.659857988357544, + "eval_runtime": 31.8153, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 610 + }, + { + "epoch": 1.58, + "learning_rate": 2.095115681233933e-05, + "loss": 1.5977, + "step": 615 + }, + { + "epoch": 1.58, + "eval_loss": 1.6596992015838623, + "eval_runtime": 31.8087, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 615 + }, + { + "epoch": 1.59, + "learning_rate": 2.0308483290488432e-05, + "loss": 1.6429, + "step": 620 + }, + { + "epoch": 1.59, + "eval_loss": 1.6595497131347656, + "eval_runtime": 31.8151, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 620 + }, + { + "epoch": 1.61, + "learning_rate": 1.9665809768637533e-05, + "loss": 1.6091, + "step": 625 + }, + { + "epoch": 1.61, + "eval_loss": 1.6597559452056885, + "eval_runtime": 31.8076, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 625 + }, + { + "epoch": 1.62, + "learning_rate": 1.9023136246786633e-05, + "loss": 1.5956, + "step": 630 + }, + { + "epoch": 1.62, + "eval_loss": 1.6593599319458008, + "eval_runtime": 31.8058, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 630 + }, + { + "epoch": 1.63, + "learning_rate": 1.8380462724935734e-05, + "loss": 1.6697, + "step": 635 + }, + { + "epoch": 1.63, + "eval_loss": 1.6592466831207275, + "eval_runtime": 31.8589, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 635 + }, + { + "epoch": 1.65, + "learning_rate": 1.7737789203084834e-05, + "loss": 1.6535, + "step": 640 + }, + { + "epoch": 1.65, + "eval_loss": 1.6592165231704712, + "eval_runtime": 31.8225, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 640 + }, + { + "epoch": 1.66, + "learning_rate": 1.7095115681233935e-05, + "loss": 1.6722, + "step": 645 + }, + { + "epoch": 1.66, + "eval_loss": 1.659324049949646, + "eval_runtime": 31.8388, + "eval_samples_per_second": 2.638, + "eval_steps_per_second": 0.345, + "step": 645 + }, + { + "epoch": 1.67, + "learning_rate": 1.6452442159383032e-05, + "loss": 1.6347, + "step": 650 + }, + { + "epoch": 1.67, + "eval_loss": 1.6590396165847778, + "eval_runtime": 31.8174, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 650 + }, + { + "epoch": 1.68, + "learning_rate": 1.5809768637532136e-05, + "loss": 1.6223, + "step": 655 + }, + { + "epoch": 1.68, + "eval_loss": 1.658969521522522, + "eval_runtime": 31.8275, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 655 + }, + { + "epoch": 1.7, + "learning_rate": 1.5167095115681235e-05, + "loss": 1.6309, + "step": 660 + }, + { + "epoch": 1.7, + "eval_loss": 1.6587274074554443, + "eval_runtime": 31.8146, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 660 + }, + { + "epoch": 1.71, + "learning_rate": 1.4524421593830334e-05, + "loss": 1.6499, + "step": 665 + }, + { + "epoch": 1.71, + "eval_loss": 1.65865957736969, + "eval_runtime": 31.8091, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 665 + }, + { + "epoch": 1.72, + "learning_rate": 1.3881748071979436e-05, + "loss": 1.6713, + "step": 670 + }, + { + "epoch": 1.72, + "eval_loss": 1.6586016416549683, + "eval_runtime": 31.8229, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 670 + }, + { + "epoch": 1.74, + "learning_rate": 1.3239074550128535e-05, + "loss": 1.6348, + "step": 675 + }, + { + "epoch": 1.74, + "eval_loss": 1.6584962606430054, + "eval_runtime": 31.8035, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 675 + }, + { + "epoch": 1.75, + "learning_rate": 1.2596401028277636e-05, + "loss": 1.6447, + "step": 680 + }, + { + "epoch": 1.75, + "eval_loss": 1.6585614681243896, + "eval_runtime": 31.8101, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 680 + }, + { + "epoch": 1.76, + "learning_rate": 1.1953727506426736e-05, + "loss": 1.6511, + "step": 685 + }, + { + "epoch": 1.76, + "eval_loss": 1.6583601236343384, + "eval_runtime": 31.8125, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 685 + }, + { + "epoch": 1.77, + "learning_rate": 1.1311053984575835e-05, + "loss": 1.6496, + "step": 690 + }, + { + "epoch": 1.77, + "eval_loss": 1.6583328247070312, + "eval_runtime": 31.8179, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 690 + }, + { + "epoch": 1.79, + "learning_rate": 1.0668380462724936e-05, + "loss": 1.6421, + "step": 695 + }, + { + "epoch": 1.79, + "eval_loss": 1.6583188772201538, + "eval_runtime": 31.8165, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 695 + }, + { + "epoch": 1.8, + "learning_rate": 1.0025706940874038e-05, + "loss": 1.6126, + "step": 700 + }, + { + "epoch": 1.8, + "eval_loss": 1.6580705642700195, + "eval_runtime": 31.8164, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 700 + }, + { + "epoch": 1.81, + "learning_rate": 9.383033419023137e-06, + "loss": 1.6226, + "step": 705 + }, + { + "epoch": 1.81, + "eval_loss": 1.6584465503692627, + "eval_runtime": 31.816, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 705 + }, + { + "epoch": 1.83, + "learning_rate": 8.740359897172237e-06, + "loss": 1.6923, + "step": 710 + }, + { + "epoch": 1.83, + "eval_loss": 1.658342957496643, + "eval_runtime": 31.8215, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 710 + }, + { + "epoch": 1.84, + "learning_rate": 8.097686375321336e-06, + "loss": 1.6224, + "step": 715 + }, + { + "epoch": 1.84, + "eval_loss": 1.6582502126693726, + "eval_runtime": 31.812, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 715 + }, + { + "epoch": 1.85, + "learning_rate": 7.4550128534704376e-06, + "loss": 1.6587, + "step": 720 + }, + { + "epoch": 1.85, + "eval_loss": 1.658232569694519, + "eval_runtime": 31.8059, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 720 + }, + { + "epoch": 1.86, + "learning_rate": 6.812339331619537e-06, + "loss": 1.6641, + "step": 725 + }, + { + "epoch": 1.86, + "eval_loss": 1.6582419872283936, + "eval_runtime": 31.81, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 725 + }, + { + "epoch": 1.88, + "learning_rate": 6.169665809768638e-06, + "loss": 1.6419, + "step": 730 + }, + { + "epoch": 1.88, + "eval_loss": 1.6582107543945312, + "eval_runtime": 31.833, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 730 + }, + { + "epoch": 1.89, + "learning_rate": 5.526992287917738e-06, + "loss": 1.652, + "step": 735 + }, + { + "epoch": 1.89, + "eval_loss": 1.6582270860671997, + "eval_runtime": 31.8801, + "eval_samples_per_second": 2.635, + "eval_steps_per_second": 0.345, + "step": 735 + }, + { + "epoch": 1.9, + "learning_rate": 4.884318766066838e-06, + "loss": 1.6487, + "step": 740 + }, + { + "epoch": 1.9, + "eval_loss": 1.6582915782928467, + "eval_runtime": 31.8265, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 0.346, + "step": 740 + }, + { + "epoch": 1.92, + "learning_rate": 4.241645244215939e-06, + "loss": 1.6245, + "step": 745 + }, + { + "epoch": 1.92, + "eval_loss": 1.658047080039978, + "eval_runtime": 31.8109, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 745 + }, + { + "epoch": 1.93, + "learning_rate": 3.598971722365039e-06, + "loss": 1.6553, + "step": 750 + }, + { + "epoch": 1.93, + "eval_loss": 1.6578683853149414, + "eval_runtime": 31.8025, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 750 + }, + { + "epoch": 1.94, + "learning_rate": 2.956298200514139e-06, + "loss": 1.653, + "step": 755 + }, + { + "epoch": 1.94, + "eval_loss": 1.6578799486160278, + "eval_runtime": 31.8005, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 755 + }, + { + "epoch": 1.95, + "learning_rate": 2.313624678663239e-06, + "loss": 1.6519, + "step": 760 + }, + { + "epoch": 1.95, + "eval_loss": 1.657995343208313, + "eval_runtime": 31.8085, + "eval_samples_per_second": 2.641, + "eval_steps_per_second": 0.346, + "step": 760 + }, + { + "epoch": 1.97, + "learning_rate": 1.6709511568123394e-06, + "loss": 1.6955, + "step": 765 + }, + { + "epoch": 1.97, + "eval_loss": 1.6578824520111084, + "eval_runtime": 31.8128, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 0.346, + "step": 765 + }, + { + "epoch": 1.98, + "learning_rate": 1.0282776349614395e-06, + "loss": 1.6327, + "step": 770 + }, + { + "epoch": 1.98, + "eval_loss": 1.6581860780715942, + "eval_runtime": 31.8501, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 0.345, + "step": 770 + } + ], + "logging_steps": 5, + "max_steps": 778, + "num_train_epochs": 2, + "save_steps": 10, + "total_flos": 2.5014589754179584e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-770/training_args.bin b/checkpoint-770/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48088b6a3851b1352bfa758dd2b3ad558168087f --- /dev/null +++ b/checkpoint-770/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc6a8542abd91027de58ebfccafdf919108680707834fe188d91cae077ef8e9 +size 4600 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1fa5ef9a4ba754aaaabcee4c00fae776aa6fd4f0 --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-2-7b-hf", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pretraining_tp": 1, + "quantization_config": { + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": false, + "load_in_8bit": true, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.35.2", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 b/logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 new file mode 100644 index 0000000000000000000000000000000000000000..f20efdc5246f6366e8a67a93f6b7c8cd36236461 --- /dev/null +++ b/logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8338f7ab492bb36430b90fde21a7822be3a5f94276aa93f72ad83aeaf872e40 +size 4868 diff --git a/logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 b/logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 new file mode 100644 index 0000000000000000000000000000000000000000..36e6c56852c10a3b0d616c47a89da6b88a49b027 --- /dev/null +++ b/logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4773b11c0e593bac1fe819e5c6324bb2a3542c475a7d375bb608e46408b2b009 +size 4602 diff --git a/logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 b/logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 new file mode 100644 index 0000000000000000000000000000000000000000..1ecec5e4477180cecbf7cd81d3a407cf2039ba34 --- /dev/null +++ b/logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edff87033b1a80ab70d0a0af49d7943790a698e659a0875f929d78a3299c699d +size 7962 diff --git a/logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 b/logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 new file mode 100644 index 0000000000000000000000000000000000000000..29c10bcc75a3a5598564b1460d27ba3d99c5af0d --- /dev/null +++ b/logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f054998bd37e1977597fdb66e495fb58c70a3b0042fa2ef4428756252ee49a4c +size 33937 diff --git a/logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 b/logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 new file mode 100644 index 0000000000000000000000000000000000000000..07cca291cd96977cb646be128075fa3d738dd6f1 --- /dev/null +++ b/logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:587a2145d6c92f9cbc7c114d9b2d452e13239b0a925f3bdcaa5631c48bdca19a +size 9908 diff --git a/logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 b/logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 new file mode 100644 index 0000000000000000000000000000000000000000..b45e70a37980a5f13f4f8a0bd1d8a3cd7f56bb8f --- /dev/null +++ b/logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fcffea7a0e86207d5c20daaabb71040aa71cb529a4bbddc4a309e37d9666f12 +size 14528 diff --git a/logs/events.out.tfevents.1700068261.node0277.palmetto.clemson.edu.1975944.1 b/logs/events.out.tfevents.1700068261.node0277.palmetto.clemson.edu.1975944.1 new file mode 100644 index 0000000000000000000000000000000000000000..6da2cd634e521bc446d472a50c149b7cfb926d34 --- /dev/null +++ b/logs/events.out.tfevents.1700068261.node0277.palmetto.clemson.edu.1975944.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c82b86ae9d57161db3f3c5a7f67ae342ff011beeb1392fcbe753863174f1ef4 +size 7736 diff --git a/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.0 b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.0 new file mode 100644 index 0000000000000000000000000000000000000000..e71b2157e1c64b684569c28506499b8cedf30397 --- /dev/null +++ b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4110c3d09ad1253563d12b1dd5c4b4b26abe39e0f9f0f2d19de9e70767848b +size 4448 diff --git a/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.1 b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.1 new file mode 100644 index 0000000000000000000000000000000000000000..f4964f505424ce68fc2b7c3a35a565f2c126f7f8 --- /dev/null +++ b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c6b6302c29076bc6e310b9a7d11350256f18814f9ca6a33ca444921708d1c9 +size 4448 diff --git a/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.2 b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.2 new file mode 100644 index 0000000000000000000000000000000000000000..b1dbe66a9e008fe5c07ec16717605275bbb4ad4e --- /dev/null +++ b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769514ef1a9ff351c4b1f2cc29d88b76962d2bb488c82b44156e84237635f6cb +size 7736 diff --git a/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.3 b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.3 new file mode 100644 index 0000000000000000000000000000000000000000..6ef5258af32a95d6eb0097d21dd8112d47c25bb3 --- /dev/null +++ b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e202adc428e3dc6b92fd86a752b2dd3ea53cf8296312c51180754cde46a6ee61 +size 7736 diff --git a/peft_config.json b/peft_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a0ae17c461c5656a5496d761a8139a20e270bce --- /dev/null +++ b/peft_config.json @@ -0,0 +1,23 @@ +{ + "default": { + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "revision": null, + "task_type": "CAUSAL_LM", + "inference_mode": false, + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "lora_alpha": 32, + "lora_dropout": 0.05, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null + } +} \ No newline at end of file diff --git a/tensorboard_logs/events.out.tfevents.1699989359.node0370.palmetto.clemson.edu.1118573.0 b/tensorboard_logs/events.out.tfevents.1699989359.node0370.palmetto.clemson.edu.1118573.0 new file mode 100644 index 0000000000000000000000000000000000000000..6133a316f5466fee2bee69ac5bcfb0e32b864e48 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989359.node0370.palmetto.clemson.edu.1118573.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508bf620a6efea31611edf8b79a5f90903b4a80a317efec567cf96ac92938d9e +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699989567.node0370.palmetto.clemson.edu.1119956.0 b/tensorboard_logs/events.out.tfevents.1699989567.node0370.palmetto.clemson.edu.1119956.0 new file mode 100644 index 0000000000000000000000000000000000000000..a054556c9194738e46b992490b272aec5ae6f260 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989567.node0370.palmetto.clemson.edu.1119956.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809827a3064f9854ffc2910389ec1573eb98079f63da5e52062fbd8671946ea1 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.1 b/tensorboard_logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.1 new file mode 100644 index 0000000000000000000000000000000000000000..c7f57ae801b45d6e67962ead1fddc6ccf0885cd0 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:246a1b62f0367762b2e5adb07db715dd9c6b3d14f7fd72dc33f40509a064ce3e +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.3 b/tensorboard_logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.3 new file mode 100644 index 0000000000000000000000000000000000000000..080533d6c025214ddb6819e1509e15d2edb8d581 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7c4a563a3df99358160cbd9b597dc094c701530d2b0a370c8a90c4b043666c +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699994286.node0370.palmetto.clemson.edu.1126403.0 b/tensorboard_logs/events.out.tfevents.1699994286.node0370.palmetto.clemson.edu.1126403.0 new file mode 100644 index 0000000000000000000000000000000000000000..2e56de8fca32ee8f0d4b08ad379ac7b00ac6f14c --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699994286.node0370.palmetto.clemson.edu.1126403.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c9bc67afc8bbef64c8b9e3bcf1cf3b6761a1dc88662562b96e92d478800129 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.0 b/tensorboard_logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.0 new file mode 100644 index 0000000000000000000000000000000000000000..38b16cf75a41e41ec4d8fac709a8e16a571ded63 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7c9118466c29677c83972c22ea9f891944714f48b4e4722c914361c639b7b6 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.0 b/tensorboard_logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.0 new file mode 100644 index 0000000000000000000000000000000000000000..5a92b5c9996a429300de3e499037f2e531ac7f2b --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a756f5eab83a8540e11fd8b983679204f19859f4f609f2079d04cc709bc8ad94 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1700066156.node0277.palmetto.clemson.edu.1973537.0 b/tensorboard_logs/events.out.tfevents.1700066156.node0277.palmetto.clemson.edu.1973537.0 new file mode 100644 index 0000000000000000000000000000000000000000..4ef7d3dd1c970500a80530761e4a1690069ad2e4 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1700066156.node0277.palmetto.clemson.edu.1973537.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba86d0c48d706e23e958d9e5f5bf4625871675a89b4d1d50619fcc769d9e939a +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1700068260.node0277.palmetto.clemson.edu.1975944.0 b/tensorboard_logs/events.out.tfevents.1700068260.node0277.palmetto.clemson.edu.1975944.0 new file mode 100644 index 0000000000000000000000000000000000000000..cfe7aeb2ca7461b90c23b7ea77accdd1dc096f73 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1700068260.node0277.palmetto.clemson.edu.1975944.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5eee7e7bfdf9096c9ff890fd47ea9d2ed9ed48c9dec1229e82b19f2001064a5 +size 88