diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6d0fa148c52f94f4dde143314e8ddb46b5d3258c --- /dev/null +++ b/README.md @@ -0,0 +1,267 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6d9ff3bfcea32ea90a3ddeb88352108d838b4ee --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-70b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..069ea516a881e0917a6287c3a3cc5522b17498e1 --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fcc18a4181c065d107e66cca9058af5a18b263f6263db5b9e223ccd13b5a128 +size 65652106 diff --git a/checkpoint-120/README.md b/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-120/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-120/adapter_config.json b/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-120/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-120/adapter_model.safetensors b/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9dfd00029661fa99ff8181d0ef1fdb443240cde4 --- /dev/null +++ b/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62efd5c2cab302e216422bdba2e55c8799578254d233788eab87a5ab9d4e7aa6 +size 16794200 diff --git a/checkpoint-120/optimizer.pt b/checkpoint-120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c13fb82535cd9071bc6b029074e9c53a852f3a2f --- /dev/null +++ b/checkpoint-120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b492af0c9917e531befd23e336bf7606d5ab7c02fc99d746779bf82c4104a8d +size 33663866 diff --git a/checkpoint-120/rng_state.pth b/checkpoint-120/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a30ce8c2d7608ff21838dbc8f31ab59255dacf53 --- /dev/null +++ b/checkpoint-120/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02fba8e88cedb0246f54d160518d95e35995cb5dc7cd1fa4e39f15fddf38af99 +size 14244 diff --git a/checkpoint-120/scheduler.pt b/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca1cd1490010843cdfc83b6da4e5f2fbd2e24b6d --- /dev/null +++ b/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d483870968c555f36109349b17935837f24c25f4c05d770d0123f84163b943e +size 1064 diff --git a/checkpoint-120/trainer_state.json b/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8bcafcec06e37154beac02e91127f45b0e4a8c2e --- /dev/null +++ b/checkpoint-120/trainer_state.json @@ -0,0 +1,355 @@ +{ + "best_metric": 1.6305923461914062, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-120", + "epoch": 8.571428571428571, + "eval_steps": 5, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.36, + "learning_rate": 9.642857142857143e-05, + "loss": 2.2959, + "step": 5 + }, + { + "epoch": 0.36, + "eval_loss": 2.26009202003479, + "eval_runtime": 2.5228, + "eval_samples_per_second": 2.378, + "eval_steps_per_second": 0.396, + "step": 5 + }, + { + "epoch": 0.71, + "learning_rate": 9.285714285714286e-05, + "loss": 2.2056, + "step": 10 + }, + { + "epoch": 0.71, + "eval_loss": 2.15522837638855, + "eval_runtime": 2.5338, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.395, + "step": 10 + }, + { + "epoch": 1.07, + "learning_rate": 8.92857142857143e-05, + "loss": 2.097, + "step": 15 + }, + { + "epoch": 1.07, + "eval_loss": 2.0667991638183594, + "eval_runtime": 2.5411, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 15 + }, + { + "epoch": 1.43, + "learning_rate": 8.571428571428571e-05, + "loss": 2.0293, + "step": 20 + }, + { + "epoch": 1.43, + "eval_loss": 2.0328972339630127, + "eval_runtime": 2.5419, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 20 + }, + { + "epoch": 1.79, + "learning_rate": 8.214285714285714e-05, + "loss": 2.0228, + "step": 25 + }, + { + "epoch": 1.79, + "eval_loss": 1.998112678527832, + "eval_runtime": 2.5416, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.393, + "step": 25 + }, + { + "epoch": 2.14, + "learning_rate": 7.857142857142858e-05, + "loss": 1.9493, + "step": 30 + }, + { + "epoch": 2.14, + "eval_loss": 1.968154788017273, + "eval_runtime": 2.5414, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.393, + "step": 30 + }, + { + "epoch": 2.5, + "learning_rate": 7.500000000000001e-05, + "loss": 1.9252, + "step": 35 + }, + { + "epoch": 2.5, + "eval_loss": 1.937127947807312, + "eval_runtime": 2.5401, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.394, + "step": 35 + }, + { + "epoch": 2.86, + "learning_rate": 7.142857142857143e-05, + "loss": 1.8848, + "step": 40 + }, + { + "epoch": 2.86, + "eval_loss": 1.9035807847976685, + "eval_runtime": 2.5391, + "eval_samples_per_second": 2.363, + "eval_steps_per_second": 0.394, + "step": 40 + }, + { + "epoch": 3.21, + "learning_rate": 6.785714285714286e-05, + "loss": 1.8708, + "step": 45 + }, + { + "epoch": 3.21, + "eval_loss": 1.8712326288223267, + "eval_runtime": 2.5413, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 45 + }, + { + "epoch": 3.57, + "learning_rate": 6.428571428571429e-05, + "loss": 1.795, + "step": 50 + }, + { + "epoch": 3.57, + "eval_loss": 1.8411849737167358, + "eval_runtime": 2.5425, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 50 + }, + { + "epoch": 3.93, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.7851, + "step": 55 + }, + { + "epoch": 3.93, + "eval_loss": 1.8131023645401, + "eval_runtime": 2.5418, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 55 + }, + { + "epoch": 4.29, + "learning_rate": 5.714285714285714e-05, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 4.29, + "eval_loss": 1.7883529663085938, + "eval_runtime": 2.541, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 60 + }, + { + "epoch": 4.64, + "learning_rate": 5.3571428571428575e-05, + "loss": 1.7509, + "step": 65 + }, + { + "epoch": 4.64, + "eval_loss": 1.7668665647506714, + "eval_runtime": 2.5422, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 65 + }, + { + "epoch": 5.0, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 70 + }, + { + "epoch": 5.0, + "eval_loss": 1.7516651153564453, + "eval_runtime": 2.5412, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 70 + }, + { + "epoch": 5.36, + "learning_rate": 4.642857142857143e-05, + "loss": 1.6734, + "step": 75 + }, + { + "epoch": 5.36, + "eval_loss": 1.7334843873977661, + "eval_runtime": 2.5431, + "eval_samples_per_second": 2.359, + "eval_steps_per_second": 0.393, + "step": 75 + }, + { + "epoch": 5.71, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.6719, + "step": 80 + }, + { + "epoch": 5.71, + "eval_loss": 1.7061794996261597, + "eval_runtime": 2.5412, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 80 + }, + { + "epoch": 6.07, + "learning_rate": 3.928571428571429e-05, + "loss": 1.5932, + "step": 85 + }, + { + "epoch": 6.07, + "eval_loss": 1.6727076768875122, + "eval_runtime": 2.5367, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 85 + }, + { + "epoch": 6.43, + "learning_rate": 3.571428571428572e-05, + "loss": 1.5866, + "step": 90 + }, + { + "epoch": 6.43, + "eval_loss": 1.66474187374115, + "eval_runtime": 2.5383, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 90 + }, + { + "epoch": 6.79, + "learning_rate": 3.2142857142857144e-05, + "loss": 1.577, + "step": 95 + }, + { + "epoch": 6.79, + "eval_loss": 1.6590815782546997, + "eval_runtime": 2.5384, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 95 + }, + { + "epoch": 7.14, + "learning_rate": 2.857142857142857e-05, + "loss": 1.5532, + "step": 100 + }, + { + "epoch": 7.14, + "eval_loss": 1.6509045362472534, + "eval_runtime": 2.5429, + "eval_samples_per_second": 2.359, + "eval_steps_per_second": 0.393, + "step": 100 + }, + { + "epoch": 7.5, + "learning_rate": 2.5e-05, + "loss": 1.5099, + "step": 105 + }, + { + "epoch": 7.5, + "eval_loss": 1.6463295221328735, + "eval_runtime": 2.5379, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 105 + }, + { + "epoch": 7.86, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.5717, + "step": 110 + }, + { + "epoch": 7.86, + "eval_loss": 1.6409095525741577, + "eval_runtime": 2.5359, + "eval_samples_per_second": 2.366, + "eval_steps_per_second": 0.394, + "step": 110 + }, + { + "epoch": 8.21, + "learning_rate": 1.785714285714286e-05, + "loss": 1.5354, + "step": 115 + }, + { + "epoch": 8.21, + "eval_loss": 1.6344412565231323, + "eval_runtime": 2.5372, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 115 + }, + { + "epoch": 8.57, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.5127, + "step": 120 + }, + { + "epoch": 8.57, + "eval_loss": 1.6305923461914062, + "eval_runtime": 2.537, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 140, + "num_train_epochs": 10, + "save_steps": 10, + "total_flos": 3.89964374212608e+16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-120/training_args.bin b/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ba36f72077528973d8b5af1c35d1ba6ec0d1ea6 --- /dev/null +++ b/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584bdb07d1eef7bc6a4f12879793b3487ecc8146c1335851f27ac0cac2080d65 +size 4600 diff --git a/checkpoint-130/README.md b/checkpoint-130/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-130/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-130/adapter_config.json b/checkpoint-130/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-130/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-130/adapter_model.safetensors b/checkpoint-130/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f9614e426d0318c96f4d6b08413436693b00659 --- /dev/null +++ b/checkpoint-130/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bf971da943c24cf2309c9781a35c85b99251124517b3700a11fad54a7449bdf +size 16794200 diff --git a/checkpoint-130/optimizer.pt b/checkpoint-130/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e60dcb18429170f82eb43f034186fbfdd8fa90d4 --- /dev/null +++ b/checkpoint-130/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85432cbb3394c47f731a428717c461cd6c0b05fcbdb32de6ebcc7368f8892fe5 +size 33663866 diff --git a/checkpoint-130/rng_state.pth b/checkpoint-130/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a722030db4d2b6895be4949450f3d537606a0272 --- /dev/null +++ b/checkpoint-130/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:685328dbc242eb3fbc87935b960d8b4f81528ca5a1536e2981f75930a9a32737 +size 14244 diff --git a/checkpoint-130/scheduler.pt b/checkpoint-130/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..90a32d92a4e52e8b582f9f469575e566c350c7f9 --- /dev/null +++ b/checkpoint-130/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17c5fa4f32b4713c8e2d7526e153039e69cef4d5ae6474766f511c2326d33908 +size 1064 diff --git a/checkpoint-130/trainer_state.json b/checkpoint-130/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9c5bf2ce3a679f7aa2820880fc6bfccbdc37eb1 --- /dev/null +++ b/checkpoint-130/trainer_state.json @@ -0,0 +1,383 @@ +{ + "best_metric": 1.6272777318954468, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-130", + "epoch": 9.285714285714286, + "eval_steps": 5, + "global_step": 130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.36, + "learning_rate": 9.642857142857143e-05, + "loss": 2.2959, + "step": 5 + }, + { + "epoch": 0.36, + "eval_loss": 2.26009202003479, + "eval_runtime": 2.5228, + "eval_samples_per_second": 2.378, + "eval_steps_per_second": 0.396, + "step": 5 + }, + { + "epoch": 0.71, + "learning_rate": 9.285714285714286e-05, + "loss": 2.2056, + "step": 10 + }, + { + "epoch": 0.71, + "eval_loss": 2.15522837638855, + "eval_runtime": 2.5338, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.395, + "step": 10 + }, + { + "epoch": 1.07, + "learning_rate": 8.92857142857143e-05, + "loss": 2.097, + "step": 15 + }, + { + "epoch": 1.07, + "eval_loss": 2.0667991638183594, + "eval_runtime": 2.5411, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 15 + }, + { + "epoch": 1.43, + "learning_rate": 8.571428571428571e-05, + "loss": 2.0293, + "step": 20 + }, + { + "epoch": 1.43, + "eval_loss": 2.0328972339630127, + "eval_runtime": 2.5419, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 20 + }, + { + "epoch": 1.79, + "learning_rate": 8.214285714285714e-05, + "loss": 2.0228, + "step": 25 + }, + { + "epoch": 1.79, + "eval_loss": 1.998112678527832, + "eval_runtime": 2.5416, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.393, + "step": 25 + }, + { + "epoch": 2.14, + "learning_rate": 7.857142857142858e-05, + "loss": 1.9493, + "step": 30 + }, + { + "epoch": 2.14, + "eval_loss": 1.968154788017273, + "eval_runtime": 2.5414, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.393, + "step": 30 + }, + { + "epoch": 2.5, + "learning_rate": 7.500000000000001e-05, + "loss": 1.9252, + "step": 35 + }, + { + "epoch": 2.5, + "eval_loss": 1.937127947807312, + "eval_runtime": 2.5401, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.394, + "step": 35 + }, + { + "epoch": 2.86, + "learning_rate": 7.142857142857143e-05, + "loss": 1.8848, + "step": 40 + }, + { + "epoch": 2.86, + "eval_loss": 1.9035807847976685, + "eval_runtime": 2.5391, + "eval_samples_per_second": 2.363, + "eval_steps_per_second": 0.394, + "step": 40 + }, + { + "epoch": 3.21, + "learning_rate": 6.785714285714286e-05, + "loss": 1.8708, + "step": 45 + }, + { + "epoch": 3.21, + "eval_loss": 1.8712326288223267, + "eval_runtime": 2.5413, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 45 + }, + { + "epoch": 3.57, + "learning_rate": 6.428571428571429e-05, + "loss": 1.795, + "step": 50 + }, + { + "epoch": 3.57, + "eval_loss": 1.8411849737167358, + "eval_runtime": 2.5425, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 50 + }, + { + "epoch": 3.93, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.7851, + "step": 55 + }, + { + "epoch": 3.93, + "eval_loss": 1.8131023645401, + "eval_runtime": 2.5418, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 55 + }, + { + "epoch": 4.29, + "learning_rate": 5.714285714285714e-05, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 4.29, + "eval_loss": 1.7883529663085938, + "eval_runtime": 2.541, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 60 + }, + { + "epoch": 4.64, + "learning_rate": 5.3571428571428575e-05, + "loss": 1.7509, + "step": 65 + }, + { + "epoch": 4.64, + "eval_loss": 1.7668665647506714, + "eval_runtime": 2.5422, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 65 + }, + { + "epoch": 5.0, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 70 + }, + { + "epoch": 5.0, + "eval_loss": 1.7516651153564453, + "eval_runtime": 2.5412, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 70 + }, + { + "epoch": 5.36, + "learning_rate": 4.642857142857143e-05, + "loss": 1.6734, + "step": 75 + }, + { + "epoch": 5.36, + "eval_loss": 1.7334843873977661, + "eval_runtime": 2.5431, + "eval_samples_per_second": 2.359, + "eval_steps_per_second": 0.393, + "step": 75 + }, + { + "epoch": 5.71, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.6719, + "step": 80 + }, + { + "epoch": 5.71, + "eval_loss": 1.7061794996261597, + "eval_runtime": 2.5412, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 80 + }, + { + "epoch": 6.07, + "learning_rate": 3.928571428571429e-05, + "loss": 1.5932, + "step": 85 + }, + { + "epoch": 6.07, + "eval_loss": 1.6727076768875122, + "eval_runtime": 2.5367, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 85 + }, + { + "epoch": 6.43, + "learning_rate": 3.571428571428572e-05, + "loss": 1.5866, + "step": 90 + }, + { + "epoch": 6.43, + "eval_loss": 1.66474187374115, + "eval_runtime": 2.5383, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 90 + }, + { + "epoch": 6.79, + "learning_rate": 3.2142857142857144e-05, + "loss": 1.577, + "step": 95 + }, + { + "epoch": 6.79, + "eval_loss": 1.6590815782546997, + "eval_runtime": 2.5384, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 95 + }, + { + "epoch": 7.14, + "learning_rate": 2.857142857142857e-05, + "loss": 1.5532, + "step": 100 + }, + { + "epoch": 7.14, + "eval_loss": 1.6509045362472534, + "eval_runtime": 2.5429, + "eval_samples_per_second": 2.359, + "eval_steps_per_second": 0.393, + "step": 100 + }, + { + "epoch": 7.5, + "learning_rate": 2.5e-05, + "loss": 1.5099, + "step": 105 + }, + { + "epoch": 7.5, + "eval_loss": 1.6463295221328735, + "eval_runtime": 2.5379, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 105 + }, + { + "epoch": 7.86, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.5717, + "step": 110 + }, + { + "epoch": 7.86, + "eval_loss": 1.6409095525741577, + "eval_runtime": 2.5359, + "eval_samples_per_second": 2.366, + "eval_steps_per_second": 0.394, + "step": 110 + }, + { + "epoch": 8.21, + "learning_rate": 1.785714285714286e-05, + "loss": 1.5354, + "step": 115 + }, + { + "epoch": 8.21, + "eval_loss": 1.6344412565231323, + "eval_runtime": 2.5372, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 115 + }, + { + "epoch": 8.57, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.5127, + "step": 120 + }, + { + "epoch": 8.57, + "eval_loss": 1.6305923461914062, + "eval_runtime": 2.537, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 120 + }, + { + "epoch": 8.93, + "learning_rate": 1.0714285714285714e-05, + "loss": 1.5234, + "step": 125 + }, + { + "epoch": 8.93, + "eval_loss": 1.6271618604660034, + "eval_runtime": 2.5403, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.394, + "step": 125 + }, + { + "epoch": 9.29, + "learning_rate": 7.142857142857143e-06, + "loss": 1.4811, + "step": 130 + }, + { + "epoch": 9.29, + "eval_loss": 1.6272777318954468, + "eval_runtime": 2.5362, + "eval_samples_per_second": 2.366, + "eval_steps_per_second": 0.394, + "step": 130 + } + ], + "logging_steps": 5, + "max_steps": 140, + "num_train_epochs": 10, + "save_steps": 10, + "total_flos": 4.22461405396992e+16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-130/training_args.bin b/checkpoint-130/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ba36f72077528973d8b5af1c35d1ba6ec0d1ea6 --- /dev/null +++ b/checkpoint-130/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584bdb07d1eef7bc6a4f12879793b3487ecc8146c1335851f27ac0cac2080d65 +size 4600 diff --git a/checkpoint-140/README.md b/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4787359fdf0253321d922e272bacc387ca46ce22 --- /dev/null +++ b/checkpoint-140/README.md @@ -0,0 +1,21 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- load_in_8bit: True +- load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-140/adapter_config.json b/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96a10d09371f10f296c5b7fcb5b0ddd7be98eef2 --- /dev/null +++ b/checkpoint-140/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-140/adapter_model.safetensors b/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b09af703b80284e88c67bcc3b923c079030439e --- /dev/null +++ b/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deacdec1e19e3dc341f8b2842de0a46ea150ef194e7c21c513d50e197684b64b +size 16794200 diff --git a/checkpoint-140/optimizer.pt b/checkpoint-140/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42b7912a5aaa053190495b1c85007e2c52135ef --- /dev/null +++ b/checkpoint-140/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee1e452a5a0ffd2da36e28a66280aed0ac73a5d69bf887705cedcede4fe70297 +size 33663866 diff --git a/checkpoint-140/rng_state.pth b/checkpoint-140/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..89e723df2fd737cd762d0adbd661f3e859c13c45 --- /dev/null +++ b/checkpoint-140/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf7d2b3876e2e10b18c6cac5ddf00968ca6923be08b3bd84e271aa516fea7de +size 14244 diff --git a/checkpoint-140/scheduler.pt b/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa3305bb5fdc717364f92363af29463294419fe1 --- /dev/null +++ b/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8c98fcf42f494fa75f135fb338a20acefff4d78f4f44abc68b10fde88458a02 +size 1064 diff --git a/checkpoint-140/trainer_state.json b/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5d756dfd10b9bb65ce51e512498f3e70d3f2bc3c --- /dev/null +++ b/checkpoint-140/trainer_state.json @@ -0,0 +1,411 @@ +{ + "best_metric": 1.623923897743225, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-140", + "epoch": 10.0, + "eval_steps": 5, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.36, + "learning_rate": 9.642857142857143e-05, + "loss": 2.2959, + "step": 5 + }, + { + "epoch": 0.36, + "eval_loss": 2.26009202003479, + "eval_runtime": 2.5228, + "eval_samples_per_second": 2.378, + "eval_steps_per_second": 0.396, + "step": 5 + }, + { + "epoch": 0.71, + "learning_rate": 9.285714285714286e-05, + "loss": 2.2056, + "step": 10 + }, + { + "epoch": 0.71, + "eval_loss": 2.15522837638855, + "eval_runtime": 2.5338, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.395, + "step": 10 + }, + { + "epoch": 1.07, + "learning_rate": 8.92857142857143e-05, + "loss": 2.097, + "step": 15 + }, + { + "epoch": 1.07, + "eval_loss": 2.0667991638183594, + "eval_runtime": 2.5411, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 15 + }, + { + "epoch": 1.43, + "learning_rate": 8.571428571428571e-05, + "loss": 2.0293, + "step": 20 + }, + { + "epoch": 1.43, + "eval_loss": 2.0328972339630127, + "eval_runtime": 2.5419, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 20 + }, + { + "epoch": 1.79, + "learning_rate": 8.214285714285714e-05, + "loss": 2.0228, + "step": 25 + }, + { + "epoch": 1.79, + "eval_loss": 1.998112678527832, + "eval_runtime": 2.5416, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.393, + "step": 25 + }, + { + "epoch": 2.14, + "learning_rate": 7.857142857142858e-05, + "loss": 1.9493, + "step": 30 + }, + { + "epoch": 2.14, + "eval_loss": 1.968154788017273, + "eval_runtime": 2.5414, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.393, + "step": 30 + }, + { + "epoch": 2.5, + "learning_rate": 7.500000000000001e-05, + "loss": 1.9252, + "step": 35 + }, + { + "epoch": 2.5, + "eval_loss": 1.937127947807312, + "eval_runtime": 2.5401, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.394, + "step": 35 + }, + { + "epoch": 2.86, + "learning_rate": 7.142857142857143e-05, + "loss": 1.8848, + "step": 40 + }, + { + "epoch": 2.86, + "eval_loss": 1.9035807847976685, + "eval_runtime": 2.5391, + "eval_samples_per_second": 2.363, + "eval_steps_per_second": 0.394, + "step": 40 + }, + { + "epoch": 3.21, + "learning_rate": 6.785714285714286e-05, + "loss": 1.8708, + "step": 45 + }, + { + "epoch": 3.21, + "eval_loss": 1.8712326288223267, + "eval_runtime": 2.5413, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 45 + }, + { + "epoch": 3.57, + "learning_rate": 6.428571428571429e-05, + "loss": 1.795, + "step": 50 + }, + { + "epoch": 3.57, + "eval_loss": 1.8411849737167358, + "eval_runtime": 2.5425, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 50 + }, + { + "epoch": 3.93, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.7851, + "step": 55 + }, + { + "epoch": 3.93, + "eval_loss": 1.8131023645401, + "eval_runtime": 2.5418, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 55 + }, + { + "epoch": 4.29, + "learning_rate": 5.714285714285714e-05, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 4.29, + "eval_loss": 1.7883529663085938, + "eval_runtime": 2.541, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 60 + }, + { + "epoch": 4.64, + "learning_rate": 5.3571428571428575e-05, + "loss": 1.7509, + "step": 65 + }, + { + "epoch": 4.64, + "eval_loss": 1.7668665647506714, + "eval_runtime": 2.5422, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.393, + "step": 65 + }, + { + "epoch": 5.0, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 70 + }, + { + "epoch": 5.0, + "eval_loss": 1.7516651153564453, + "eval_runtime": 2.5412, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 70 + }, + { + "epoch": 5.36, + "learning_rate": 4.642857142857143e-05, + "loss": 1.6734, + "step": 75 + }, + { + "epoch": 5.36, + "eval_loss": 1.7334843873977661, + "eval_runtime": 2.5431, + "eval_samples_per_second": 2.359, + "eval_steps_per_second": 0.393, + "step": 75 + }, + { + "epoch": 5.71, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.6719, + "step": 80 + }, + { + "epoch": 5.71, + "eval_loss": 1.7061794996261597, + "eval_runtime": 2.5412, + "eval_samples_per_second": 2.361, + "eval_steps_per_second": 0.394, + "step": 80 + }, + { + "epoch": 6.07, + "learning_rate": 3.928571428571429e-05, + "loss": 1.5932, + "step": 85 + }, + { + "epoch": 6.07, + "eval_loss": 1.6727076768875122, + "eval_runtime": 2.5367, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 85 + }, + { + "epoch": 6.43, + "learning_rate": 3.571428571428572e-05, + "loss": 1.5866, + "step": 90 + }, + { + "epoch": 6.43, + "eval_loss": 1.66474187374115, + "eval_runtime": 2.5383, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 90 + }, + { + "epoch": 6.79, + "learning_rate": 3.2142857142857144e-05, + "loss": 1.577, + "step": 95 + }, + { + "epoch": 6.79, + "eval_loss": 1.6590815782546997, + "eval_runtime": 2.5384, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 95 + }, + { + "epoch": 7.14, + "learning_rate": 2.857142857142857e-05, + "loss": 1.5532, + "step": 100 + }, + { + "epoch": 7.14, + "eval_loss": 1.6509045362472534, + "eval_runtime": 2.5429, + "eval_samples_per_second": 2.359, + "eval_steps_per_second": 0.393, + "step": 100 + }, + { + "epoch": 7.5, + "learning_rate": 2.5e-05, + "loss": 1.5099, + "step": 105 + }, + { + "epoch": 7.5, + "eval_loss": 1.6463295221328735, + "eval_runtime": 2.5379, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 105 + }, + { + "epoch": 7.86, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.5717, + "step": 110 + }, + { + "epoch": 7.86, + "eval_loss": 1.6409095525741577, + "eval_runtime": 2.5359, + "eval_samples_per_second": 2.366, + "eval_steps_per_second": 0.394, + "step": 110 + }, + { + "epoch": 8.21, + "learning_rate": 1.785714285714286e-05, + "loss": 1.5354, + "step": 115 + }, + { + "epoch": 8.21, + "eval_loss": 1.6344412565231323, + "eval_runtime": 2.5372, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 115 + }, + { + "epoch": 8.57, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.5127, + "step": 120 + }, + { + "epoch": 8.57, + "eval_loss": 1.6305923461914062, + "eval_runtime": 2.537, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.394, + "step": 120 + }, + { + "epoch": 8.93, + "learning_rate": 1.0714285714285714e-05, + "loss": 1.5234, + "step": 125 + }, + { + "epoch": 8.93, + "eval_loss": 1.6271618604660034, + "eval_runtime": 2.5403, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.394, + "step": 125 + }, + { + "epoch": 9.29, + "learning_rate": 7.142857142857143e-06, + "loss": 1.4811, + "step": 130 + }, + { + "epoch": 9.29, + "eval_loss": 1.6272777318954468, + "eval_runtime": 2.5362, + "eval_samples_per_second": 2.366, + "eval_steps_per_second": 0.394, + "step": 130 + }, + { + "epoch": 9.64, + "learning_rate": 3.5714285714285714e-06, + "loss": 1.5167, + "step": 135 + }, + { + "epoch": 9.64, + "eval_loss": 1.625247597694397, + "eval_runtime": 2.5383, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.394, + "step": 135 + }, + { + "epoch": 10.0, + "learning_rate": 0.0, + "loss": 1.4972, + "step": 140 + }, + { + "epoch": 10.0, + "eval_loss": 1.623923897743225, + "eval_runtime": 2.536, + "eval_samples_per_second": 2.366, + "eval_steps_per_second": 0.394, + "step": 140 + } + ], + "logging_steps": 5, + "max_steps": 140, + "num_train_epochs": 10, + "save_steps": 10, + "total_flos": 4.54958436581376e+16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-140/training_args.bin b/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ba36f72077528973d8b5af1c35d1ba6ec0d1ea6 --- /dev/null +++ b/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584bdb07d1eef7bc6a4f12879793b3487ecc8146c1335851f27ac0cac2080d65 +size 4600 diff --git a/checkpoint-60/README.md b/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cf6fbe7c7a9fe6356cea61b987d8e2c5c40364f7 --- /dev/null +++ b/checkpoint-60/README.md @@ -0,0 +1,23 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoint-60/adapter_config.json b/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6d9ff3bfcea32ea90a3ddeb88352108d838b4ee --- /dev/null +++ b/checkpoint-60/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-70b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-60/adapter_model.safetensors b/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bbde6827dc75886efc63486afdf38f10ccdc0942 --- /dev/null +++ b/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd8f64fda45ff6615b3e4252c72be74e77270ca7b822f9dcb75c103285df211 +size 65578776 diff --git a/checkpoint-60/optimizer.pt b/checkpoint-60/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec4d09a3ed8063a591d2cf167b3cebcb2a26b0a1 --- /dev/null +++ b/checkpoint-60/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3fc876fe0e006834631446ad7f0e991808e2435e346516e2ba28c5184fae096 +size 131345914 diff --git a/checkpoint-60/rng_state.pth b/checkpoint-60/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..45be8ce9f16dd05f634d6e30ad09a239fa2f231e --- /dev/null +++ b/checkpoint-60/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b438a27eddee154866003dcf691d9169d17b12b3ded46c944feca5f44c9583 +size 14244 diff --git a/checkpoint-60/scheduler.pt b/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c54c5c361ddf63508504ee3885ab802c105474d --- /dev/null +++ b/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:532a066b51f06d04618044a51287ede2469b5330f8f8e15d3cb8c13dc71684e6 +size 1064 diff --git a/checkpoint-60/trainer_state.json b/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bde3cab8bf513e4a81e6abd82b5f8695cd826a3d --- /dev/null +++ b/checkpoint-60/trainer_state.json @@ -0,0 +1,201 @@ +{ + "best_metric": 1.6068811416625977, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-60", + "epoch": 8.275862068965518, + "eval_steps": 5, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.69, + "grad_norm": 0.27489739656448364, + "learning_rate": 9.285714285714286e-05, + "loss": 1.9945, + "step": 5 + }, + { + "epoch": 0.69, + "eval_loss": 2.0204718112945557, + "eval_runtime": 14.7113, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 5 + }, + { + "epoch": 1.38, + "grad_norm": 0.36750176548957825, + "learning_rate": 8.571428571428571e-05, + "loss": 1.9427, + "step": 10 + }, + { + "epoch": 1.38, + "eval_loss": 1.9357692003250122, + "eval_runtime": 14.713, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 10 + }, + { + "epoch": 2.07, + "grad_norm": 0.4677024185657501, + "learning_rate": 7.857142857142858e-05, + "loss": 1.8651, + "step": 15 + }, + { + "epoch": 2.07, + "eval_loss": 1.8461824655532837, + "eval_runtime": 14.7203, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 15 + }, + { + "epoch": 2.76, + "grad_norm": 0.40606027841567993, + "learning_rate": 7.142857142857143e-05, + "loss": 1.7948, + "step": 20 + }, + { + "epoch": 2.76, + "eval_loss": 1.801284670829773, + "eval_runtime": 14.7281, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 20 + }, + { + "epoch": 3.45, + "grad_norm": 0.3058153986930847, + "learning_rate": 6.428571428571429e-05, + "loss": 1.7233, + "step": 25 + }, + { + "epoch": 3.45, + "eval_loss": 1.766973853111267, + "eval_runtime": 14.6916, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 25 + }, + { + "epoch": 4.14, + "grad_norm": 0.2967859208583832, + "learning_rate": 5.714285714285714e-05, + "loss": 1.7117, + "step": 30 + }, + { + "epoch": 4.14, + "eval_loss": 1.7350422143936157, + "eval_runtime": 14.7126, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 30 + }, + { + "epoch": 4.83, + "grad_norm": 0.31578946113586426, + "learning_rate": 5e-05, + "loss": 1.6904, + "step": 35 + }, + { + "epoch": 4.83, + "eval_loss": 1.7066936492919922, + "eval_runtime": 14.7265, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 35 + }, + { + "epoch": 5.52, + "grad_norm": 0.34217244386672974, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.6459, + "step": 40 + }, + { + "epoch": 5.52, + "eval_loss": 1.6809816360473633, + "eval_runtime": 14.7004, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 40 + }, + { + "epoch": 6.21, + "grad_norm": 0.36268118023872375, + "learning_rate": 3.571428571428572e-05, + "loss": 1.5992, + "step": 45 + }, + { + "epoch": 6.21, + "eval_loss": 1.6565723419189453, + "eval_runtime": 14.6987, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 45 + }, + { + "epoch": 6.9, + "grad_norm": 0.3854450583457947, + "learning_rate": 2.857142857142857e-05, + "loss": 1.6088, + "step": 50 + }, + { + "epoch": 6.9, + "eval_loss": 1.6376428604125977, + "eval_runtime": 14.7228, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 50 + }, + { + "epoch": 7.59, + "grad_norm": 0.39184266328811646, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.5722, + "step": 55 + }, + { + "epoch": 7.59, + "eval_loss": 1.6203418970108032, + "eval_runtime": 14.7373, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 55 + }, + { + "epoch": 8.28, + "grad_norm": 0.41270822286605835, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.5593, + "step": 60 + }, + { + "epoch": 8.28, + "eval_loss": 1.6068811416625977, + "eval_runtime": 14.7065, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 70, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10, + "total_flos": 4.053912739695821e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-60/training_args.bin b/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1afa2f9ab75d020a21f04c7f7b6890b1c419092d --- /dev/null +++ b/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c74a2b62a8cab8f69632f5511d4781e140b9b59c7c1d8108fbd66233b3d511 +size 4920 diff --git a/checkpoint-70/README.md b/checkpoint-70/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac27fe6c4d211d9dd753d3b08dc4cc6c884b865b --- /dev/null +++ b/checkpoint-70/README.md @@ -0,0 +1,68 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False + +The following `bitsandbytes` quantization config was used during training: +- quant_method: bitsandbytes +- _load_in_8bit: False +- _load_in_4bit: False +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +- load_in_4bit: False +- load_in_8bit: False +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoint-70/adapter_config.json b/checkpoint-70/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6d9ff3bfcea32ea90a3ddeb88352108d838b4ee --- /dev/null +++ b/checkpoint-70/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-70b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-70/adapter_model.safetensors b/checkpoint-70/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca2074821112a6d9e717d149841f77b5d96fbb8e --- /dev/null +++ b/checkpoint-70/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151c72fda5d538af519864c3bc1a2b3cd0d654e54208865ae1c7ae608be4af9a +size 65578776 diff --git a/checkpoint-70/optimizer.pt b/checkpoint-70/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..57a5dece0b3e16d140eb2d92d6038e53e377d271 --- /dev/null +++ b/checkpoint-70/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c0d5b07ddbceee7338bc556836c713b321200dc211a359184f6293a0d66b40 +size 131345914 diff --git a/checkpoint-70/rng_state.pth b/checkpoint-70/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..17ab047675b25f9b7ec3b891acac539ab0a8a808 --- /dev/null +++ b/checkpoint-70/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f0e1881495fd4cc456489a0d9fe631a628cf2bf127bb12be7abb9a7388d623 +size 14244 diff --git a/checkpoint-70/scheduler.pt b/checkpoint-70/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..22e097bb106e1371f11a4be31eaca3b6aa59ad98 --- /dev/null +++ b/checkpoint-70/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6972eef268c24c724483865d9931d7303b58c4b5768be8129b5fb16010a7eb9 +size 1064 diff --git a/checkpoint-70/trainer_state.json b/checkpoint-70/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d582a4dad8827b681120b83fab968a1852452845 --- /dev/null +++ b/checkpoint-70/trainer_state.json @@ -0,0 +1,231 @@ +{ + "best_metric": 1.5954241752624512, + "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-70", + "epoch": 9.655172413793103, + "eval_steps": 5, + "global_step": 70, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.69, + "grad_norm": 0.27489739656448364, + "learning_rate": 9.285714285714286e-05, + "loss": 1.9945, + "step": 5 + }, + { + "epoch": 0.69, + "eval_loss": 2.0204718112945557, + "eval_runtime": 14.7113, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 5 + }, + { + "epoch": 1.38, + "grad_norm": 0.36750176548957825, + "learning_rate": 8.571428571428571e-05, + "loss": 1.9427, + "step": 10 + }, + { + "epoch": 1.38, + "eval_loss": 1.9357692003250122, + "eval_runtime": 14.713, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 10 + }, + { + "epoch": 2.07, + "grad_norm": 0.4677024185657501, + "learning_rate": 7.857142857142858e-05, + "loss": 1.8651, + "step": 15 + }, + { + "epoch": 2.07, + "eval_loss": 1.8461824655532837, + "eval_runtime": 14.7203, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 15 + }, + { + "epoch": 2.76, + "grad_norm": 0.40606027841567993, + "learning_rate": 7.142857142857143e-05, + "loss": 1.7948, + "step": 20 + }, + { + "epoch": 2.76, + "eval_loss": 1.801284670829773, + "eval_runtime": 14.7281, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 20 + }, + { + "epoch": 3.45, + "grad_norm": 0.3058153986930847, + "learning_rate": 6.428571428571429e-05, + "loss": 1.7233, + "step": 25 + }, + { + "epoch": 3.45, + "eval_loss": 1.766973853111267, + "eval_runtime": 14.6916, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 25 + }, + { + "epoch": 4.14, + "grad_norm": 0.2967859208583832, + "learning_rate": 5.714285714285714e-05, + "loss": 1.7117, + "step": 30 + }, + { + "epoch": 4.14, + "eval_loss": 1.7350422143936157, + "eval_runtime": 14.7126, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 30 + }, + { + "epoch": 4.83, + "grad_norm": 0.31578946113586426, + "learning_rate": 5e-05, + "loss": 1.6904, + "step": 35 + }, + { + "epoch": 4.83, + "eval_loss": 1.7066936492919922, + "eval_runtime": 14.7265, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 35 + }, + { + "epoch": 5.52, + "grad_norm": 0.34217244386672974, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.6459, + "step": 40 + }, + { + "epoch": 5.52, + "eval_loss": 1.6809816360473633, + "eval_runtime": 14.7004, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 40 + }, + { + "epoch": 6.21, + "grad_norm": 0.36268118023872375, + "learning_rate": 3.571428571428572e-05, + "loss": 1.5992, + "step": 45 + }, + { + "epoch": 6.21, + "eval_loss": 1.6565723419189453, + "eval_runtime": 14.6987, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 45 + }, + { + "epoch": 6.9, + "grad_norm": 0.3854450583457947, + "learning_rate": 2.857142857142857e-05, + "loss": 1.6088, + "step": 50 + }, + { + "epoch": 6.9, + "eval_loss": 1.6376428604125977, + "eval_runtime": 14.7228, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 50 + }, + { + "epoch": 7.59, + "grad_norm": 0.39184266328811646, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.5722, + "step": 55 + }, + { + "epoch": 7.59, + "eval_loss": 1.6203418970108032, + "eval_runtime": 14.7373, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 55 + }, + { + "epoch": 8.28, + "grad_norm": 0.41270822286605835, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.5593, + "step": 60 + }, + { + "epoch": 8.28, + "eval_loss": 1.6068811416625977, + "eval_runtime": 14.7065, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 60 + }, + { + "epoch": 8.97, + "grad_norm": 0.44460129737854004, + "learning_rate": 7.142857142857143e-06, + "loss": 1.5212, + "step": 65 + }, + { + "epoch": 8.97, + "eval_loss": 1.5982898473739624, + "eval_runtime": 14.6929, + "eval_samples_per_second": 0.408, + "eval_steps_per_second": 0.068, + "step": 65 + }, + { + "epoch": 9.66, + "grad_norm": 0.44520503282546997, + "learning_rate": 0.0, + "loss": 1.5438, + "step": 70 + }, + { + "epoch": 9.66, + "eval_loss": 1.5954241752624512, + "eval_runtime": 14.7317, + "eval_samples_per_second": 0.407, + "eval_steps_per_second": 0.068, + "step": 70 + } + ], + "logging_steps": 5, + "max_steps": 70, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10, + "total_flos": 4.7295648629784576e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-70/training_args.bin b/checkpoint-70/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1afa2f9ab75d020a21f04c7f7b6890b1c419092d --- /dev/null +++ b/checkpoint-70/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c74a2b62a8cab8f69632f5511d4781e140b9b59c7c1d8108fbd66233b3d511 +size 4920 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f35d643561164330d94c2157377e40cfdc8e2b6a --- /dev/null +++ b/config.json @@ -0,0 +1,42 @@ +{ + "_name_or_path": "meta-llama/Llama-2-70b-hf", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": false, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": false, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.38.1", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 b/logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 new file mode 100644 index 0000000000000000000000000000000000000000..f20efdc5246f6366e8a67a93f6b7c8cd36236461 --- /dev/null +++ b/logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8338f7ab492bb36430b90fde21a7822be3a5f94276aa93f72ad83aeaf872e40 +size 4868 diff --git a/logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 b/logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 new file mode 100644 index 0000000000000000000000000000000000000000..36e6c56852c10a3b0d616c47a89da6b88a49b027 --- /dev/null +++ b/logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4773b11c0e593bac1fe819e5c6324bb2a3542c475a7d375bb608e46408b2b009 +size 4602 diff --git a/logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 b/logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 new file mode 100644 index 0000000000000000000000000000000000000000..1ecec5e4477180cecbf7cd81d3a407cf2039ba34 --- /dev/null +++ b/logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edff87033b1a80ab70d0a0af49d7943790a698e659a0875f929d78a3299c699d +size 7962 diff --git a/logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 b/logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 new file mode 100644 index 0000000000000000000000000000000000000000..29c10bcc75a3a5598564b1460d27ba3d99c5af0d --- /dev/null +++ b/logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f054998bd37e1977597fdb66e495fb58c70a3b0042fa2ef4428756252ee49a4c +size 33937 diff --git a/logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 b/logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 new file mode 100644 index 0000000000000000000000000000000000000000..07cca291cd96977cb646be128075fa3d738dd6f1 --- /dev/null +++ b/logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:587a2145d6c92f9cbc7c114d9b2d452e13239b0a925f3bdcaa5631c48bdca19a +size 9908 diff --git a/logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 b/logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 new file mode 100644 index 0000000000000000000000000000000000000000..b45e70a37980a5f13f4f8a0bd1d8a3cd7f56bb8f --- /dev/null +++ b/logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fcffea7a0e86207d5c20daaabb71040aa71cb529a4bbddc4a309e37d9666f12 +size 14528 diff --git a/logs/events.out.tfevents.1700068261.node0277.palmetto.clemson.edu.1975944.1 b/logs/events.out.tfevents.1700068261.node0277.palmetto.clemson.edu.1975944.1 new file mode 100644 index 0000000000000000000000000000000000000000..6da2cd634e521bc446d472a50c149b7cfb926d34 --- /dev/null +++ b/logs/events.out.tfevents.1700068261.node0277.palmetto.clemson.edu.1975944.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c82b86ae9d57161db3f3c5a7f67ae342ff011beeb1392fcbe753863174f1ef4 +size 7736 diff --git a/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.0 b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.0 new file mode 100644 index 0000000000000000000000000000000000000000..e71b2157e1c64b684569c28506499b8cedf30397 --- /dev/null +++ b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4110c3d09ad1253563d12b1dd5c4b4b26abe39e0f9f0f2d19de9e70767848b +size 4448 diff --git a/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.1 b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.1 new file mode 100644 index 0000000000000000000000000000000000000000..f4964f505424ce68fc2b7c3a35a565f2c126f7f8 --- /dev/null +++ b/logs/events.out.tfevents.1700244128.node0287.palmetto.clemson.edu.1704721.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c6b6302c29076bc6e310b9a7d11350256f18814f9ca6a33ca444921708d1c9 +size 4448 diff --git a/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.2 b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.2 new file mode 100644 index 0000000000000000000000000000000000000000..b1dbe66a9e008fe5c07ec16717605275bbb4ad4e --- /dev/null +++ b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769514ef1a9ff351c4b1f2cc29d88b76962d2bb488c82b44156e84237635f6cb +size 7736 diff --git a/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.3 b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.3 new file mode 100644 index 0000000000000000000000000000000000000000..6ef5258af32a95d6eb0097d21dd8112d47c25bb3 --- /dev/null +++ b/logs/events.out.tfevents.1700244187.node0287.palmetto.clemson.edu.1704721.3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e202adc428e3dc6b92fd86a752b2dd3ea53cf8296312c51180754cde46a6ee61 +size 7736 diff --git a/peft_config.json b/peft_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7654288b0a750b4a9189b6544db0637826b51868 --- /dev/null +++ b/peft_config.json @@ -0,0 +1,23 @@ +{ + "default": { + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-2-70b-hf", + "revision": null, + "task_type": "CAUSAL_LM", + "inference_mode": false, + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "lora_alpha": 32, + "lora_dropout": 0.05, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null + } +} \ No newline at end of file diff --git a/tensorboard_logs/events.out.tfevents.1699989359.node0370.palmetto.clemson.edu.1118573.0 b/tensorboard_logs/events.out.tfevents.1699989359.node0370.palmetto.clemson.edu.1118573.0 new file mode 100644 index 0000000000000000000000000000000000000000..6133a316f5466fee2bee69ac5bcfb0e32b864e48 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989359.node0370.palmetto.clemson.edu.1118573.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508bf620a6efea31611edf8b79a5f90903b4a80a317efec567cf96ac92938d9e +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699989567.node0370.palmetto.clemson.edu.1119956.0 b/tensorboard_logs/events.out.tfevents.1699989567.node0370.palmetto.clemson.edu.1119956.0 new file mode 100644 index 0000000000000000000000000000000000000000..a054556c9194738e46b992490b272aec5ae6f260 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989567.node0370.palmetto.clemson.edu.1119956.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809827a3064f9854ffc2910389ec1573eb98079f63da5e52062fbd8671946ea1 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.1 b/tensorboard_logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.1 new file mode 100644 index 0000000000000000000000000000000000000000..c7f57ae801b45d6e67962ead1fddc6ccf0885cd0 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:246a1b62f0367762b2e5adb07db715dd9c6b3d14f7fd72dc33f40509a064ce3e +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.3 b/tensorboard_logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.3 new file mode 100644 index 0000000000000000000000000000000000000000..080533d6c025214ddb6819e1509e15d2edb8d581 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7c4a563a3df99358160cbd9b597dc094c701530d2b0a370c8a90c4b043666c +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699994286.node0370.palmetto.clemson.edu.1126403.0 b/tensorboard_logs/events.out.tfevents.1699994286.node0370.palmetto.clemson.edu.1126403.0 new file mode 100644 index 0000000000000000000000000000000000000000..2e56de8fca32ee8f0d4b08ad379ac7b00ac6f14c --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699994286.node0370.palmetto.clemson.edu.1126403.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c9bc67afc8bbef64c8b9e3bcf1cf3b6761a1dc88662562b96e92d478800129 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.0 b/tensorboard_logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.0 new file mode 100644 index 0000000000000000000000000000000000000000..38b16cf75a41e41ec4d8fac709a8e16a571ded63 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7c9118466c29677c83972c22ea9f891944714f48b4e4722c914361c639b7b6 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.0 b/tensorboard_logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.0 new file mode 100644 index 0000000000000000000000000000000000000000..5a92b5c9996a429300de3e499037f2e531ac7f2b --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a756f5eab83a8540e11fd8b983679204f19859f4f609f2079d04cc709bc8ad94 +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1700066156.node0277.palmetto.clemson.edu.1973537.0 b/tensorboard_logs/events.out.tfevents.1700066156.node0277.palmetto.clemson.edu.1973537.0 new file mode 100644 index 0000000000000000000000000000000000000000..4ef7d3dd1c970500a80530761e4a1690069ad2e4 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1700066156.node0277.palmetto.clemson.edu.1973537.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba86d0c48d706e23e958d9e5f5bf4625871675a89b4d1d50619fcc769d9e939a +size 88 diff --git a/tensorboard_logs/events.out.tfevents.1700068260.node0277.palmetto.clemson.edu.1975944.0 b/tensorboard_logs/events.out.tfevents.1700068260.node0277.palmetto.clemson.edu.1975944.0 new file mode 100644 index 0000000000000000000000000000000000000000..cfe7aeb2ca7461b90c23b7ea77accdd1dc096f73 --- /dev/null +++ b/tensorboard_logs/events.out.tfevents.1700068260.node0277.palmetto.clemson.edu.1975944.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5eee7e7bfdf9096c9ff890fd47ea9d2ed9ed48c9dec1229e82b19f2001064a5 +size 88