IeBoytsov
/

llama-3-1-sft-qlora-debug

@@ -19,6 +19,8 @@ should probably proofread and complete it, then remove this comment. -->
 # llama-3-1-sft-qlora-test
 This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the generator dataset.
 ## Model description
@@ -51,6 +53,9 @@ The following hyperparameters were used during training:
 ### Training results
 ### Framework versions

 # llama-3-1-sft-qlora-test
 This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.1645
 ## Model description
 ### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.2385        | 0.9934 | 75   | 1.1645          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 1.0,
-    "total_flos": 1.1183951188905165e+17,
-    "train_loss": 0.0,
-    "train_runtime": 0.0082,
-    "train_samples": 2078,
-    "train_samples_per_second": 147602.294,
-    "train_steps_per_second": 18480.884
 }

 {
+    "epoch": 0.9933774834437086,
+    "total_flos": 5.564154814608179e+16,
+    "train_loss": 1.2146572240193685,
+    "train_runtime": 3027.7749,
+    "train_samples": 1039,
+    "train_samples_per_second": 0.199,
+    "train_steps_per_second": 0.025
 }

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 1.0,
-    "total_flos": 1.1183951188905165e+17,
-    "train_loss": 0.0,
-    "train_runtime": 0.0082,
-    "train_samples": 2078,
-    "train_samples_per_second": 147602.294,
-    "train_steps_per_second": 18480.884
 }

 {
+    "epoch": 0.9933774834437086,
+    "total_flos": 5.564154814608179e+16,
+    "train_loss": 1.2146572240193685,
+    "train_runtime": 3027.7749,
+    "train_samples": 1039,
+    "train_samples_per_second": 0.199,
+    "train_steps_per_second": 0.025
 }

trainer_state.json CHANGED Viewed

@@ -1,242 +1,145 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 151,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.006622516556291391,
-      "grad_norm": 0.39227479696273804,
-      "learning_rate": 1.25e-05,
-      "loss": 1.5025,
       "step": 1
     },
-    {
-      "epoch": 0.033112582781456956,
-      "grad_norm": 0.6300222873687744,
-      "learning_rate": 6.25e-05,
-      "loss": 1.4454,
-      "step": 5
-    },
     {
       "epoch": 0.06622516556291391,
-      "grad_norm": 0.2265906035900116,
       "learning_rate": 0.000125,
-      "loss": 1.3232,
-      "step": 10
-    },
-    {
-      "epoch": 0.09933774834437085,
-      "grad_norm": 0.15402744710445404,
-      "learning_rate": 0.0001875,
-      "loss": 1.2574,
-      "step": 15
     },
     {
       "epoch": 0.13245033112582782,
-      "grad_norm": 0.2852768301963806,
-      "learning_rate": 0.00019956707906498044,
-      "loss": 1.2792,
-      "step": 20
-    },
-    {
-      "epoch": 0.16556291390728478,
-      "grad_norm": 0.15974751114845276,
-      "learning_rate": 0.00019781476007338058,
-      "loss": 1.2323,
-      "step": 25
     },
     {
       "epoch": 0.1986754966887417,
-      "grad_norm": 0.14968091249465942,
-      "learning_rate": 0.00019473966425143292,
-      "loss": 1.1704,
-      "step": 30
-    },
-    {
-      "epoch": 0.23178807947019867,
-      "grad_norm": 0.14632880687713623,
-      "learning_rate": 0.00019038337699485208,
-      "loss": 1.1832,
-      "step": 35
     },
     {
       "epoch": 0.26490066225165565,
-      "grad_norm": 0.14008674025535583,
-      "learning_rate": 0.0001848048096156426,
-      "loss": 1.1302,
-      "step": 40
-    },
-    {
-      "epoch": 0.2980132450331126,
-      "grad_norm": 0.13251474499702454,
-      "learning_rate": 0.00017807940266766593,
-      "loss": 1.1606,
-      "step": 45
     },
     {
       "epoch": 0.33112582781456956,
-      "grad_norm": 0.1554378867149353,
-      "learning_rate": 0.0001702981057425662,
-      "loss": 1.1845,
-      "step": 50
-    },
-    {
-      "epoch": 0.36423841059602646,
-      "grad_norm": 0.13782458007335663,
-      "learning_rate": 0.0001615661475325658,
-      "loss": 1.1304,
-      "step": 55
     },
     {
       "epoch": 0.3973509933774834,
-      "grad_norm": 0.1376216560602188,
-      "learning_rate": 0.00015200161279292155,
-      "loss": 1.1386,
-      "step": 60
-    },
-    {
-      "epoch": 0.4304635761589404,
-      "grad_norm": 0.13750730454921722,
-      "learning_rate": 0.0001417338454481818,
-      "loss": 1.1886,
-      "step": 65
     },
     {
       "epoch": 0.46357615894039733,
-      "grad_norm": 0.14349249005317688,
-      "learning_rate": 0.00013090169943749476,
-      "loss": 1.2558,
-      "step": 70
-    },
-    {
-      "epoch": 0.4966887417218543,
-      "grad_norm": 0.1315854787826538,
-      "learning_rate": 0.00011965166095328301,
-      "loss": 1.2015,
-      "step": 75
     },
     {
       "epoch": 0.5298013245033113,
-      "grad_norm": 0.13306115567684174,
-      "learning_rate": 0.00010813586746678583,
-      "loss": 1.1666,
-      "step": 80
-    },
-    {
-      "epoch": 0.5629139072847682,
-      "grad_norm": 0.1344432234764099,
-      "learning_rate": 9.651005032974994e-05,
-      "loss": 1.1596,
-      "step": 85
     },
     {
       "epoch": 0.5960264900662252,
-      "grad_norm": 0.14689402282238007,
-      "learning_rate": 8.49314287750517e-05,
-      "loss": 1.1657,
-      "step": 90
-    },
-    {
-      "epoch": 0.6291390728476821,
-      "grad_norm": 0.13607865571975708,
-      "learning_rate": 7.35565837962798e-05,
-      "loss": 1.1268,
-      "step": 95
     },
     {
       "epoch": 0.6622516556291391,
-      "grad_norm": 0.14480474591255188,
-      "learning_rate": 6.25393406584088e-05,
-      "loss": 1.1523,
-      "step": 100
-    },
-    {
-      "epoch": 0.695364238410596,
-      "grad_norm": 0.13964812457561493,
-      "learning_rate": 5.2028688674975415e-05,
-      "loss": 1.1185,
-      "step": 105
     },
     {
       "epoch": 0.7284768211920529,
-      "grad_norm": 0.13802023231983185,
-      "learning_rate": 4.216676638320135e-05,
-      "loss": 1.1517,
-      "step": 110
-    },
-    {
-      "epoch": 0.7615894039735099,
-      "grad_norm": 0.13757328689098358,
-      "learning_rate": 3.308693936411421e-05,
-      "loss": 1.1655,
-      "step": 115
     },
     {
       "epoch": 0.7947019867549668,
-      "grad_norm": 0.14344969391822815,
-      "learning_rate": 2.491199670185008e-05,
-      "loss": 1.1571,
-      "step": 120
-    },
-    {
-      "epoch": 0.8278145695364238,
-      "grad_norm": 0.15166440606117249,
-      "learning_rate": 1.775249047193377e-05,
-      "loss": 1.2056,
-      "step": 125
     },
     {
       "epoch": 0.8609271523178808,
-      "grad_norm": 0.13406524062156677,
-      "learning_rate": 1.1705240714107302e-05,
-      "loss": 1.1083,
-      "step": 130
-    },
-    {
-      "epoch": 0.8940397350993378,
-      "grad_norm": 0.1382841318845749,
-      "learning_rate": 6.852026107385756e-06,
-      "loss": 1.1221,
-      "step": 135
     },
     {
       "epoch": 0.9271523178807947,
-      "grad_norm": 0.17914775013923645,
-      "learning_rate": 3.2584780537136207e-06,
-      "loss": 1.2247,
-      "step": 140
     },
     {
-      "epoch": 0.9602649006622517,
-      "grad_norm": 0.14941184222698212,
-      "learning_rate": 9.731931258429638e-07,
-      "loss": 1.1813,
-      "step": 145
     },
     {
       "epoch": 0.9933774834437086,
-      "grad_norm": 0.13595885038375854,
-      "learning_rate": 2.7075882053828605e-08,
-      "loss": 1.1418,
-      "step": 150
     },
     {
-      "epoch": 1.0,
-      "step": 151,
-      "total_flos": 1.1183951188905165e+17,
-      "train_loss": 0.0,
-      "train_runtime": 0.0082,
-      "train_samples_per_second": 147602.294,
-      "train_steps_per_second": 18480.884
     }
   ],
   "logging_steps": 5,
-  "max_steps": 151,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
@@ -252,7 +155,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1183951188905165e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.9933774834437086,
   "eval_steps": 500,
+  "global_step": 75,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.013245033112582781,
+      "grad_norm": 0.4943664073944092,
+      "learning_rate": 2.5e-05,
+      "loss": 1.4272,
       "step": 1
     },
     {
       "epoch": 0.06622516556291391,
+      "grad_norm": 0.20984387397766113,
       "learning_rate": 0.000125,
+      "loss": 1.3101,
+      "step": 5
     },
     {
       "epoch": 0.13245033112582782,
+      "grad_norm": 0.2290477156639099,
+      "learning_rate": 0.00019956059820218982,
+      "loss": 1.2917,
+      "step": 10
     },
     {
       "epoch": 0.1986754966887417,
+      "grad_norm": 0.15163910388946533,
+      "learning_rate": 0.00019466156752904343,
+      "loss": 1.2823,
+      "step": 15
     },
     {
       "epoch": 0.26490066225165565,
+      "grad_norm": 0.1627238243818283,
+      "learning_rate": 0.00018458320592590975,
+      "loss": 1.1889,
+      "step": 20
     },
     {
       "epoch": 0.33112582781456956,
+      "grad_norm": 0.15383219718933105,
+      "learning_rate": 0.00016987694277788417,
+      "loss": 1.198,
+      "step": 25
     },
     {
       "epoch": 0.3973509933774834,
+      "grad_norm": 0.1501755714416504,
+      "learning_rate": 0.0001513474193514842,
+      "loss": 1.1762,
+      "step": 30
     },
     {
       "epoch": 0.46357615894039733,
+      "grad_norm": 0.14539840817451477,
+      "learning_rate": 0.0001300084635000341,
+      "loss": 1.2176,
+      "step": 35
     },
     {
       "epoch": 0.5298013245033113,
+      "grad_norm": 0.12844280898571014,
+      "learning_rate": 0.0001070276188945293,
+      "loss": 1.1942,
+      "step": 40
     },
     {
       "epoch": 0.5960264900662252,
+      "grad_norm": 0.13806107640266418,
+      "learning_rate": 8.366226381814697e-05,
+      "loss": 1.2928,
+      "step": 45
     },
     {
       "epoch": 0.6622516556291391,
+      "grad_norm": 0.13188520073890686,
+      "learning_rate": 6.119081473277501e-05,
+      "loss": 1.1959,
+      "step": 50
     },
     {
       "epoch": 0.7284768211920529,
+      "grad_norm": 0.12824179232120514,
+      "learning_rate": 4.084277875864776e-05,
+      "loss": 1.1188,
+      "step": 55
     },
     {
       "epoch": 0.7947019867549668,
+      "grad_norm": 0.14250224828720093,
+      "learning_rate": 2.3731482188961818e-05,
+      "loss": 1.2076,
+      "step": 60
     },
     {
       "epoch": 0.8609271523178808,
+      "grad_norm": 0.14001749455928802,
+      "learning_rate": 1.0793155744261351e-05,
+      "loss": 1.1352,
+      "step": 65
     },
     {
       "epoch": 0.9271523178807947,
+      "grad_norm": 0.15154731273651123,
+      "learning_rate": 2.735709467518699e-06,
+      "loss": 1.1486,
+      "step": 70
     },
     {
+      "epoch": 0.9933774834437086,
+      "grad_norm": 0.14987458288669586,
+      "learning_rate": 0.0,
+      "loss": 1.2385,
+      "step": 75
     },
     {
       "epoch": 0.9933774834437086,
+      "eval_loss": 1.1645171642303467,
+      "eval_runtime": 2610.6612,
+      "eval_samples_per_second": 5.155,
+      "eval_steps_per_second": 0.645,
+      "step": 75
     },
     {
+      "epoch": 0.9933774834437086,
+      "step": 75,
+      "total_flos": 5.564154814608179e+16,
+      "train_loss": 1.2146572240193685,
+      "train_runtime": 3027.7749,
+      "train_samples_per_second": 0.199,
+      "train_steps_per_second": 0.025
     }
   ],
   "logging_steps": 5,
+  "max_steps": 75,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 5.564154814608179e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null