Training in progress, step 150, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +2 -2
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +38 -377
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -22,8 +22,8 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "k_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:348daef4d62db9a5fa3b649cca6826fc23ec3c49e1fb43eaa81d96b56a38718c
 size 527048968

 version https://git-lfs.github.com/spec/v1
+oid sha256:d521622db8e0c7fae9ad561a127c2650738869afc43c26be15d38a48ed692348
 size 527048968

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:494a5ea21271cf0928971accb090d8e84141054b34ceaff64cbe4817f22fea4b
 size 1054136250

 version https://git-lfs.github.com/spec/v1
+oid sha256:654d5f69f2b7f359b70dc71836116986b4ff50d816aa31fd86020592c2ad75c8
 size 1054136250

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65f6c1dd354391c569e4be3e0b1b637345be25b99bb32967d995788c70f82738
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:226f394c3a9826cc7f74d0799aa02f643f1ee6b891784f44c588787dbc9c0cb3
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3ca6ba6bc5430af54b4982610d295ab940fcfca9eb66d0098ad0a404420eb1d4
 size 1256

 version https://git-lfs.github.com/spec/v1
+oid sha256:2673d78ac7304a2a7678ae71ed65422fa2295f07aca63cf23ca76e0b5c92da69
 size 1256

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,464 +1,125 @@
 {
-  "best_metric": 0.7217289209365845,
-  "best_model_checkpoint": "./output/checkpoint-600",
-  "epoch": 13.333333333333334,
   "eval_steps": 150,
-  "global_step": 600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.2222222222222222,
-      "grad_norm": 1.9086298942565918,
       "learning_rate": 2.9999999999999984e-06,
       "loss": 0.6619,
       "step": 10
     },
     {
       "epoch": 0.4444444444444444,
-      "grad_norm": 1.888395071029663,
       "learning_rate": 5.999999999999997e-06,
-      "loss": 0.7259,
       "step": 20
     },
     {
       "epoch": 0.6666666666666666,
-      "grad_norm": 1.6719470024108887,
       "learning_rate": 8.999999999999993e-06,
-      "loss": 0.7335,
       "step": 30
     },
     {
       "epoch": 0.8888888888888888,
-      "grad_norm": 1.5774726867675781,
       "learning_rate": 1.1999999999999994e-05,
-      "loss": 0.7373,
       "step": 40
     },
     {
       "epoch": 1.1111111111111112,
-      "grad_norm": 1.249552607536316,
       "learning_rate": 1.499999999999999e-05,
-      "loss": 0.8944,
       "step": 50
     },
     {
       "epoch": 1.3333333333333333,
-      "grad_norm": 1.1349461078643799,
       "learning_rate": 1.7999999999999987e-05,
-      "loss": 0.7144,
       "step": 60
     },
     {
       "epoch": 1.5555555555555556,
-      "grad_norm": 1.5111842155456543,
       "learning_rate": 2.0999999999999985e-05,
-      "loss": 0.7577,
       "step": 70
     },
     {
       "epoch": 1.7777777777777777,
-      "grad_norm": 1.873070478439331,
       "learning_rate": 2.3999999999999987e-05,
-      "loss": 0.7073,
       "step": 80
     },
     {
       "epoch": 2.0,
-      "grad_norm": 2.0650975704193115,
       "learning_rate": 2.6999999999999982e-05,
-      "loss": 0.7702,
       "step": 90
     },
     {
       "epoch": 2.2222222222222223,
-      "grad_norm": 1.5584607124328613,
       "learning_rate": 2.999999999999998e-05,
-      "loss": 0.7269,
       "step": 100
     },
     {
       "epoch": 2.4444444444444446,
-      "grad_norm": 1.7033145427703857,
       "learning_rate": 2.999999702723961e-05,
-      "loss": 0.7475,
       "step": 110
     },
     {
       "epoch": 2.6666666666666665,
-      "grad_norm": 2.7943344116210938,
       "learning_rate": 2.9999988108959667e-05,
-      "loss": 0.57,
       "step": 120
     },
     {
       "epoch": 2.888888888888889,
-      "grad_norm": 2.112865686416626,
       "learning_rate": 2.9999973245163695e-05,
-      "loss": 0.704,
       "step": 130
     },
     {
       "epoch": 3.111111111111111,
-      "grad_norm": 2.2598509788513184,
       "learning_rate": 2.999995243585758e-05,
-      "loss": 0.717,
       "step": 140
     },
     {
       "epoch": 3.3333333333333335,
-      "grad_norm": 1.9296040534973145,
       "learning_rate": 2.9999925681049573e-05,
-      "loss": 0.6639,
       "step": 150
     },
     {
       "epoch": 3.3333333333333335,
-      "eval_loss": 0.7678037881851196,
-      "eval_runtime": 0.4654,
-      "eval_samples_per_second": 21.488,
-      "eval_steps_per_second": 21.488,
       "step": 150
-    },
-    {
-      "epoch": 3.5555555555555554,
-      "grad_norm": 2.576017141342163,
-      "learning_rate": 2.9999892980750276e-05,
-      "loss": 0.6945,
-      "step": 160
-    },
-    {
-      "epoch": 3.7777777777777777,
-      "grad_norm": 2.4086973667144775,
-      "learning_rate": 2.9999854334972655e-05,
-      "loss": 0.6925,
-      "step": 170
-    },
-    {
-      "epoch": 4.0,
-      "grad_norm": 2.5403313636779785,
-      "learning_rate": 2.999980974373202e-05,
-      "loss": 0.6681,
-      "step": 180
-    },
-    {
-      "epoch": 4.222222222222222,
-      "grad_norm": 1.6049163341522217,
-      "learning_rate": 2.9999759207046055e-05,
-      "loss": 0.5901,
-      "step": 190
-    },
-    {
-      "epoch": 4.444444444444445,
-      "grad_norm": 1.6813507080078125,
-      "learning_rate": 2.9999702724934783e-05,
-      "loss": 0.7106,
-      "step": 200
-    },
-    {
-      "epoch": 4.666666666666667,
-      "grad_norm": 3.4500820636749268,
-      "learning_rate": 2.99996402974206e-05,
-      "loss": 0.6277,
-      "step": 210
-    },
-    {
-      "epoch": 4.888888888888889,
-      "grad_norm": 2.07940411567688,
-      "learning_rate": 2.9999571924528243e-05,
-      "loss": 0.6731,
-      "step": 220
-    },
-    {
-      "epoch": 5.111111111111111,
-      "grad_norm": 2.62994647026062,
-      "learning_rate": 2.9999497606284816e-05,
-      "loss": 0.6025,
-      "step": 230
-    },
-    {
-      "epoch": 5.333333333333333,
-      "grad_norm": 1.4846452474594116,
-      "learning_rate": 2.9999417342719775e-05,
-      "loss": 0.6941,
-      "step": 240
-    },
-    {
-      "epoch": 5.555555555555555,
-      "grad_norm": 2.218034267425537,
-      "learning_rate": 2.9999331133864935e-05,
-      "loss": 0.6477,
-      "step": 250
-    },
-    {
-      "epoch": 5.777777777777778,
-      "grad_norm": 1.7151379585266113,
-      "learning_rate": 2.9999238979754465e-05,
-      "loss": 0.6094,
-      "step": 260
-    },
-    {
-      "epoch": 6.0,
-      "grad_norm": 1.9011706113815308,
-      "learning_rate": 2.99991408804249e-05,
-      "loss": 0.5759,
-      "step": 270
-    },
-    {
-      "epoch": 6.222222222222222,
-      "grad_norm": 2.1471989154815674,
-      "learning_rate": 2.999903683591511e-05,
-      "loss": 0.574,
-      "step": 280
-    },
-    {
-      "epoch": 6.444444444444445,
-      "grad_norm": 1.3847769498825073,
-      "learning_rate": 2.9998926846266345e-05,
-      "loss": 0.613,
-      "step": 290
-    },
-    {
-      "epoch": 6.666666666666667,
-      "grad_norm": 2.2905008792877197,
-      "learning_rate": 2.9998810911522193e-05,
-      "loss": 0.6224,
-      "step": 300
-    },
-    {
-      "epoch": 6.666666666666667,
-      "eval_loss": 0.7388900518417358,
-      "eval_runtime": 0.4459,
-      "eval_samples_per_second": 22.426,
-      "eval_steps_per_second": 22.426,
-      "step": 300
-    },
-    {
-      "epoch": 6.888888888888889,
-      "grad_norm": 2.6983234882354736,
-      "learning_rate": 2.9998689031728615e-05,
-      "loss": 0.6484,
-      "step": 310
-    },
-    {
-      "epoch": 7.111111111111111,
-      "grad_norm": 1.7625339031219482,
-      "learning_rate": 2.9998561206933918e-05,
-      "loss": 0.5866,
-      "step": 320
-    },
-    {
-      "epoch": 7.333333333333333,
-      "grad_norm": 1.7633429765701294,
-      "learning_rate": 2.9998427437188766e-05,
-      "loss": 0.5797,
-      "step": 330
-    },
-    {
-      "epoch": 7.555555555555555,
-      "grad_norm": 2.347116470336914,
-      "learning_rate": 2.999828772254618e-05,
-      "loss": 0.603,
-      "step": 340
-    },
-    {
-      "epoch": 7.777777777777778,
-      "grad_norm": 2.4734201431274414,
-      "learning_rate": 2.9998142063061544e-05,
-      "loss": 0.6625,
-      "step": 350
-    },
-    {
-      "epoch": 8.0,
-      "grad_norm": 1.330693006515503,
-      "learning_rate": 2.9997990458792583e-05,
-      "loss": 0.6044,
-      "step": 360
-    },
-    {
-      "epoch": 8.222222222222221,
-      "grad_norm": 1.9030860662460327,
-      "learning_rate": 2.9997832909799397e-05,
-      "loss": 0.549,
-      "step": 370
-    },
-    {
-      "epoch": 8.444444444444445,
-      "grad_norm": 1.935556173324585,
-      "learning_rate": 2.9997669416144432e-05,
-      "loss": 0.641,
-      "step": 380
-    },
-    {
-      "epoch": 8.666666666666666,
-      "grad_norm": 1.049513816833496,
-      "learning_rate": 2.999749997789249e-05,
-      "loss": 0.5395,
-      "step": 390
-    },
-    {
-      "epoch": 8.88888888888889,
-      "grad_norm": 1.5243322849273682,
-      "learning_rate": 2.9997324595110723e-05,
-      "loss": 0.6544,
-      "step": 400
-    },
-    {
-      "epoch": 9.11111111111111,
-      "grad_norm": 1.3851348161697388,
-      "learning_rate": 2.9997143267868663e-05,
-      "loss": 0.5948,
-      "step": 410
-    },
-    {
-      "epoch": 9.333333333333334,
-      "grad_norm": 2.487696409225464,
-      "learning_rate": 2.999695599623817e-05,
-      "loss": 0.6226,
-      "step": 420
-    },
-    {
-      "epoch": 9.555555555555555,
-      "grad_norm": 2.581589698791504,
-      "learning_rate": 2.9996762780293483e-05,
-      "loss": 0.5751,
-      "step": 430
-    },
-    {
-      "epoch": 9.777777777777779,
-      "grad_norm": 1.5271048545837402,
-      "learning_rate": 2.9996563620111176e-05,
-      "loss": 0.5295,
-      "step": 440
-    },
-    {
-      "epoch": 10.0,
-      "grad_norm": 1.3882054090499878,
-      "learning_rate": 2.9996358515770198e-05,
-      "loss": 0.5419,
-      "step": 450
-    },
-    {
-      "epoch": 10.0,
-      "eval_loss": 0.7256744503974915,
-      "eval_runtime": 0.4708,
-      "eval_samples_per_second": 21.239,
-      "eval_steps_per_second": 21.239,
-      "step": 450
-    },
-    {
-      "epoch": 10.222222222222221,
-      "grad_norm": 2.2227277755737305,
-      "learning_rate": 2.9996147467351836e-05,
-      "loss": 0.5062,
-      "step": 460
-    },
-    {
-      "epoch": 10.444444444444445,
-      "grad_norm": 1.412768006324768,
-      "learning_rate": 2.9995930474939753e-05,
-      "loss": 0.4908,
-      "step": 470
-    },
-    {
-      "epoch": 10.666666666666666,
-      "grad_norm": 1.9368879795074463,
-      "learning_rate": 2.9995707538619954e-05,
-      "loss": 0.6362,
-      "step": 480
-    },
-    {
-      "epoch": 10.88888888888889,
-      "grad_norm": 2.137639045715332,
-      "learning_rate": 2.9995478658480802e-05,
-      "loss": 0.5532,
-      "step": 490
-    },
-    {
-      "epoch": 11.11111111111111,
-      "grad_norm": 1.867410659790039,
-      "learning_rate": 2.9995243834613023e-05,
-      "loss": 0.5231,
-      "step": 500
-    },
-    {
-      "epoch": 11.333333333333334,
-      "grad_norm": 1.6794224977493286,
-      "learning_rate": 2.9995003067109687e-05,
-      "loss": 0.5388,
-      "step": 510
-    },
-    {
-      "epoch": 11.555555555555555,
-      "grad_norm": 2.639946699142456,
-      "learning_rate": 2.9994756356066226e-05,
-      "loss": 0.5847,
-      "step": 520
-    },
-    {
-      "epoch": 11.777777777777779,
-      "grad_norm": 2.2483253479003906,
-      "learning_rate": 2.999450370158044e-05,
-      "loss": 0.5348,
-      "step": 530
-    },
-    {
-      "epoch": 12.0,
-      "grad_norm": 1.535469651222229,
-      "learning_rate": 2.9994245103752457e-05,
-      "loss": 0.5238,
-      "step": 540
-    },
-    {
-      "epoch": 12.222222222222221,
-      "grad_norm": 1.2466766834259033,
-      "learning_rate": 2.999398056268479e-05,
-      "loss": 0.536,
-      "step": 550
-    },
-    {
-      "epoch": 12.444444444444445,
-      "grad_norm": 1.4720205068588257,
-      "learning_rate": 2.9993710078482286e-05,
-      "loss": 0.4151,
-      "step": 560
-    },
-    {
-      "epoch": 12.666666666666666,
-      "grad_norm": 3.3889882564544678,
-      "learning_rate": 2.9993433651252164e-05,
-      "loss": 0.6201,
-      "step": 570
-    },
-    {
-      "epoch": 12.88888888888889,
-      "grad_norm": 1.472764253616333,
-      "learning_rate": 2.9993151281103986e-05,
-      "loss": 0.5345,
-      "step": 580
-    },
-    {
-      "epoch": 13.11111111111111,
-      "grad_norm": 2.440230369567871,
-      "learning_rate": 2.9992862968149675e-05,
-      "loss": 0.4178,
-      "step": 590
-    },
-    {
-      "epoch": 13.333333333333334,
-      "grad_norm": 2.4395759105682373,
-      "learning_rate": 2.9992568712503513e-05,
-      "loss": 0.5315,
-      "step": 600
-    },
-    {
-      "epoch": 13.333333333333334,
-      "eval_loss": 0.7217289209365845,
-      "eval_runtime": 0.4076,
-      "eval_samples_per_second": 24.535,
-      "eval_steps_per_second": 24.535,
-      "step": 600
     }
   ],
   "logging_steps": 10,
@@ -478,7 +139,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 6372295013597184.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.7681264281272888,
+  "best_model_checkpoint": "./output/checkpoint-150",
+  "epoch": 3.3333333333333335,
   "eval_steps": 150,
+  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.2222222222222222,
+      "grad_norm": 1.9085510969161987,
       "learning_rate": 2.9999999999999984e-06,
       "loss": 0.6619,
       "step": 10
     },
     {
       "epoch": 0.4444444444444444,
+      "grad_norm": 1.8888217210769653,
       "learning_rate": 5.999999999999997e-06,
+      "loss": 0.7258,
       "step": 20
     },
     {
       "epoch": 0.6666666666666666,
+      "grad_norm": 1.6715161800384521,
       "learning_rate": 8.999999999999993e-06,
+      "loss": 0.7336,
       "step": 30
     },
     {
       "epoch": 0.8888888888888888,
+      "grad_norm": 1.576881766319275,
       "learning_rate": 1.1999999999999994e-05,
+      "loss": 0.737,
       "step": 40
     },
     {
       "epoch": 1.1111111111111112,
+      "grad_norm": 1.2493071556091309,
       "learning_rate": 1.499999999999999e-05,
+      "loss": 0.8943,
       "step": 50
     },
     {
       "epoch": 1.3333333333333333,
+      "grad_norm": 1.1351404190063477,
       "learning_rate": 1.7999999999999987e-05,
+      "loss": 0.7145,
       "step": 60
     },
     {
       "epoch": 1.5555555555555556,
+      "grad_norm": 1.5109528303146362,
       "learning_rate": 2.0999999999999985e-05,
+      "loss": 0.7582,
       "step": 70
     },
     {
       "epoch": 1.7777777777777777,
+      "grad_norm": 1.872719645500183,
       "learning_rate": 2.3999999999999987e-05,
+      "loss": 0.7074,
       "step": 80
     },
     {
       "epoch": 2.0,
+      "grad_norm": 2.0645689964294434,
       "learning_rate": 2.6999999999999982e-05,
+      "loss": 0.7704,
       "step": 90
     },
     {
       "epoch": 2.2222222222222223,
+      "grad_norm": 1.5649832487106323,
       "learning_rate": 2.999999999999998e-05,
+      "loss": 0.727,
       "step": 100
     },
     {
       "epoch": 2.4444444444444446,
+      "grad_norm": 1.7015666961669922,
       "learning_rate": 2.999999702723961e-05,
+      "loss": 0.747,
       "step": 110
     },
     {
       "epoch": 2.6666666666666665,
+      "grad_norm": 2.7937145233154297,
       "learning_rate": 2.9999988108959667e-05,
+      "loss": 0.5702,
       "step": 120
     },
     {
       "epoch": 2.888888888888889,
+      "grad_norm": 2.12908673286438,
       "learning_rate": 2.9999973245163695e-05,
+      "loss": 0.7045,
       "step": 130
     },
     {
       "epoch": 3.111111111111111,
+      "grad_norm": 2.259050130844116,
       "learning_rate": 2.999995243585758e-05,
+      "loss": 0.7171,
       "step": 140
     },
     {
       "epoch": 3.3333333333333335,
+      "grad_norm": 1.9302667379379272,
       "learning_rate": 2.9999925681049573e-05,
+      "loss": 0.6647,
       "step": 150
     },
     {
       "epoch": 3.3333333333333335,
+      "eval_loss": 0.7681264281272888,
+      "eval_runtime": 0.4717,
+      "eval_samples_per_second": 21.2,
+      "eval_steps_per_second": 21.2,
       "step": 150
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 1615667282657280.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:44d2192bde2a23766c50facad1c20f2470e5e208bb2f21a9c48d77c7aea22798
 size 5496

 version https://git-lfs.github.com/spec/v1
+oid sha256:fc378f68851406ff2cbb4bd474c2caf3a38d9a74ccb912ae832b6d6e36628285
 size 5496