Training in progress, step 4500, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/adapter_config.json +4 -4
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +493 -3
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "q_proj",
     "v_proj",
-    "k_proj",
-    "up_proj",
     "down_proj",
-    "gate_proj",
-    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "gate_proj",
     "q_proj",
     "v_proj",
+    "o_proj",
     "down_proj",
+    "k_proj",
+    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:027c28cbacad0920c7a8ec1a4dbaf396f0658e37d9c57aa24903513cf568bf29
 size 161533160

 version https://git-lfs.github.com/spec/v1
+oid sha256:caf08a37d467af8be6ee4d7f8398900235c664d83757737f8601af66dd61bee5
 size 161533160

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a811f08d635f9fd429d0ac8672eee899607dd871ece10f326b8ec3e7266d9db2
 size 323292202

 version https://git-lfs.github.com/spec/v1
+oid sha256:3f21859111603253f67b1ca4afa8ce4858e0978e5685b991a3d9c2883b821035
 size 323292202

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:824d4a418ca52dbceab02ca3bdda11d00d54b246084fd87a75671a28233a0cb2
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:712366a72cf55e9140e7cb32d65c59ab0aec41cadb87ddf8db2ed2cbbb7181be
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.16556291390728478,
   "eval_steps": 100,
-  "global_step": 1000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -164,6 +164,496 @@
       "learning_rate": 5.513245033112583e-05,
       "loss": 0.7213,
       "step": 1000
     }
   ],
   "logging_steps": 50,
@@ -183,7 +673,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7833052747137024.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.7450331125827815,
   "eval_steps": 100,
+  "global_step": 4500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 5.513245033112583e-05,
       "loss": 0.7213,
       "step": 1000
+    },
+    {
+      "epoch": 0.173841059602649,
+      "grad_norm": 1.8289754390716553,
+      "learning_rate": 5.789183222958058e-05,
+      "loss": 0.7335,
+      "step": 1050
+    },
+    {
+      "epoch": 0.18211920529801323,
+      "grad_norm": 1.4989681243896484,
+      "learning_rate": 6.065121412803533e-05,
+      "loss": 0.7326,
+      "step": 1100
+    },
+    {
+      "epoch": 0.19039735099337748,
+      "grad_norm": 1.5326098203659058,
+      "learning_rate": 6.341059602649006e-05,
+      "loss": 0.7311,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1986754966887417,
+      "grad_norm": 1.4897147417068481,
+      "learning_rate": 6.616997792494481e-05,
+      "loss": 0.6918,
+      "step": 1200
+    },
+    {
+      "epoch": 0.20695364238410596,
+      "grad_norm": 1.634765863418579,
+      "learning_rate": 6.892935982339957e-05,
+      "loss": 0.7051,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2152317880794702,
+      "grad_norm": 1.4463587999343872,
+      "learning_rate": 7.168874172185431e-05,
+      "loss": 0.6955,
+      "step": 1300
+    },
+    {
+      "epoch": 0.22350993377483444,
+      "grad_norm": 1.632133960723877,
+      "learning_rate": 7.444812362030905e-05,
+      "loss": 0.6901,
+      "step": 1350
+    },
+    {
+      "epoch": 0.23178807947019867,
+      "grad_norm": 1.4062328338623047,
+      "learning_rate": 7.72075055187638e-05,
+      "loss": 0.6833,
+      "step": 1400
+    },
+    {
+      "epoch": 0.24006622516556292,
+      "grad_norm": 1.2914466857910156,
+      "learning_rate": 7.996688741721855e-05,
+      "loss": 0.6663,
+      "step": 1450
+    },
+    {
+      "epoch": 0.24834437086092714,
+      "grad_norm": 1.4995919466018677,
+      "learning_rate": 8.272626931567329e-05,
+      "loss": 0.6959,
+      "step": 1500
+    },
+    {
+      "epoch": 0.25662251655629137,
+      "grad_norm": 1.1299749612808228,
+      "learning_rate": 8.548565121412803e-05,
+      "loss": 0.6685,
+      "step": 1550
+    },
+    {
+      "epoch": 0.26490066225165565,
+      "grad_norm": 1.329004168510437,
+      "learning_rate": 8.824503311258279e-05,
+      "loss": 0.6678,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2731788079470199,
+      "grad_norm": 1.5191948413848877,
+      "learning_rate": 9.100441501103754e-05,
+      "loss": 0.6731,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2814569536423841,
+      "grad_norm": 1.739169716835022,
+      "learning_rate": 9.376379690949227e-05,
+      "loss": 0.6691,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2897350993377483,
+      "grad_norm": 1.2906118631362915,
+      "learning_rate": 9.652317880794703e-05,
+      "loss": 0.6718,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2980132450331126,
+      "grad_norm": 1.289502501487732,
+      "learning_rate": 9.928256070640178e-05,
+      "loss": 0.6581,
+      "step": 1800
+    },
+    {
+      "epoch": 0.30629139072847683,
+      "grad_norm": 1.3923128843307495,
+      "learning_rate": 9.999872989402833e-05,
+      "loss": 0.6589,
+      "step": 1850
+    },
+    {
+      "epoch": 0.31456953642384106,
+      "grad_norm": 1.1048816442489624,
+      "learning_rate": 9.999297790520483e-05,
+      "loss": 0.6341,
+      "step": 1900
+    },
+    {
+      "epoch": 0.3228476821192053,
+      "grad_norm": 1.3568603992462158,
+      "learning_rate": 9.998258777484084e-05,
+      "loss": 0.6318,
+      "step": 1950
+    },
+    {
+      "epoch": 0.33112582781456956,
+      "grad_norm": 0.923786997795105,
+      "learning_rate": 9.996756046688961e-05,
+      "loss": 0.6318,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3394039735099338,
+      "grad_norm": 1.102367877960205,
+      "learning_rate": 9.994789737552259e-05,
+      "loss": 0.6193,
+      "step": 2050
+    },
+    {
+      "epoch": 0.347682119205298,
+      "grad_norm": 1.0738896131515503,
+      "learning_rate": 9.992360032500001e-05,
+      "loss": 0.6184,
+      "step": 2100
+    },
+    {
+      "epoch": 0.35596026490066224,
+      "grad_norm": 1.279288649559021,
+      "learning_rate": 9.98946715695016e-05,
+      "loss": 0.626,
+      "step": 2150
+    },
+    {
+      "epoch": 0.36423841059602646,
+      "grad_norm": 1.2009036540985107,
+      "learning_rate": 9.986111379291759e-05,
+      "loss": 0.6305,
+      "step": 2200
+    },
+    {
+      "epoch": 0.37251655629139074,
+      "grad_norm": 0.8177038431167603,
+      "learning_rate": 9.982293010859955e-05,
+      "loss": 0.6266,
+      "step": 2250
+    },
+    {
+      "epoch": 0.38079470198675497,
+      "grad_norm": 1.2464983463287354,
+      "learning_rate": 9.978012405907165e-05,
+      "loss": 0.6148,
+      "step": 2300
+    },
+    {
+      "epoch": 0.3890728476821192,
+      "grad_norm": 1.2841860055923462,
+      "learning_rate": 9.973269961570195e-05,
+      "loss": 0.5946,
+      "step": 2350
+    },
+    {
+      "epoch": 0.3973509933774834,
+      "grad_norm": 1.2200431823730469,
+      "learning_rate": 9.968066117833401e-05,
+      "loss": 0.6166,
+      "step": 2400
+    },
+    {
+      "epoch": 0.4056291390728477,
+      "grad_norm": 1.128247857093811,
+      "learning_rate": 9.962401357487863e-05,
+      "loss": 0.5992,
+      "step": 2450
+    },
+    {
+      "epoch": 0.4139072847682119,
+      "grad_norm": 1.0683091878890991,
+      "learning_rate": 9.956276206086597e-05,
+      "loss": 0.6048,
+      "step": 2500
+    },
+    {
+      "epoch": 0.42218543046357615,
+      "grad_norm": 1.1819758415222168,
+      "learning_rate": 9.949691231895791e-05,
+      "loss": 0.5944,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4304635761589404,
+      "grad_norm": 1.0043411254882812,
+      "learning_rate": 9.942647045842095e-05,
+      "loss": 0.5962,
+      "step": 2600
+    },
+    {
+      "epoch": 0.43874172185430466,
+      "grad_norm": 1.0588668584823608,
+      "learning_rate": 9.93514430145593e-05,
+      "loss": 0.6067,
+      "step": 2650
+    },
+    {
+      "epoch": 0.4470198675496689,
+      "grad_norm": 0.9364084601402283,
+      "learning_rate": 9.927183694810862e-05,
+      "loss": 0.5928,
+      "step": 2700
+    },
+    {
+      "epoch": 0.4552980132450331,
+      "grad_norm": 1.155172348022461,
+      "learning_rate": 9.918765964459022e-05,
+      "loss": 0.5987,
+      "step": 2750
+    },
+    {
+      "epoch": 0.46357615894039733,
+      "grad_norm": 1.1639224290847778,
+      "learning_rate": 9.909891891362587e-05,
+      "loss": 0.5745,
+      "step": 2800
+    },
+    {
+      "epoch": 0.4718543046357616,
+      "grad_norm": 0.9658174514770508,
+      "learning_rate": 9.900562298821323e-05,
+      "loss": 0.5825,
+      "step": 2850
+    },
+    {
+      "epoch": 0.48013245033112584,
+      "grad_norm": 1.118033766746521,
+      "learning_rate": 9.890778052396205e-05,
+      "loss": 0.5806,
+      "step": 2900
+    },
+    {
+      "epoch": 0.48841059602649006,
+      "grad_norm": 0.9781912565231323,
+      "learning_rate": 9.880540059829115e-05,
+      "loss": 0.5712,
+      "step": 2950
+    },
+    {
+      "epoch": 0.4966887417218543,
+      "grad_norm": 1.2145684957504272,
+      "learning_rate": 9.869849270958622e-05,
+      "loss": 0.5855,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5049668874172185,
+      "grad_norm": 0.999279260635376,
+      "learning_rate": 9.858706677631862e-05,
+      "loss": 0.5843,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5132450331125827,
+      "grad_norm": 1.098258137702942,
+      "learning_rate": 9.847113313612517e-05,
+      "loss": 0.5605,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5215231788079471,
+      "grad_norm": 0.627949059009552,
+      "learning_rate": 9.835070254484912e-05,
+      "loss": 0.5538,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5298013245033113,
+      "grad_norm": 1.0991902351379395,
+      "learning_rate": 9.822578617554219e-05,
+      "loss": 0.5555,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5380794701986755,
+      "grad_norm": 0.9670843482017517,
+      "learning_rate": 9.8096395617428e-05,
+      "loss": 0.5647,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5463576158940397,
+      "grad_norm": 0.9838133454322815,
+      "learning_rate": 9.796254287482693e-05,
+      "loss": 0.5561,
+      "step": 3300
+    },
+    {
+      "epoch": 0.554635761589404,
+      "grad_norm": 1.1465744972229004,
+      "learning_rate": 9.782424036604234e-05,
+      "loss": 0.559,
+      "step": 3350
+    },
+    {
+      "epoch": 0.5629139072847682,
+      "grad_norm": 1.1423758268356323,
+      "learning_rate": 9.768150092220849e-05,
+      "loss": 0.5517,
+      "step": 3400
+    },
+    {
+      "epoch": 0.5711920529801324,
+      "grad_norm": 1.1365066766738892,
+      "learning_rate": 9.753433778610008e-05,
+      "loss": 0.5464,
+      "step": 3450
+    },
+    {
+      "epoch": 0.5794701986754967,
+      "grad_norm": 0.81045001745224,
+      "learning_rate": 9.738276461090371e-05,
+      "loss": 0.5493,
+      "step": 3500
+    },
+    {
+      "epoch": 0.5877483443708609,
+      "grad_norm": 1.0236687660217285,
+      "learning_rate": 9.72267954589511e-05,
+      "loss": 0.567,
+      "step": 3550
+    },
+    {
+      "epoch": 0.5960264900662252,
+      "grad_norm": 0.9495602250099182,
+      "learning_rate": 9.706644480041455e-05,
+      "loss": 0.5474,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6043046357615894,
+      "grad_norm": 0.960738480091095,
+      "learning_rate": 9.690172751196437e-05,
+      "loss": 0.5238,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6125827814569537,
+      "grad_norm": 1.0488675832748413,
+      "learning_rate": 9.67326588753887e-05,
+      "loss": 0.521,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6208609271523179,
+      "grad_norm": 0.8753538727760315,
+      "learning_rate": 9.65592545761758e-05,
+      "loss": 0.5232,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6291390728476821,
+      "grad_norm": 1.0551217794418335,
+      "learning_rate": 9.638153070205871e-05,
+      "loss": 0.5432,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6374172185430463,
+      "grad_norm": 1.158676028251648,
+      "learning_rate": 9.619950374152278e-05,
+      "loss": 0.5416,
+      "step": 3850
+    },
+    {
+      "epoch": 0.6456953642384106,
+      "grad_norm": 1.0036752223968506,
+      "learning_rate": 9.601319058227589e-05,
+      "loss": 0.5496,
+      "step": 3900
+    },
+    {
+      "epoch": 0.6539735099337748,
+      "grad_norm": 0.8905594348907471,
+      "learning_rate": 9.58226085096817e-05,
+      "loss": 0.5335,
+      "step": 3950
+    },
+    {
+      "epoch": 0.6622516556291391,
+      "grad_norm": 0.9868190884590149,
+      "learning_rate": 9.562777520515598e-05,
+      "loss": 0.5094,
+      "step": 4000
+    },
+    {
+      "epoch": 0.6705298013245033,
+      "grad_norm": 0.9672690629959106,
+      "learning_rate": 9.542870874452618e-05,
+      "loss": 0.5061,
+      "step": 4050
+    },
+    {
+      "epoch": 0.6788079470198676,
+      "grad_norm": 1.044123888015747,
+      "learning_rate": 9.52254275963545e-05,
+      "loss": 0.5253,
+      "step": 4100
+    },
+    {
+      "epoch": 0.6870860927152318,
+      "grad_norm": 1.0346958637237549,
+      "learning_rate": 9.501795062022434e-05,
+      "loss": 0.5149,
+      "step": 4150
+    },
+    {
+      "epoch": 0.695364238410596,
+      "grad_norm": 1.0799248218536377,
+      "learning_rate": 9.48062970649907e-05,
+      "loss": 0.5207,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7036423841059603,
+      "grad_norm": 0.9847925901412964,
+      "learning_rate": 9.459048656699427e-05,
+      "loss": 0.531,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7119205298013245,
+      "grad_norm": 1.134179949760437,
+      "learning_rate": 9.43705391482397e-05,
+      "loss": 0.5202,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7201986754966887,
+      "grad_norm": 0.9750307202339172,
+      "learning_rate": 9.414647521453798e-05,
+      "loss": 0.5183,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7284768211920529,
+      "grad_norm": 1.372010350227356,
+      "learning_rate": 9.391831555361341e-05,
+      "loss": 0.5203,
+      "step": 4400
+    },
+    {
+      "epoch": 0.7367549668874173,
+      "grad_norm": 0.9671643376350403,
+      "learning_rate": 9.36860813331748e-05,
+      "loss": 0.5313,
+      "step": 4450
+    },
+    {
+      "epoch": 0.7450331125827815,
+      "grad_norm": 1.270264983177185,
+      "learning_rate": 9.344979409895178e-05,
+      "loss": 0.5236,
+      "step": 4500
     }
   ],
   "logging_steps": 50,
       "attributes": {}
     }
   },
+  "total_flos": 3.5270617397723136e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5dd2ef96eff028fc6db83c8627ce2e789cafe652a25ea367c040819bc392f916
 size 5752

 version https://git-lfs.github.com/spec/v1
+oid sha256:2543e07a37d2c3de3cd8e1d682eb10ddfc7a8cf84209a331e0b0e44870af81c3
 size 5752