Training in progress, step 4650, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_config.json +5 -5
last-checkpoint/adapter_model.safetensors +2 -2
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +223 -3

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -29,13 +29,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "down_proj",
-    "gate_proj",
-    "up_proj",
-    "o_proj",
     "q_proj",
     "k_proj",
-    "v_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "v_proj",
     "q_proj",
     "k_proj",
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:048c5dfc16ee1754da9fd336f18683a3fa4e3b619f7f3d1d05b7716113348974
-size 20814808

 version https://git-lfs.github.com/spec/v1
+oid sha256:af30f33a8af5e4a013efd26ee53bc624e3f1edea07e127d58d10b844ecce2026
+size 41581360

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1258df7bbd652cd4ac8845bc1dede253fef70e3930ecda4676e66059e46a5b6
-size 21506325

 version https://git-lfs.github.com/spec/v1
+oid sha256:7c1729e104b948026b118ff21370b1d2f21bc93d0781e691807f6578d395035b
+size 22453035

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9ff4f019e07e22292d32e03e5912231177e1a365bd18d638eade1eecc917db10
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:d40984bf5f703b17e7e396c9ca4247ffe72588f4caff5b69f55c23c86e97ea6c
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.88134135855546,
   "eval_steps": 300,
-  "global_step": 4100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1648,6 +1648,226 @@
       "mean_token_accuracy": 0.9456648254394531,
       "num_tokens": 5058961.0,
       "step": 4100
     }
   ],
   "logging_steps": 25,
@@ -1667,7 +1887,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.076085424686029e+16,
   "train_batch_size": 3,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.9995700773860705,
   "eval_steps": 300,
+  "global_step": 4650,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "mean_token_accuracy": 0.9456648254394531,
       "num_tokens": 5058961.0,
       "step": 4100
+    },
+    {
+      "entropy": 0.18220152616500854,
+      "epoch": 0.8867153912295787,
+      "grad_norm": 0.31099215149879456,
+      "learning_rate": 0.0002,
+      "loss": 0.1831790542602539,
+      "mean_token_accuracy": 0.9388860750198365,
+      "num_tokens": 31646.0,
+      "step": 4125
+    },
+    {
+      "entropy": 0.17971669435501098,
+      "epoch": 0.8920894239036974,
+      "grad_norm": 0.2860122323036194,
+      "learning_rate": 0.0002,
+      "loss": 0.17871889114379882,
+      "mean_token_accuracy": 0.9418160128593445,
+      "num_tokens": 63466.0,
+      "step": 4150
+    },
+    {
+      "entropy": 0.17056418150663377,
+      "epoch": 0.897463456577816,
+      "grad_norm": 0.2612093389034271,
+      "learning_rate": 0.0002,
+      "loss": 0.17395471572875976,
+      "mean_token_accuracy": 0.9422563743591309,
+      "num_tokens": 94269.0,
+      "step": 4175
+    },
+    {
+      "entropy": 0.17489842355251312,
+      "epoch": 0.9028374892519346,
+      "grad_norm": 0.36198702454566956,
+      "learning_rate": 0.0002,
+      "loss": 0.17566781997680664,
+      "mean_token_accuracy": 0.9409847593307495,
+      "num_tokens": 125794.0,
+      "step": 4200
+    },
+    {
+      "entropy": 0.1683477830886841,
+      "epoch": 0.9082115219260533,
+      "grad_norm": 0.2940385341644287,
+      "learning_rate": 0.0002,
+      "loss": 0.1692376708984375,
+      "mean_token_accuracy": 0.9459913969039917,
+      "num_tokens": 156271.0,
+      "step": 4225
+    },
+    {
+      "entropy": 0.14542917400598526,
+      "epoch": 0.913585554600172,
+      "grad_norm": 0.45115435123443604,
+      "learning_rate": 0.0002,
+      "loss": 0.14854653358459471,
+      "mean_token_accuracy": 0.950973105430603,
+      "num_tokens": 185224.0,
+      "step": 4250
+    },
+    {
+      "entropy": 0.17559541881084442,
+      "epoch": 0.9189595872742906,
+      "grad_norm": 0.18303845822811127,
+      "learning_rate": 0.0002,
+      "loss": 0.17568387985229492,
+      "mean_token_accuracy": 0.9408818078041077,
+      "num_tokens": 216346.0,
+      "step": 4275
+    },
+    {
+      "entropy": 0.1603526195883751,
+      "epoch": 0.9243336199484092,
+      "grad_norm": 0.2949071526527405,
+      "learning_rate": 0.0002,
+      "loss": 0.1592039203643799,
+      "mean_token_accuracy": 0.9486523294448852,
+      "num_tokens": 246847.0,
+      "step": 4300
+    },
+    {
+      "entropy": 0.162405326962471,
+      "epoch": 0.929707652622528,
+      "grad_norm": 0.3486879765987396,
+      "learning_rate": 0.0002,
+      "loss": 0.1632448196411133,
+      "mean_token_accuracy": 0.9453012681007386,
+      "num_tokens": 277246.0,
+      "step": 4325
+    },
+    {
+      "entropy": 0.16633329182863235,
+      "epoch": 0.9350816852966466,
+      "grad_norm": 0.3270273208618164,
+      "learning_rate": 0.0002,
+      "loss": 0.16598182678222656,
+      "mean_token_accuracy": 0.943821303844452,
+      "num_tokens": 307874.0,
+      "step": 4350
+    },
+    {
+      "entropy": 0.16052240520715713,
+      "epoch": 0.9404557179707652,
+      "grad_norm": 0.31142178177833557,
+      "learning_rate": 0.0002,
+      "loss": 0.1634804344177246,
+      "mean_token_accuracy": 0.9480662798881531,
+      "num_tokens": 338240.0,
+      "step": 4375
+    },
+    {
+      "entropy": 0.16865724414587022,
+      "epoch": 0.945829750644884,
+      "grad_norm": 0.2577108144760132,
+      "learning_rate": 0.0002,
+      "loss": 0.16492490768432616,
+      "mean_token_accuracy": 0.9463495826721191,
+      "num_tokens": 368740.0,
+      "step": 4400
+    },
+    {
+      "entropy": 0.1669575396180153,
+      "epoch": 0.9512037833190026,
+      "grad_norm": 0.26715075969696045,
+      "learning_rate": 0.0002,
+      "loss": 0.16754981994628906,
+      "mean_token_accuracy": 0.9427931928634643,
+      "num_tokens": 400022.0,
+      "step": 4425
+    },
+    {
+      "entropy": 0.18261059492826462,
+      "epoch": 0.9565778159931212,
+      "grad_norm": 0.28751739859580994,
+      "learning_rate": 0.0002,
+      "loss": 0.17873405456542968,
+      "mean_token_accuracy": 0.9412663197517395,
+      "num_tokens": 431956.0,
+      "step": 4450
+    },
+    {
+      "entropy": 0.15669210344552995,
+      "epoch": 0.9619518486672399,
+      "grad_norm": 0.300042986869812,
+      "learning_rate": 0.0002,
+      "loss": 0.1616361427307129,
+      "mean_token_accuracy": 0.946834671497345,
+      "num_tokens": 462567.0,
+      "step": 4475
+    },
+    {
+      "entropy": 0.16525104999542237,
+      "epoch": 0.9673258813413586,
+      "grad_norm": 0.18482960760593414,
+      "learning_rate": 0.0002,
+      "loss": 0.16297117233276368,
+      "mean_token_accuracy": 0.9456335234642029,
+      "num_tokens": 493133.0,
+      "step": 4500
+    },
+    {
+      "entropy": 0.16325506687164307,
+      "epoch": 0.9726999140154772,
+      "grad_norm": 0.2662312686443329,
+      "learning_rate": 0.0002,
+      "loss": 0.1621280288696289,
+      "mean_token_accuracy": 0.94725031375885,
+      "num_tokens": 523582.0,
+      "step": 4525
+    },
+    {
+      "entropy": 0.17149330377578736,
+      "epoch": 0.9780739466895959,
+      "grad_norm": 0.255045622587204,
+      "learning_rate": 0.0002,
+      "loss": 0.1708805465698242,
+      "mean_token_accuracy": 0.9442848777770996,
+      "num_tokens": 554347.0,
+      "step": 4550
+    },
+    {
+      "entropy": 0.1666904228925705,
+      "epoch": 0.9834479793637145,
+      "grad_norm": 0.29972079396247864,
+      "learning_rate": 0.0002,
+      "loss": 0.16790952682495117,
+      "mean_token_accuracy": 0.9447818112373352,
+      "num_tokens": 585240.0,
+      "step": 4575
+    },
+    {
+      "entropy": 0.15647386968135835,
+      "epoch": 0.9888220120378332,
+      "grad_norm": 0.2015724927186966,
+      "learning_rate": 0.0002,
+      "loss": 0.15715859413146974,
+      "mean_token_accuracy": 0.947631905078888,
+      "num_tokens": 615339.0,
+      "step": 4600
+    },
+    {
+      "entropy": 0.1566900384426117,
+      "epoch": 0.9941960447119519,
+      "grad_norm": 0.3145524561405182,
+      "learning_rate": 0.0002,
+      "loss": 0.15771458625793458,
+      "mean_token_accuracy": 0.9484156608581543,
+      "num_tokens": 645469.0,
+      "step": 4625
+    },
+    {
+      "entropy": 0.18080857157707214,
+      "epoch": 0.9995700773860705,
+      "grad_norm": 0.2863779664039612,
+      "learning_rate": 0.0002,
+      "loss": 0.18163055419921875,
+      "mean_token_accuracy": 0.9397158980369568,
+      "num_tokens": 677384.0,
+      "step": 4650
     }
   ],
   "logging_steps": 25,
       "attributes": {}
     }
   },
+  "total_flos": 8.020771235899546e+16,
   "train_batch_size": 3,
   "trial_name": null,
   "trial_params": null