Upload checkpoint checkpoint-500

Browse files

Files changed (9) hide show

checkpoint-500/README.md +9 -1
checkpoint-500/adapter_config.json +2 -0
checkpoint-500/adapter_model.safetensors +1 -1
checkpoint-500/optimizer.pt +1 -1
checkpoint-500/rng_state.pth +1 -1
checkpoint-500/scaler.pt +1 -1
checkpoint-500/scheduler.pt +1 -1
checkpoint-500/trainer_state.json +202 -202
checkpoint-500/training_args.bin +1 -1

checkpoint-500/README.md CHANGED Viewed

@@ -1,6 +1,14 @@
 ---
 base_model: unsloth/gemma-3n-e4b-it-unsloth-bnb-4bit
 library_name: peft
 ---
 # Model Card for Model ID
@@ -199,4 +207,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 [More Information Needed]
 ### Framework versions
-- PEFT 0.15.2

 ---
 base_model: unsloth/gemma-3n-e4b-it-unsloth-bnb-4bit
 library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-3n-e4b-it-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
 ---
 # Model Card for Model ID
 [More Information Needed]
 ### Framework versions
+- PEFT 0.16.0

checkpoint-500/adapter_config.json CHANGED Viewed

@@ -20,6 +20,7 @@
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
   "r": 64,
   "rank_pattern": {},
   "revision": null,
@@ -27,5 +28,6 @@
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
   "use_dora": false,
   "use_rslora": false
 }

   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
+  "qalora_group_size": 16,
   "r": 64,
   "rank_pattern": {},
   "revision": null,
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
   "use_dora": false,
+  "use_qalora": false,
   "use_rslora": false
 }

checkpoint-500/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20737e909449720581ff16c0395e4434aef4aa1a2838565f12d7d3f50c8d29ed
 size 614801160

 version https://git-lfs.github.com/spec/v1
+oid sha256:35432e3448dc399d7009235ea69c2896cbd43fab141893e34024ec3662721349
 size 614801160

checkpoint-500/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ead4f9b8b66c7d11c9633ec957442074ec6296b47a0e6b3c10bdc2f16366a55e
 size 314017998

 version https://git-lfs.github.com/spec/v1
+oid sha256:0591fb6a9ab0a59d2746b4236495547239e87606c8e4da40eaf107f259cc4a5e
 size 314017998

checkpoint-500/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:38ca74abb9af1cc9151b91103870122159c1f0b4cbbe035d58feaf102cb270d5
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:3d7ecf15e83ac4d18e0d90f8a44821af2f304313a6ae05eeb21767226a79c463
 size 14244

checkpoint-500/scaler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3fb08505854ee0c6308e1c2e695e68ee75ef6d1922625fe97977938ccc7f5139
 size 988

 version https://git-lfs.github.com/spec/v1
+oid sha256:a0d26e61df469459d42b9c7f0105c1e800955e9b52f2335feda3ebc3ccf0aeef
 size 988

checkpoint-500/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d8b94494ed01293cc68870f97cfdbf0218ffce35b622e47ada9513dc7755853
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:51724c0c2f76605ef4dbed23f46627f0a0309d9b31ccd5ac442e22fde361928f
 size 1064

checkpoint-500/trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.0549618320610685,
   "eval_steps": 500,
   "global_step": 500,
   "is_hyper_param_search": false,
@@ -10,360 +10,360 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.061068702290076333,
       "grad_norm": NaN,
       "learning_rate": 0.0,
-      "loss": 9.249,
       "step": 10
     },
     {
-      "epoch": 0.12213740458015267,
-      "grad_norm": 4165304.25,
-      "learning_rate": 7.317073170731707e-09,
-      "loss": 9.3268,
       "step": 20
     },
     {
-      "epoch": 0.183206106870229,
-      "grad_norm": 52283.171875,
-      "learning_rate": 3.170731707317073e-08,
-      "loss": 5.2487,
       "step": 30
     },
     {
-      "epoch": 0.24427480916030533,
-      "grad_norm": 34552.171875,
-      "learning_rate": 5.609756097560976e-08,
-      "loss": 3.5598,
       "step": 40
     },
     {
-      "epoch": 0.3053435114503817,
-      "grad_norm": 26069.314453125,
-      "learning_rate": 8.048780487804878e-08,
-      "loss": 3.2428,
       "step": 50
     },
     {
-      "epoch": 0.366412213740458,
-      "grad_norm": 17445.56640625,
-      "learning_rate": 1.048780487804878e-07,
-      "loss": 3.2248,
       "step": 60
     },
     {
-      "epoch": 0.42748091603053434,
-      "grad_norm": 23311.046875,
-      "learning_rate": 1.2926829268292682e-07,
-      "loss": 3.2595,
       "step": 70
     },
     {
-      "epoch": 0.48854961832061067,
-      "grad_norm": 13832.998046875,
-      "learning_rate": 1.5365853658536586e-07,
-      "loss": 3.0941,
       "step": 80
     },
     {
-      "epoch": 0.549618320610687,
-      "grad_norm": 11702.7421875,
-      "learning_rate": 1.7804878048780488e-07,
-      "loss": 3.0741,
       "step": 90
     },
     {
-      "epoch": 0.6106870229007634,
-      "grad_norm": 11529.9228515625,
-      "learning_rate": 1.997289972899729e-07,
-      "loss": 3.163,
       "step": 100
     },
     {
-      "epoch": 0.6717557251908397,
-      "grad_norm": 9445.1396484375,
-      "learning_rate": 1.970189701897019e-07,
-      "loss": 3.0271,
       "step": 110
     },
     {
-      "epoch": 0.732824427480916,
-      "grad_norm": 9955.53515625,
-      "learning_rate": 1.9430894308943088e-07,
-      "loss": 2.9452,
       "step": 120
     },
     {
-      "epoch": 0.7938931297709924,
-      "grad_norm": 8468.791015625,
-      "learning_rate": 1.915989159891599e-07,
-      "loss": 2.9369,
       "step": 130
     },
     {
-      "epoch": 0.8549618320610687,
-      "grad_norm": 9535.78125,
-      "learning_rate": 1.8888888888888888e-07,
-      "loss": 2.8495,
       "step": 140
     },
     {
-      "epoch": 0.916030534351145,
-      "grad_norm": 6875.52783203125,
-      "learning_rate": 1.861788617886179e-07,
-      "loss": 2.9174,
       "step": 150
     },
     {
-      "epoch": 0.9770992366412213,
-      "grad_norm": 7618.35107421875,
-      "learning_rate": 1.8346883468834688e-07,
-      "loss": 2.8059,
       "step": 160
     },
     {
-      "epoch": 1.036641221374046,
-      "grad_norm": 7838.251953125,
-      "learning_rate": 1.8075880758807586e-07,
-      "loss": 2.7811,
       "step": 170
     },
     {
-      "epoch": 1.0977099236641221,
-      "grad_norm": 7657.27880859375,
-      "learning_rate": 1.7804878048780488e-07,
-      "loss": 2.7413,
       "step": 180
     },
     {
-      "epoch": 1.1587786259541986,
-      "grad_norm": 7777.9345703125,
-      "learning_rate": 1.753387533875339e-07,
-      "loss": 2.7186,
       "step": 190
     },
     {
-      "epoch": 1.2198473282442748,
-      "grad_norm": 8100.41259765625,
-      "learning_rate": 1.7262872628726285e-07,
-      "loss": 2.6789,
       "step": 200
     },
     {
-      "epoch": 1.2809160305343512,
-      "grad_norm": 6984.42822265625,
-      "learning_rate": 1.6991869918699186e-07,
-      "loss": 2.608,
       "step": 210
     },
     {
-      "epoch": 1.3419847328244274,
-      "grad_norm": 7442.26806640625,
-      "learning_rate": 1.6720867208672087e-07,
-      "loss": 2.5542,
       "step": 220
     },
     {
-      "epoch": 1.4030534351145039,
-      "grad_norm": 7562.8271484375,
-      "learning_rate": 1.6449864498644986e-07,
-      "loss": 2.7031,
       "step": 230
     },
     {
-      "epoch": 1.46412213740458,
-      "grad_norm": 7235.27392578125,
-      "learning_rate": 1.6178861788617885e-07,
-      "loss": 2.6075,
       "step": 240
     },
     {
-      "epoch": 1.5251908396946565,
-      "grad_norm": 7698.4599609375,
-      "learning_rate": 1.5907859078590786e-07,
-      "loss": 2.5737,
       "step": 250
     },
     {
-      "epoch": 1.5862595419847327,
-      "grad_norm": 6776.5927734375,
-      "learning_rate": 1.5636856368563685e-07,
-      "loss": 2.6098,
       "step": 260
     },
     {
-      "epoch": 1.6473282442748092,
-      "grad_norm": 6810.216796875,
-      "learning_rate": 1.5365853658536586e-07,
-      "loss": 2.6596,
       "step": 270
     },
     {
-      "epoch": 1.7083969465648856,
-      "grad_norm": 8227.4892578125,
-      "learning_rate": 1.5094850948509485e-07,
-      "loss": 2.5461,
       "step": 280
     },
     {
-      "epoch": 1.7694656488549618,
-      "grad_norm": 6727.93212890625,
-      "learning_rate": 1.4823848238482383e-07,
-      "loss": 2.5579,
       "step": 290
     },
     {
-      "epoch": 1.830534351145038,
-      "grad_norm": 7243.64111328125,
-      "learning_rate": 1.4552845528455284e-07,
-      "loss": 2.5538,
       "step": 300
     },
     {
-      "epoch": 1.8916030534351145,
-      "grad_norm": 6177.71240234375,
-      "learning_rate": 1.4281842818428186e-07,
-      "loss": 2.4577,
       "step": 310
     },
     {
-      "epoch": 1.952671755725191,
-      "grad_norm": 7574.3271484375,
-      "learning_rate": 1.4010840108401082e-07,
-      "loss": 2.4778,
       "step": 320
     },
     {
-      "epoch": 2.018320610687023,
-      "grad_norm": 6852.73095703125,
-      "learning_rate": 1.3739837398373983e-07,
-      "loss": 2.7076,
       "step": 330
     },
     {
-      "epoch": 2.0793893129770993,
-      "grad_norm": 6963.75927734375,
-      "learning_rate": 1.3468834688346884e-07,
-      "loss": 2.3829,
       "step": 340
     },
     {
-      "epoch": 2.1404580152671757,
-      "grad_norm": 7274.21630859375,
-      "learning_rate": 1.3197831978319783e-07,
-      "loss": 2.4186,
       "step": 350
     },
     {
-      "epoch": 2.1954198473282442,
-      "grad_norm": 6927.31884765625,
-      "learning_rate": 1.2926829268292682e-07,
-      "loss": 2.5768,
       "step": 360
     },
     {
-      "epoch": 2.2564885496183207,
-      "grad_norm": 6366.95458984375,
-      "learning_rate": 1.265582655826558e-07,
-      "loss": 2.3761,
       "step": 370
     },
     {
-      "epoch": 2.317557251908397,
-      "grad_norm": 7380.29638671875,
-      "learning_rate": 1.2384823848238481e-07,
-      "loss": 2.289,
       "step": 380
     },
     {
-      "epoch": 2.378625954198473,
-      "grad_norm": 5605.09375,
-      "learning_rate": 1.2113821138211383e-07,
-      "loss": 2.4026,
       "step": 390
     },
     {
-      "epoch": 2.4396946564885496,
-      "grad_norm": 7360.8037109375,
-      "learning_rate": 1.184281842818428e-07,
-      "loss": 2.5143,
       "step": 400
     },
     {
-      "epoch": 2.500763358778626,
-      "grad_norm": 6937.611328125,
-      "learning_rate": 1.1571815718157181e-07,
-      "loss": 2.3633,
       "step": 410
     },
     {
-      "epoch": 2.5618320610687024,
-      "grad_norm": 7583.28173828125,
-      "learning_rate": 1.1300813008130081e-07,
-      "loss": 2.4694,
       "step": 420
     },
     {
-      "epoch": 2.6229007633587784,
-      "grad_norm": 6622.31201171875,
-      "learning_rate": 1.102981029810298e-07,
-      "loss": 2.4963,
       "step": 430
     },
     {
-      "epoch": 2.683969465648855,
-      "grad_norm": 6935.6689453125,
-      "learning_rate": 1.075880758807588e-07,
-      "loss": 2.3218,
       "step": 440
     },
     {
-      "epoch": 2.7450381679389313,
-      "grad_norm": 7263.111328125,
-      "learning_rate": 1.048780487804878e-07,
-      "loss": 2.3239,
       "step": 450
     },
     {
-      "epoch": 2.8061068702290077,
-      "grad_norm": 7146.81787109375,
-      "learning_rate": 1.0216802168021679e-07,
-      "loss": 2.4087,
       "step": 460
     },
     {
-      "epoch": 2.867175572519084,
-      "grad_norm": 7680.18701171875,
-      "learning_rate": 9.94579945799458e-08,
-      "loss": 2.2903,
       "step": 470
     },
     {
-      "epoch": 2.92824427480916,
-      "grad_norm": 6631.55224609375,
-      "learning_rate": 9.67479674796748e-08,
-      "loss": 2.3723,
       "step": 480
     },
     {
-      "epoch": 2.9893129770992366,
-      "grad_norm": 8262.0439453125,
-      "learning_rate": 9.40379403794038e-08,
-      "loss": 2.2509,
       "step": 490
     },
     {
-      "epoch": 3.0549618320610685,
-      "grad_norm": 6400.1064453125,
-      "learning_rate": 9.132791327913278e-08,
-      "loss": 2.4533,
       "step": 500
     }
   ],
   "logging_steps": 10,
-  "max_steps": 820,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
   "save_steps": 50,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -377,7 +377,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.735794843165885e+16,
   "train_batch_size": 5,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.6178010471204187,
   "eval_steps": 500,
   "global_step": 500,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.05235602094240838,
       "grad_norm": NaN,
       "learning_rate": 0.0,
+      "loss": 9.7408,
       "step": 10
     },
     {
+      "epoch": 0.10471204188481675,
+      "grad_norm": 202209.765625,
+      "learning_rate": 3.1413612565445024e-08,
+      "loss": 9.1548,
       "step": 20
     },
     {
+      "epoch": 0.15706806282722513,
+      "grad_norm": 27189.787109375,
+      "learning_rate": 1.3612565445026178e-07,
+      "loss": 3.8451,
       "step": 30
     },
     {
+      "epoch": 0.2094240837696335,
+      "grad_norm": 10457.34765625,
+      "learning_rate": 2.4083769633507854e-07,
+      "loss": 3.267,
       "step": 40
     },
     {
+      "epoch": 0.2617801047120419,
+      "grad_norm": 8087.2939453125,
+      "learning_rate": 3.4554973821989523e-07,
+      "loss": 3.0939,
       "step": 50
     },
     {
+      "epoch": 0.31413612565445026,
+      "grad_norm": 7214.744140625,
+      "learning_rate": 4.50261780104712e-07,
+      "loss": 3.0211,
       "step": 60
     },
     {
+      "epoch": 0.36649214659685864,
+      "grad_norm": 6162.826171875,
+      "learning_rate": 5.549738219895288e-07,
+      "loss": 2.846,
       "step": 70
     },
     {
+      "epoch": 0.418848167539267,
+      "grad_norm": 4688.05615234375,
+      "learning_rate": 6.596858638743455e-07,
+      "loss": 2.8104,
       "step": 80
     },
     {
+      "epoch": 0.4712041884816754,
+      "grad_norm": 3856.7578125,
+      "learning_rate": 7.643979057591623e-07,
+      "loss": 2.8735,
       "step": 90
     },
     {
+      "epoch": 0.5235602094240838,
+      "grad_norm": 3529.413330078125,
+      "learning_rate": 8.691099476439791e-07,
+      "loss": 2.8117,
       "step": 100
     },
     {
+      "epoch": 0.5759162303664922,
+      "grad_norm": 2830.52734375,
+      "learning_rate": 9.738219895287958e-07,
+      "loss": 2.7099,
       "step": 110
     },
     {
+      "epoch": 0.6282722513089005,
+      "grad_norm": 2316.537353515625,
+      "learning_rate": 1.0785340314136124e-06,
+      "loss": 2.6387,
       "step": 120
     },
     {
+      "epoch": 0.680628272251309,
+      "grad_norm": 2685.246826171875,
+      "learning_rate": 1.1832460732984293e-06,
+      "loss": 2.6667,
       "step": 130
     },
     {
+      "epoch": 0.7329842931937173,
+      "grad_norm": 2066.593017578125,
+      "learning_rate": 1.2879581151832458e-06,
+      "loss": 2.5786,
       "step": 140
     },
     {
+      "epoch": 0.7853403141361257,
+      "grad_norm": 2110.41748046875,
+      "learning_rate": 1.3926701570680628e-06,
+      "loss": 2.4927,
       "step": 150
     },
     {
+      "epoch": 0.837696335078534,
+      "grad_norm": 1557.745849609375,
+      "learning_rate": 1.4973821989528795e-06,
+      "loss": 2.6125,
       "step": 160
     },
     {
+      "epoch": 0.8900523560209425,
+      "grad_norm": 1510.9991455078125,
+      "learning_rate": 1.6020942408376963e-06,
+      "loss": 2.5048,
       "step": 170
     },
     {
+      "epoch": 0.9424083769633508,
+      "grad_norm": 1395.5841064453125,
+      "learning_rate": 1.706806282722513e-06,
+      "loss": 2.5049,
       "step": 180
     },
     {
+      "epoch": 0.9947643979057592,
+      "grad_norm": 1400.4466552734375,
+      "learning_rate": 1.8115183246073297e-06,
+      "loss": 2.4902,
       "step": 190
     },
     {
+      "epoch": 1.0471204188481675,
+      "grad_norm": 1328.171142578125,
+      "learning_rate": 1.9162303664921463e-06,
+      "loss": 2.3063,
       "step": 200
     },
     {
+      "epoch": 1.0994764397905759,
+      "grad_norm": 1169.1490478515625,
+      "learning_rate": 1.997673065735893e-06,
+      "loss": 2.3826,
       "step": 210
     },
     {
+      "epoch": 1.1518324607329844,
+      "grad_norm": 1007.3028564453125,
+      "learning_rate": 1.9860383944153577e-06,
+      "loss": 2.2646,
       "step": 220
     },
     {
+      "epoch": 1.2041884816753927,
+      "grad_norm": 905.8086547851562,
+      "learning_rate": 1.9744037230948225e-06,
+      "loss": 2.3065,
       "step": 230
     },
     {
+      "epoch": 1.256544502617801,
+      "grad_norm": 904.2677001953125,
+      "learning_rate": 1.9627690517742874e-06,
+      "loss": 2.369,
       "step": 240
     },
     {
+      "epoch": 1.3089005235602094,
+      "grad_norm": 878.70751953125,
+      "learning_rate": 1.951134380453752e-06,
+      "loss": 2.2916,
       "step": 250
     },
     {
+      "epoch": 1.3612565445026177,
+      "grad_norm": 785.525146484375,
+      "learning_rate": 1.9394997091332166e-06,
+      "loss": 2.2916,
       "step": 260
     },
     {
+      "epoch": 1.4136125654450262,
+      "grad_norm": 715.8485107421875,
+      "learning_rate": 1.927865037812682e-06,
+      "loss": 2.247,
       "step": 270
     },
     {
+      "epoch": 1.4659685863874345,
+      "grad_norm": 742.1319580078125,
+      "learning_rate": 1.9162303664921463e-06,
+      "loss": 2.2293,
       "step": 280
     },
     {
+      "epoch": 1.518324607329843,
+      "grad_norm": 777.41259765625,
+      "learning_rate": 1.9045956951716113e-06,
+      "loss": 2.1447,
       "step": 290
     },
     {
+      "epoch": 1.5706806282722514,
+      "grad_norm": 693.8157348632812,
+      "learning_rate": 1.8929610238510761e-06,
+      "loss": 2.1851,
       "step": 300
     },
     {
+      "epoch": 1.6230366492146597,
+      "grad_norm": 707.2672119140625,
+      "learning_rate": 1.881326352530541e-06,
+      "loss": 2.1879,
       "step": 310
     },
     {
+      "epoch": 1.675392670157068,
+      "grad_norm": 727.61767578125,
+      "learning_rate": 1.8696916812100056e-06,
+      "loss": 2.1962,
       "step": 320
     },
     {
+      "epoch": 1.7277486910994764,
+      "grad_norm": 695.4833984375,
+      "learning_rate": 1.8580570098894706e-06,
+      "loss": 2.2057,
       "step": 330
     },
     {
+      "epoch": 1.7801047120418847,
+      "grad_norm": 614.199462890625,
+      "learning_rate": 1.8464223385689352e-06,
+      "loss": 2.0654,
       "step": 340
     },
     {
+      "epoch": 1.8324607329842932,
+      "grad_norm": 724.0316162109375,
+      "learning_rate": 1.8347876672484e-06,
+      "loss": 2.0803,
       "step": 350
     },
     {
+      "epoch": 1.8848167539267016,
+      "grad_norm": 664.735595703125,
+      "learning_rate": 1.823152995927865e-06,
+      "loss": 1.8995,
       "step": 360
     },
     {
+      "epoch": 1.93717277486911,
+      "grad_norm": 725.57373046875,
+      "learning_rate": 1.8115183246073297e-06,
+      "loss": 1.9195,
       "step": 370
     },
     {
+      "epoch": 1.9895287958115184,
+      "grad_norm": 680.0363159179688,
+      "learning_rate": 1.7998836532867946e-06,
+      "loss": 1.9157,
       "step": 380
     },
     {
+      "epoch": 2.0418848167539267,
+      "grad_norm": 656.7247314453125,
+      "learning_rate": 1.7882489819662594e-06,
+      "loss": 1.8435,
       "step": 390
     },
     {
+      "epoch": 2.094240837696335,
+      "grad_norm": 754.1705322265625,
+      "learning_rate": 1.776614310645724e-06,
+      "loss": 1.8308,
       "step": 400
     },
     {
+      "epoch": 2.1465968586387434,
+      "grad_norm": 811.6585693359375,
+      "learning_rate": 1.764979639325189e-06,
+      "loss": 1.8349,
       "step": 410
     },
     {
+      "epoch": 2.1989528795811517,
+      "grad_norm": 743.0385131835938,
+      "learning_rate": 1.7533449680046537e-06,
+      "loss": 1.8507,
       "step": 420
     },
     {
+      "epoch": 2.25130890052356,
+      "grad_norm": 748.2722778320312,
+      "learning_rate": 1.7417102966841187e-06,
+      "loss": 1.7967,
       "step": 430
     },
     {
+      "epoch": 2.303664921465969,
+      "grad_norm": 587.875732421875,
+      "learning_rate": 1.7300756253635833e-06,
+      "loss": 1.9958,
       "step": 440
     },
     {
+      "epoch": 2.356020942408377,
+      "grad_norm": 623.1217651367188,
+      "learning_rate": 1.7184409540430482e-06,
+      "loss": 1.8716,
       "step": 450
     },
     {
+      "epoch": 2.4083769633507854,
+      "grad_norm": 689.55126953125,
+      "learning_rate": 1.706806282722513e-06,
+      "loss": 1.8947,
       "step": 460
     },
     {
+      "epoch": 2.4607329842931938,
+      "grad_norm": 656.4078369140625,
+      "learning_rate": 1.6951716114019778e-06,
+      "loss": 1.8584,
       "step": 470
     },
     {
+      "epoch": 2.513089005235602,
+      "grad_norm": 672.116455078125,
+      "learning_rate": 1.6835369400814424e-06,
+      "loss": 1.8129,
       "step": 480
     },
     {
+      "epoch": 2.5654450261780104,
+      "grad_norm": 586.6629638671875,
+      "learning_rate": 1.6719022687609075e-06,
+      "loss": 1.8214,
       "step": 490
     },
     {
+      "epoch": 2.6178010471204187,
+      "grad_norm": 593.3026123046875,
+      "learning_rate": 1.6602675974403721e-06,
+      "loss": 1.802,
       "step": 500
     }
   ],
   "logging_steps": 10,
+  "max_steps": 1910,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
   "save_steps": 50,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 3.817928767049712e+16,
   "train_batch_size": 5,
   "trial_name": null,
   "trial_params": null

checkpoint-500/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:757d9ed6a271cc7dd663b202a023b2731e235bf47955a4f40ff4c18331f20ba4
 size 5816

 version https://git-lfs.github.com/spec/v1
+oid sha256:5a98a6f231f28700315fa8e9cb612a94ae1e99d1ff2b7795e1d31ff2c428a5d2
 size 5816