Training in progress, step 450, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/README.md +1 -10
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1054 -4

last-checkpoint/README.md CHANGED Viewed

@@ -1,14 +1,6 @@
 ---
 base_model: unsloth/phi-4-reasoning-unsloth-bnb-4bit
 library_name: peft
-pipeline_tag: text-generation
-tags:
-- base_model:adapter:unsloth/phi-4-reasoning-unsloth-bnb-4bit
-- lora
-- sft
-- transformers
-- trl
-- unsloth
 ---
 # Model Card for Model ID
@@ -207,5 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 [More Information Needed]
 ### Framework versions
-- PEFT 0.14.0
-- PEFT 0.18.1

 ---
 base_model: unsloth/phi-4-reasoning-unsloth-bnb-4bit
 library_name: peft
 ---
 # Model Card for Model ID
 [More Information Needed]
 ### Framework versions
+- PEFT 0.14.0

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:53985acb31dd0998666fd1ef3b26164afc3209f9a5b83687825d987aea43c9f6
 size 170415112

 version https://git-lfs.github.com/spec/v1
+oid sha256:46446edb55375c8e30837b741ea6efdf301ab5e7c33a3e01a2228e7f2984fcfa
 size 170415112

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b36722f4c91b9b0fc6f5a698586c1c70039d77cbf978f47df7ad99a7f0ecb044
 size 86718091

 version https://git-lfs.github.com/spec/v1
+oid sha256:a6a68463284b4ad2a06822b70257392d6961fee2250f7ab39d5edb82b4dcbafc
 size 86718091

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7c6534683ab3f989236ae99e3358a88f1314e102299761d5177d621f24a30eb4
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:0c2fa43c6f5c9db389e161efa317fdc098f9dc594a3395c416087750a9a40f32
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.3333333333333335,
   "eval_steps": 500,
-  "global_step": 300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2108,6 +2108,1056 @@
       "learning_rate": 6.786516853932583e-05,
       "loss": 0.8799,
       "step": 300
     }
   ],
   "logging_steps": 1,
@@ -2122,12 +3172,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 3.347440736403456e+18,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 5.0,
   "eval_steps": 500,
+  "global_step": 450,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 6.786516853932583e-05,
       "loss": 0.8799,
       "step": 300
+    },
+    {
+      "epoch": 3.3444444444444446,
+      "grad_norm": 0.06960175186395645,
+      "learning_rate": 6.741573033707866e-05,
+      "loss": 0.8587,
+      "step": 301
+    },
+    {
+      "epoch": 3.3555555555555556,
+      "grad_norm": 0.06346665322780609,
+      "learning_rate": 6.696629213483147e-05,
+      "loss": 0.8311,
+      "step": 302
+    },
+    {
+      "epoch": 3.3666666666666667,
+      "grad_norm": 0.06519903242588043,
+      "learning_rate": 6.651685393258428e-05,
+      "loss": 0.8537,
+      "step": 303
+    },
+    {
+      "epoch": 3.3777777777777778,
+      "grad_norm": 0.060187116265296936,
+      "learning_rate": 6.606741573033708e-05,
+      "loss": 0.8614,
+      "step": 304
+    },
+    {
+      "epoch": 3.388888888888889,
+      "grad_norm": 0.06387775391340256,
+      "learning_rate": 6.561797752808989e-05,
+      "loss": 0.8795,
+      "step": 305
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 0.06667324900627136,
+      "learning_rate": 6.51685393258427e-05,
+      "loss": 0.8086,
+      "step": 306
+    },
+    {
+      "epoch": 3.411111111111111,
+      "grad_norm": 0.06455954164266586,
+      "learning_rate": 6.47191011235955e-05,
+      "loss": 0.7697,
+      "step": 307
+    },
+    {
+      "epoch": 3.422222222222222,
+      "grad_norm": 0.06988281011581421,
+      "learning_rate": 6.426966292134831e-05,
+      "loss": 0.8821,
+      "step": 308
+    },
+    {
+      "epoch": 3.4333333333333336,
+      "grad_norm": 0.07037835568189621,
+      "learning_rate": 6.382022471910112e-05,
+      "loss": 0.7936,
+      "step": 309
+    },
+    {
+      "epoch": 3.4444444444444446,
+      "grad_norm": 0.060425762087106705,
+      "learning_rate": 6.337078651685394e-05,
+      "loss": 0.8403,
+      "step": 310
+    },
+    {
+      "epoch": 3.4555555555555557,
+      "grad_norm": 0.06361618638038635,
+      "learning_rate": 6.292134831460675e-05,
+      "loss": 0.8605,
+      "step": 311
+    },
+    {
+      "epoch": 3.466666666666667,
+      "grad_norm": 0.06325947493314743,
+      "learning_rate": 6.247191011235956e-05,
+      "loss": 0.9275,
+      "step": 312
+    },
+    {
+      "epoch": 3.477777777777778,
+      "grad_norm": 0.054963476955890656,
+      "learning_rate": 6.202247191011237e-05,
+      "loss": 0.78,
+      "step": 313
+    },
+    {
+      "epoch": 3.488888888888889,
+      "grad_norm": 0.09446276724338531,
+      "learning_rate": 6.157303370786517e-05,
+      "loss": 0.8164,
+      "step": 314
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.07087241113185883,
+      "learning_rate": 6.112359550561798e-05,
+      "loss": 0.9349,
+      "step": 315
+    },
+    {
+      "epoch": 3.511111111111111,
+      "grad_norm": 0.057490210980176926,
+      "learning_rate": 6.067415730337079e-05,
+      "loss": 0.8346,
+      "step": 316
+    },
+    {
+      "epoch": 3.522222222222222,
+      "grad_norm": 0.05834012106060982,
+      "learning_rate": 6.0224719101123596e-05,
+      "loss": 0.8413,
+      "step": 317
+    },
+    {
+      "epoch": 3.533333333333333,
+      "grad_norm": 0.0664343386888504,
+      "learning_rate": 5.977528089887641e-05,
+      "loss": 0.8908,
+      "step": 318
+    },
+    {
+      "epoch": 3.5444444444444443,
+      "grad_norm": 0.061850935220718384,
+      "learning_rate": 5.932584269662922e-05,
+      "loss": 0.8421,
+      "step": 319
+    },
+    {
+      "epoch": 3.5555555555555554,
+      "grad_norm": 0.07486843317747116,
+      "learning_rate": 5.8876404494382023e-05,
+      "loss": 0.7768,
+      "step": 320
+    },
+    {
+      "epoch": 3.5666666666666664,
+      "grad_norm": 0.06851966679096222,
+      "learning_rate": 5.8426966292134835e-05,
+      "loss": 0.7788,
+      "step": 321
+    },
+    {
+      "epoch": 3.5777777777777775,
+      "grad_norm": 0.06413612514734268,
+      "learning_rate": 5.7977528089887646e-05,
+      "loss": 0.8243,
+      "step": 322
+    },
+    {
+      "epoch": 3.588888888888889,
+      "grad_norm": 0.06908858567476273,
+      "learning_rate": 5.752808988764046e-05,
+      "loss": 0.8331,
+      "step": 323
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.056398071348667145,
+      "learning_rate": 5.7078651685393256e-05,
+      "loss": 0.8889,
+      "step": 324
+    },
+    {
+      "epoch": 3.611111111111111,
+      "grad_norm": 0.06279837340116501,
+      "learning_rate": 5.6629213483146074e-05,
+      "loss": 0.7433,
+      "step": 325
+    },
+    {
+      "epoch": 3.6222222222222222,
+      "grad_norm": 0.07156965136528015,
+      "learning_rate": 5.6179775280898885e-05,
+      "loss": 0.7733,
+      "step": 326
+    },
+    {
+      "epoch": 3.6333333333333333,
+      "grad_norm": 0.058655962347984314,
+      "learning_rate": 5.573033707865168e-05,
+      "loss": 0.7912,
+      "step": 327
+    },
+    {
+      "epoch": 3.6444444444444444,
+      "grad_norm": 0.06260320544242859,
+      "learning_rate": 5.5280898876404495e-05,
+      "loss": 0.8112,
+      "step": 328
+    },
+    {
+      "epoch": 3.6555555555555554,
+      "grad_norm": 0.08468577265739441,
+      "learning_rate": 5.4831460674157306e-05,
+      "loss": 0.8475,
+      "step": 329
+    },
+    {
+      "epoch": 3.6666666666666665,
+      "grad_norm": 0.06323560327291489,
+      "learning_rate": 5.438202247191011e-05,
+      "loss": 0.9234,
+      "step": 330
+    },
+    {
+      "epoch": 3.677777777777778,
+      "grad_norm": 0.0636834055185318,
+      "learning_rate": 5.393258426966292e-05,
+      "loss": 0.8859,
+      "step": 331
+    },
+    {
+      "epoch": 3.688888888888889,
+      "grad_norm": 0.06598466634750366,
+      "learning_rate": 5.3483146067415734e-05,
+      "loss": 0.8385,
+      "step": 332
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 0.0580470897257328,
+      "learning_rate": 5.3033707865168545e-05,
+      "loss": 0.8485,
+      "step": 333
+    },
+    {
+      "epoch": 3.7111111111111112,
+      "grad_norm": 0.06465502083301544,
+      "learning_rate": 5.258426966292135e-05,
+      "loss": 0.8595,
+      "step": 334
+    },
+    {
+      "epoch": 3.7222222222222223,
+      "grad_norm": 0.06161164864897728,
+      "learning_rate": 5.213483146067416e-05,
+      "loss": 0.8341,
+      "step": 335
+    },
+    {
+      "epoch": 3.7333333333333334,
+      "grad_norm": 0.07357273995876312,
+      "learning_rate": 5.168539325842697e-05,
+      "loss": 0.7917,
+      "step": 336
+    },
+    {
+      "epoch": 3.7444444444444445,
+      "grad_norm": 0.06672197580337524,
+      "learning_rate": 5.123595505617977e-05,
+      "loss": 0.7472,
+      "step": 337
+    },
+    {
+      "epoch": 3.7555555555555555,
+      "grad_norm": 0.06582935154438019,
+      "learning_rate": 5.078651685393259e-05,
+      "loss": 0.9154,
+      "step": 338
+    },
+    {
+      "epoch": 3.7666666666666666,
+      "grad_norm": 0.06434327363967896,
+      "learning_rate": 5.03370786516854e-05,
+      "loss": 0.7815,
+      "step": 339
+    },
+    {
+      "epoch": 3.7777777777777777,
+      "grad_norm": 0.06618902832269669,
+      "learning_rate": 4.9887640449438205e-05,
+      "loss": 0.8484,
+      "step": 340
+    },
+    {
+      "epoch": 3.7888888888888888,
+      "grad_norm": 0.05907148867845535,
+      "learning_rate": 4.943820224719101e-05,
+      "loss": 0.8342,
+      "step": 341
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.0687561109662056,
+      "learning_rate": 4.898876404494382e-05,
+      "loss": 0.7931,
+      "step": 342
+    },
+    {
+      "epoch": 3.811111111111111,
+      "grad_norm": 0.06155551224946976,
+      "learning_rate": 4.853932584269663e-05,
+      "loss": 0.8265,
+      "step": 343
+    },
+    {
+      "epoch": 3.822222222222222,
+      "grad_norm": 0.06934966892004013,
+      "learning_rate": 4.808988764044944e-05,
+      "loss": 0.8432,
+      "step": 344
+    },
+    {
+      "epoch": 3.8333333333333335,
+      "grad_norm": 0.06328413635492325,
+      "learning_rate": 4.764044943820225e-05,
+      "loss": 0.9138,
+      "step": 345
+    },
+    {
+      "epoch": 3.8444444444444446,
+      "grad_norm": 0.07244782894849777,
+      "learning_rate": 4.719101123595506e-05,
+      "loss": 0.8205,
+      "step": 346
+    },
+    {
+      "epoch": 3.8555555555555556,
+      "grad_norm": 0.06606698781251907,
+      "learning_rate": 4.674157303370787e-05,
+      "loss": 0.7833,
+      "step": 347
+    },
+    {
+      "epoch": 3.8666666666666667,
+      "grad_norm": 0.06491662561893463,
+      "learning_rate": 4.629213483146068e-05,
+      "loss": 0.834,
+      "step": 348
+    },
+    {
+      "epoch": 3.8777777777777778,
+      "grad_norm": 0.059926439076662064,
+      "learning_rate": 4.584269662921348e-05,
+      "loss": 0.7969,
+      "step": 349
+    },
+    {
+      "epoch": 3.888888888888889,
+      "grad_norm": 0.060968831181526184,
+      "learning_rate": 4.539325842696629e-05,
+      "loss": 0.846,
+      "step": 350
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 0.07196994125843048,
+      "learning_rate": 4.4943820224719104e-05,
+      "loss": 0.8191,
+      "step": 351
+    },
+    {
+      "epoch": 3.911111111111111,
+      "grad_norm": 0.061406128108501434,
+      "learning_rate": 4.4494382022471916e-05,
+      "loss": 0.8128,
+      "step": 352
+    },
+    {
+      "epoch": 3.9222222222222225,
+      "grad_norm": 0.05814112350344658,
+      "learning_rate": 4.404494382022472e-05,
+      "loss": 0.8453,
+      "step": 353
+    },
+    {
+      "epoch": 3.9333333333333336,
+      "grad_norm": 0.06283029168844223,
+      "learning_rate": 4.3595505617977525e-05,
+      "loss": 0.899,
+      "step": 354
+    },
+    {
+      "epoch": 3.9444444444444446,
+      "grad_norm": 0.06355167180299759,
+      "learning_rate": 4.314606741573034e-05,
+      "loss": 0.7727,
+      "step": 355
+    },
+    {
+      "epoch": 3.9555555555555557,
+      "grad_norm": 0.06236105412244797,
+      "learning_rate": 4.269662921348315e-05,
+      "loss": 0.8153,
+      "step": 356
+    },
+    {
+      "epoch": 3.966666666666667,
+      "grad_norm": 0.07224887609481812,
+      "learning_rate": 4.224719101123596e-05,
+      "loss": 0.7292,
+      "step": 357
+    },
+    {
+      "epoch": 3.977777777777778,
+      "grad_norm": 0.06908698379993439,
+      "learning_rate": 4.1797752808988764e-05,
+      "loss": 0.7171,
+      "step": 358
+    },
+    {
+      "epoch": 3.988888888888889,
+      "grad_norm": 0.06701194494962692,
+      "learning_rate": 4.1348314606741576e-05,
+      "loss": 0.8333,
+      "step": 359
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.06868044286966324,
+      "learning_rate": 4.089887640449439e-05,
+      "loss": 0.7593,
+      "step": 360
+    },
+    {
+      "epoch": 4.011111111111111,
+      "grad_norm": 0.059760935604572296,
+      "learning_rate": 4.044943820224719e-05,
+      "loss": 0.8347,
+      "step": 361
+    },
+    {
+      "epoch": 4.022222222222222,
+      "grad_norm": 0.06333184987306595,
+      "learning_rate": 4e-05,
+      "loss": 0.8283,
+      "step": 362
+    },
+    {
+      "epoch": 4.033333333333333,
+      "grad_norm": 0.05701572820544243,
+      "learning_rate": 3.955056179775281e-05,
+      "loss": 0.8445,
+      "step": 363
+    },
+    {
+      "epoch": 4.044444444444444,
+      "grad_norm": 0.057465020567178726,
+      "learning_rate": 3.910112359550562e-05,
+      "loss": 0.8068,
+      "step": 364
+    },
+    {
+      "epoch": 4.055555555555555,
+      "grad_norm": 0.07226772606372833,
+      "learning_rate": 3.865168539325843e-05,
+      "loss": 0.8497,
+      "step": 365
+    },
+    {
+      "epoch": 4.066666666666666,
+      "grad_norm": 0.07886355370283127,
+      "learning_rate": 3.8202247191011236e-05,
+      "loss": 0.8044,
+      "step": 366
+    },
+    {
+      "epoch": 4.0777777777777775,
+      "grad_norm": 0.071531742811203,
+      "learning_rate": 3.775280898876405e-05,
+      "loss": 0.7713,
+      "step": 367
+    },
+    {
+      "epoch": 4.088888888888889,
+      "grad_norm": 0.07198038697242737,
+      "learning_rate": 3.730337078651686e-05,
+      "loss": 0.8685,
+      "step": 368
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 0.07044125348329544,
+      "learning_rate": 3.685393258426966e-05,
+      "loss": 0.767,
+      "step": 369
+    },
+    {
+      "epoch": 4.111111111111111,
+      "grad_norm": 0.07338492572307587,
+      "learning_rate": 3.6404494382022475e-05,
+      "loss": 0.7563,
+      "step": 370
+    },
+    {
+      "epoch": 4.122222222222222,
+      "grad_norm": 0.06580597162246704,
+      "learning_rate": 3.595505617977528e-05,
+      "loss": 0.8228,
+      "step": 371
+    },
+    {
+      "epoch": 4.133333333333334,
+      "grad_norm": 0.06712008267641068,
+      "learning_rate": 3.550561797752809e-05,
+      "loss": 0.8644,
+      "step": 372
+    },
+    {
+      "epoch": 4.144444444444445,
+      "grad_norm": 0.0610797144472599,
+      "learning_rate": 3.50561797752809e-05,
+      "loss": 0.7718,
+      "step": 373
+    },
+    {
+      "epoch": 4.155555555555556,
+      "grad_norm": 0.07456668466329575,
+      "learning_rate": 3.460674157303371e-05,
+      "loss": 0.7259,
+      "step": 374
+    },
+    {
+      "epoch": 4.166666666666667,
+      "grad_norm": 0.05761990696191788,
+      "learning_rate": 3.415730337078652e-05,
+      "loss": 0.8152,
+      "step": 375
+    },
+    {
+      "epoch": 4.177777777777778,
+      "grad_norm": 0.06759507954120636,
+      "learning_rate": 3.370786516853933e-05,
+      "loss": 0.8665,
+      "step": 376
+    },
+    {
+      "epoch": 4.188888888888889,
+      "grad_norm": 0.06531881541013718,
+      "learning_rate": 3.325842696629214e-05,
+      "loss": 0.8211,
+      "step": 377
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 0.06870734691619873,
+      "learning_rate": 3.2808988764044946e-05,
+      "loss": 0.7547,
+      "step": 378
+    },
+    {
+      "epoch": 4.211111111111111,
+      "grad_norm": 0.06056152656674385,
+      "learning_rate": 3.235955056179775e-05,
+      "loss": 0.7779,
+      "step": 379
+    },
+    {
+      "epoch": 4.222222222222222,
+      "grad_norm": 0.06231197342276573,
+      "learning_rate": 3.191011235955056e-05,
+      "loss": 0.767,
+      "step": 380
+    },
+    {
+      "epoch": 4.233333333333333,
+      "grad_norm": 0.06242848560214043,
+      "learning_rate": 3.1460674157303374e-05,
+      "loss": 0.8473,
+      "step": 381
+    },
+    {
+      "epoch": 4.2444444444444445,
+      "grad_norm": 0.06435809284448624,
+      "learning_rate": 3.1011235955056185e-05,
+      "loss": 0.898,
+      "step": 382
+    },
+    {
+      "epoch": 4.2555555555555555,
+      "grad_norm": 0.06603720039129257,
+      "learning_rate": 3.056179775280899e-05,
+      "loss": 0.8338,
+      "step": 383
+    },
+    {
+      "epoch": 4.266666666666667,
+      "grad_norm": 0.06830265372991562,
+      "learning_rate": 3.0112359550561798e-05,
+      "loss": 0.8323,
+      "step": 384
+    },
+    {
+      "epoch": 4.277777777777778,
+      "grad_norm": 0.06457255780696869,
+      "learning_rate": 2.966292134831461e-05,
+      "loss": 0.8353,
+      "step": 385
+    },
+    {
+      "epoch": 4.288888888888889,
+      "grad_norm": 0.06458742916584015,
+      "learning_rate": 2.9213483146067417e-05,
+      "loss": 0.7949,
+      "step": 386
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 0.07112720608711243,
+      "learning_rate": 2.876404494382023e-05,
+      "loss": 0.7822,
+      "step": 387
+    },
+    {
+      "epoch": 4.311111111111111,
+      "grad_norm": 0.06759954988956451,
+      "learning_rate": 2.8314606741573037e-05,
+      "loss": 0.7434,
+      "step": 388
+    },
+    {
+      "epoch": 4.322222222222222,
+      "grad_norm": 0.061707496643066406,
+      "learning_rate": 2.786516853932584e-05,
+      "loss": 0.7928,
+      "step": 389
+    },
+    {
+      "epoch": 4.333333333333333,
+      "grad_norm": 0.07415860146284103,
+      "learning_rate": 2.7415730337078653e-05,
+      "loss": 0.7564,
+      "step": 390
+    },
+    {
+      "epoch": 4.344444444444444,
+      "grad_norm": 0.06894145160913467,
+      "learning_rate": 2.696629213483146e-05,
+      "loss": 0.7985,
+      "step": 391
+    },
+    {
+      "epoch": 4.355555555555555,
+      "grad_norm": 0.06181568279862404,
+      "learning_rate": 2.6516853932584273e-05,
+      "loss": 0.8546,
+      "step": 392
+    },
+    {
+      "epoch": 4.366666666666666,
+      "grad_norm": 0.0904124304652214,
+      "learning_rate": 2.606741573033708e-05,
+      "loss": 0.8677,
+      "step": 393
+    },
+    {
+      "epoch": 4.377777777777778,
+      "grad_norm": 0.06363783776760101,
+      "learning_rate": 2.5617977528089885e-05,
+      "loss": 0.7788,
+      "step": 394
+    },
+    {
+      "epoch": 4.388888888888889,
+      "grad_norm": 0.06672205030918121,
+      "learning_rate": 2.51685393258427e-05,
+      "loss": 0.8017,
+      "step": 395
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.07275419682264328,
+      "learning_rate": 2.4719101123595505e-05,
+      "loss": 0.7785,
+      "step": 396
+    },
+    {
+      "epoch": 4.411111111111111,
+      "grad_norm": 0.05966269597411156,
+      "learning_rate": 2.4269662921348316e-05,
+      "loss": 0.8946,
+      "step": 397
+    },
+    {
+      "epoch": 4.4222222222222225,
+      "grad_norm": 0.07346921414136887,
+      "learning_rate": 2.3820224719101125e-05,
+      "loss": 0.8088,
+      "step": 398
+    },
+    {
+      "epoch": 4.433333333333334,
+      "grad_norm": 0.06261128932237625,
+      "learning_rate": 2.3370786516853936e-05,
+      "loss": 0.7987,
+      "step": 399
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 0.06262333691120148,
+      "learning_rate": 2.292134831460674e-05,
+      "loss": 0.8036,
+      "step": 400
+    },
+    {
+      "epoch": 4.455555555555556,
+      "grad_norm": 0.061543941497802734,
+      "learning_rate": 2.2471910112359552e-05,
+      "loss": 0.7783,
+      "step": 401
+    },
+    {
+      "epoch": 4.466666666666667,
+      "grad_norm": 0.061959441751241684,
+      "learning_rate": 2.202247191011236e-05,
+      "loss": 0.8373,
+      "step": 402
+    },
+    {
+      "epoch": 4.477777777777778,
+      "grad_norm": 0.06770654022693634,
+      "learning_rate": 2.157303370786517e-05,
+      "loss": 0.843,
+      "step": 403
+    },
+    {
+      "epoch": 4.488888888888889,
+      "grad_norm": 0.06563393771648407,
+      "learning_rate": 2.112359550561798e-05,
+      "loss": 0.7825,
+      "step": 404
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.07568306475877762,
+      "learning_rate": 2.0674157303370788e-05,
+      "loss": 0.8061,
+      "step": 405
+    },
+    {
+      "epoch": 4.511111111111111,
+      "grad_norm": 0.06064269691705704,
+      "learning_rate": 2.0224719101123596e-05,
+      "loss": 0.7677,
+      "step": 406
+    },
+    {
+      "epoch": 4.522222222222222,
+      "grad_norm": 0.06697507947683334,
+      "learning_rate": 1.9775280898876404e-05,
+      "loss": 0.838,
+      "step": 407
+    },
+    {
+      "epoch": 4.533333333333333,
+      "grad_norm": 0.07317288964986801,
+      "learning_rate": 1.9325842696629215e-05,
+      "loss": 0.8156,
+      "step": 408
+    },
+    {
+      "epoch": 4.544444444444444,
+      "grad_norm": 0.06405317783355713,
+      "learning_rate": 1.8876404494382024e-05,
+      "loss": 0.8283,
+      "step": 409
+    },
+    {
+      "epoch": 4.555555555555555,
+      "grad_norm": 0.0716167539358139,
+      "learning_rate": 1.842696629213483e-05,
+      "loss": 0.8894,
+      "step": 410
+    },
+    {
+      "epoch": 4.566666666666666,
+      "grad_norm": 0.07410852611064911,
+      "learning_rate": 1.797752808988764e-05,
+      "loss": 0.7339,
+      "step": 411
+    },
+    {
+      "epoch": 4.5777777777777775,
+      "grad_norm": 0.06780914962291718,
+      "learning_rate": 1.752808988764045e-05,
+      "loss": 0.7925,
+      "step": 412
+    },
+    {
+      "epoch": 4.588888888888889,
+      "grad_norm": 0.065445177257061,
+      "learning_rate": 1.707865168539326e-05,
+      "loss": 0.8451,
+      "step": 413
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 0.06177813559770584,
+      "learning_rate": 1.662921348314607e-05,
+      "loss": 0.9403,
+      "step": 414
+    },
+    {
+      "epoch": 4.611111111111111,
+      "grad_norm": 0.0611334890127182,
+      "learning_rate": 1.6179775280898875e-05,
+      "loss": 0.8064,
+      "step": 415
+    },
+    {
+      "epoch": 4.622222222222222,
+      "grad_norm": 0.06584127992391586,
+      "learning_rate": 1.5730337078651687e-05,
+      "loss": 0.8344,
+      "step": 416
+    },
+    {
+      "epoch": 4.633333333333333,
+      "grad_norm": 0.06387150287628174,
+      "learning_rate": 1.5280898876404495e-05,
+      "loss": 0.8348,
+      "step": 417
+    },
+    {
+      "epoch": 4.644444444444445,
+      "grad_norm": 0.07291285693645477,
+      "learning_rate": 1.4831460674157305e-05,
+      "loss": 0.8291,
+      "step": 418
+    },
+    {
+      "epoch": 4.655555555555556,
+      "grad_norm": 0.06709293276071548,
+      "learning_rate": 1.4382022471910114e-05,
+      "loss": 0.7971,
+      "step": 419
+    },
+    {
+      "epoch": 4.666666666666667,
+      "grad_norm": 0.06627030670642853,
+      "learning_rate": 1.393258426966292e-05,
+      "loss": 0.7975,
+      "step": 420
+    },
+    {
+      "epoch": 4.677777777777778,
+      "grad_norm": 0.06093061715364456,
+      "learning_rate": 1.348314606741573e-05,
+      "loss": 0.7644,
+      "step": 421
+    },
+    {
+      "epoch": 4.688888888888889,
+      "grad_norm": 0.06513749808073044,
+      "learning_rate": 1.303370786516854e-05,
+      "loss": 0.8221,
+      "step": 422
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 0.06582172214984894,
+      "learning_rate": 1.258426966292135e-05,
+      "loss": 0.84,
+      "step": 423
+    },
+    {
+      "epoch": 4.711111111111111,
+      "grad_norm": 0.05726146325469017,
+      "learning_rate": 1.2134831460674158e-05,
+      "loss": 0.8009,
+      "step": 424
+    },
+    {
+      "epoch": 4.722222222222222,
+      "grad_norm": 0.060694370418787,
+      "learning_rate": 1.1685393258426968e-05,
+      "loss": 0.8697,
+      "step": 425
+    },
+    {
+      "epoch": 4.733333333333333,
+      "grad_norm": 0.0613742358982563,
+      "learning_rate": 1.1235955056179776e-05,
+      "loss": 0.8642,
+      "step": 426
+    },
+    {
+      "epoch": 4.7444444444444445,
+      "grad_norm": 0.07079113274812698,
+      "learning_rate": 1.0786516853932586e-05,
+      "loss": 0.7765,
+      "step": 427
+    },
+    {
+      "epoch": 4.7555555555555555,
+      "grad_norm": 0.058669183403253555,
+      "learning_rate": 1.0337078651685394e-05,
+      "loss": 0.8284,
+      "step": 428
+    },
+    {
+      "epoch": 4.766666666666667,
+      "grad_norm": 0.058568935841321945,
+      "learning_rate": 9.887640449438202e-06,
+      "loss": 0.8041,
+      "step": 429
+    },
+    {
+      "epoch": 4.777777777777778,
+      "grad_norm": 0.3323858678340912,
+      "learning_rate": 9.438202247191012e-06,
+      "loss": 0.7972,
+      "step": 430
+    },
+    {
+      "epoch": 4.788888888888889,
+      "grad_norm": 0.060114577412605286,
+      "learning_rate": 8.98876404494382e-06,
+      "loss": 0.918,
+      "step": 431
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.07039128243923187,
+      "learning_rate": 8.53932584269663e-06,
+      "loss": 0.7934,
+      "step": 432
+    },
+    {
+      "epoch": 4.811111111111111,
+      "grad_norm": 0.06390310078859329,
+      "learning_rate": 8.089887640449438e-06,
+      "loss": 0.8117,
+      "step": 433
+    },
+    {
+      "epoch": 4.822222222222222,
+      "grad_norm": 0.06396052241325378,
+      "learning_rate": 7.640449438202247e-06,
+      "loss": 0.7735,
+      "step": 434
+    },
+    {
+      "epoch": 4.833333333333333,
+      "grad_norm": 0.12294171750545502,
+      "learning_rate": 7.191011235955057e-06,
+      "loss": 0.8392,
+      "step": 435
+    },
+    {
+      "epoch": 4.844444444444444,
+      "grad_norm": 0.056867167353630066,
+      "learning_rate": 6.741573033707865e-06,
+      "loss": 0.7539,
+      "step": 436
+    },
+    {
+      "epoch": 4.855555555555555,
+      "grad_norm": 0.06927565485239029,
+      "learning_rate": 6.292134831460675e-06,
+      "loss": 0.7451,
+      "step": 437
+    },
+    {
+      "epoch": 4.866666666666667,
+      "grad_norm": 0.06880547106266022,
+      "learning_rate": 5.842696629213484e-06,
+      "loss": 0.799,
+      "step": 438
+    },
+    {
+      "epoch": 4.877777777777778,
+      "grad_norm": 0.07145966589450836,
+      "learning_rate": 5.393258426966293e-06,
+      "loss": 0.8621,
+      "step": 439
+    },
+    {
+      "epoch": 4.888888888888889,
+      "grad_norm": 0.06943900138139725,
+      "learning_rate": 4.943820224719101e-06,
+      "loss": 0.871,
+      "step": 440
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 0.14493992924690247,
+      "learning_rate": 4.49438202247191e-06,
+      "loss": 0.8927,
+      "step": 441
+    },
+    {
+      "epoch": 4.911111111111111,
+      "grad_norm": 0.060017552226781845,
+      "learning_rate": 4.044943820224719e-06,
+      "loss": 0.8933,
+      "step": 442
+    },
+    {
+      "epoch": 4.9222222222222225,
+      "grad_norm": 0.0633043721318245,
+      "learning_rate": 3.5955056179775286e-06,
+      "loss": 0.8325,
+      "step": 443
+    },
+    {
+      "epoch": 4.933333333333334,
+      "grad_norm": 0.06329839676618576,
+      "learning_rate": 3.1460674157303375e-06,
+      "loss": 0.7598,
+      "step": 444
+    },
+    {
+      "epoch": 4.944444444444445,
+      "grad_norm": 0.07061803340911865,
+      "learning_rate": 2.6966292134831465e-06,
+      "loss": 0.7973,
+      "step": 445
+    },
+    {
+      "epoch": 4.955555555555556,
+      "grad_norm": 0.06513971835374832,
+      "learning_rate": 2.247191011235955e-06,
+      "loss": 0.8717,
+      "step": 446
+    },
+    {
+      "epoch": 4.966666666666667,
+      "grad_norm": 0.07594099640846252,
+      "learning_rate": 1.7977528089887643e-06,
+      "loss": 0.7733,
+      "step": 447
+    },
+    {
+      "epoch": 4.977777777777778,
+      "grad_norm": 0.06038981303572655,
+      "learning_rate": 1.3483146067415732e-06,
+      "loss": 0.7951,
+      "step": 448
+    },
+    {
+      "epoch": 4.988888888888889,
+      "grad_norm": 0.061661411076784134,
+      "learning_rate": 8.988764044943822e-07,
+      "loss": 0.7623,
+      "step": 449
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.05862731486558914,
+      "learning_rate": 4.494382022471911e-07,
+      "loss": 0.819,
+      "step": 450
     }
   ],
   "logging_steps": 1,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 5.021161104605184e+18,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null