Training in progress, step 600, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1421 -5

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dddb1f59b2959121b12b50bb61b63842301b162d95ecbb055867d2afd083dc58
 size 239536272

 version https://git-lfs.github.com/spec/v1
+oid sha256:380a9f8bd1452df8063d91dc4f5608799c0eed96ed4cdc18292e7cb70a2346a6
 size 239536272

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8da9553519cebee2d524dee081101503a9ff9f4caeaa4df0ef8cb6a1e928f4c
 size 183010548

 version https://git-lfs.github.com/spec/v1
+oid sha256:ce3b58265d130261bec295db93a3ae710f09eb686a51c216c625ecac4a9bbd35
 size 183010548

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f99e922e9a1bb9d79225c975af051497d2174fcce841ca602f315829e5457fd7
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:84a73f7114eacaa8f3d0a34708cd3c380dc843298650a38ecf02f4e2db17ae9f
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4507382881947f75306f0e508c94993046ca897effd5134c9ce5479a6deef707
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:d4dd75a8923222c498d00e1167f4c1be1383dd919c1ebf448c63527342992a3c
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "best_metric": 0.7893620729446411,
-  "best_model_checkpoint": "miner_id_24/checkpoint-400",
-  "epoch": 0.06563429391857245,
   "eval_steps": 100,
-  "global_step": 400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2847,6 +2847,1422 @@
       "eval_samples_per_second": 3.45,
       "eval_steps_per_second": 3.45,
       "step": 400
     }
   ],
   "logging_steps": 1,
@@ -2875,7 +4291,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 5.574872376174182e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.7834404110908508,
+  "best_model_checkpoint": "miner_id_24/checkpoint-600",
+  "epoch": 0.09845144087785868,
   "eval_steps": 100,
+  "global_step": 600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 3.45,
       "eval_steps_per_second": 3.45,
       "step": 400
+    },
+    {
+      "epoch": 0.06579837965336889,
+      "grad_norm": 0.15986493229866028,
+      "learning_rate": 0.0001997741149322696,
+      "loss": 0.5572,
+      "step": 401
+    },
+    {
+      "epoch": 0.06596246538816532,
+      "grad_norm": 0.2333621084690094,
+      "learning_rate": 0.00019977295847037837,
+      "loss": 0.7397,
+      "step": 402
+    },
+    {
+      "epoch": 0.06612655112296174,
+      "grad_norm": 0.2305152714252472,
+      "learning_rate": 0.0001997717990590413,
+      "loss": 0.7124,
+      "step": 403
+    },
+    {
+      "epoch": 0.06629063685775818,
+      "grad_norm": 0.22249913215637207,
+      "learning_rate": 0.00019977063669829271,
+      "loss": 0.672,
+      "step": 404
+    },
+    {
+      "epoch": 0.06645472259255461,
+      "grad_norm": 0.23822623491287231,
+      "learning_rate": 0.00019976947138816695,
+      "loss": 0.7609,
+      "step": 405
+    },
+    {
+      "epoch": 0.06661880832735104,
+      "grad_norm": 0.4435589611530304,
+      "learning_rate": 0.00019976830312869848,
+      "loss": 0.7179,
+      "step": 406
+    },
+    {
+      "epoch": 0.06678289406214748,
+      "grad_norm": 8.647324562072754,
+      "learning_rate": 0.0001997671319199218,
+      "loss": 0.8875,
+      "step": 407
+    },
+    {
+      "epoch": 0.0669469797969439,
+      "grad_norm": 0.37018343806266785,
+      "learning_rate": 0.00019976595776187154,
+      "loss": 0.8438,
+      "step": 408
+    },
+    {
+      "epoch": 0.06711106553174033,
+      "grad_norm": 0.2479041963815689,
+      "learning_rate": 0.00019976478065458243,
+      "loss": 0.686,
+      "step": 409
+    },
+    {
+      "epoch": 0.06727515126653677,
+      "grad_norm": 0.2105472832918167,
+      "learning_rate": 0.00019976360059808927,
+      "loss": 0.7365,
+      "step": 410
+    },
+    {
+      "epoch": 0.0674392370013332,
+      "grad_norm": 0.30589988827705383,
+      "learning_rate": 0.00019976241759242692,
+      "loss": 0.8328,
+      "step": 411
+    },
+    {
+      "epoch": 0.06760332273612962,
+      "grad_norm": 0.24568642675876617,
+      "learning_rate": 0.0001997612316376304,
+      "loss": 0.8076,
+      "step": 412
+    },
+    {
+      "epoch": 0.06776740847092606,
+      "grad_norm": 0.22725167870521545,
+      "learning_rate": 0.00019976004273373468,
+      "loss": 0.6269,
+      "step": 413
+    },
+    {
+      "epoch": 0.06793149420572249,
+      "grad_norm": 0.30800437927246094,
+      "learning_rate": 0.00019975885088077499,
+      "loss": 0.685,
+      "step": 414
+    },
+    {
+      "epoch": 0.06809557994051892,
+      "grad_norm": 0.27556002140045166,
+      "learning_rate": 0.00019975765607878655,
+      "loss": 0.5931,
+      "step": 415
+    },
+    {
+      "epoch": 0.06825966567531536,
+      "grad_norm": 0.46520912647247314,
+      "learning_rate": 0.0001997564583278046,
+      "loss": 0.7079,
+      "step": 416
+    },
+    {
+      "epoch": 0.06842375141011178,
+      "grad_norm": 0.5119454264640808,
+      "learning_rate": 0.00019975525762786468,
+      "loss": 0.9523,
+      "step": 417
+    },
+    {
+      "epoch": 0.06858783714490821,
+      "grad_norm": 0.23642492294311523,
+      "learning_rate": 0.0001997540539790022,
+      "loss": 0.723,
+      "step": 418
+    },
+    {
+      "epoch": 0.06875192287970465,
+      "grad_norm": 0.3586544096469879,
+      "learning_rate": 0.0001997528473812527,
+      "loss": 0.864,
+      "step": 419
+    },
+    {
+      "epoch": 0.06891600861450108,
+      "grad_norm": 0.30147257447242737,
+      "learning_rate": 0.00019975163783465195,
+      "loss": 0.8027,
+      "step": 420
+    },
+    {
+      "epoch": 0.0690800943492975,
+      "grad_norm": 0.2807484269142151,
+      "learning_rate": 0.00019975042533923568,
+      "loss": 0.7956,
+      "step": 421
+    },
+    {
+      "epoch": 0.06924418008409394,
+      "grad_norm": 0.3472043573856354,
+      "learning_rate": 0.00019974920989503968,
+      "loss": 0.8275,
+      "step": 422
+    },
+    {
+      "epoch": 0.06940826581889037,
+      "grad_norm": 0.2816649377346039,
+      "learning_rate": 0.00019974799150209992,
+      "loss": 0.7084,
+      "step": 423
+    },
+    {
+      "epoch": 0.0695723515536868,
+      "grad_norm": 0.35236573219299316,
+      "learning_rate": 0.00019974677016045244,
+      "loss": 0.8452,
+      "step": 424
+    },
+    {
+      "epoch": 0.06973643728848324,
+      "grad_norm": 0.2761305868625641,
+      "learning_rate": 0.0001997455458701333,
+      "loss": 0.8538,
+      "step": 425
+    },
+    {
+      "epoch": 0.06990052302327966,
+      "grad_norm": 0.25361061096191406,
+      "learning_rate": 0.0001997443186311787,
+      "loss": 0.5883,
+      "step": 426
+    },
+    {
+      "epoch": 0.07006460875807609,
+      "grad_norm": 0.3191013038158417,
+      "learning_rate": 0.00019974308844362496,
+      "loss": 0.8699,
+      "step": 427
+    },
+    {
+      "epoch": 0.07022869449287253,
+      "grad_norm": 0.28123271465301514,
+      "learning_rate": 0.00019974185530750838,
+      "loss": 0.8439,
+      "step": 428
+    },
+    {
+      "epoch": 0.07039278022766896,
+      "grad_norm": 0.5743336081504822,
+      "learning_rate": 0.00019974061922286546,
+      "loss": 0.9119,
+      "step": 429
+    },
+    {
+      "epoch": 0.07055686596246538,
+      "grad_norm": 0.3127213418483734,
+      "learning_rate": 0.00019973938018973274,
+      "loss": 0.9264,
+      "step": 430
+    },
+    {
+      "epoch": 0.07072095169726182,
+      "grad_norm": 0.34507542848587036,
+      "learning_rate": 0.0001997381382081468,
+      "loss": 0.7802,
+      "step": 431
+    },
+    {
+      "epoch": 0.07088503743205825,
+      "grad_norm": 0.28120848536491394,
+      "learning_rate": 0.00019973689327814443,
+      "loss": 0.7031,
+      "step": 432
+    },
+    {
+      "epoch": 0.07104912316685467,
+      "grad_norm": 0.4257034659385681,
+      "learning_rate": 0.0001997356453997624,
+      "loss": 0.8775,
+      "step": 433
+    },
+    {
+      "epoch": 0.07121320890165111,
+      "grad_norm": 0.3616534173488617,
+      "learning_rate": 0.00019973439457303757,
+      "loss": 0.8013,
+      "step": 434
+    },
+    {
+      "epoch": 0.07137729463644754,
+      "grad_norm": 0.38141894340515137,
+      "learning_rate": 0.00019973314079800697,
+      "loss": 0.8423,
+      "step": 435
+    },
+    {
+      "epoch": 0.07154138037124398,
+      "grad_norm": 0.31338822841644287,
+      "learning_rate": 0.0001997318840747076,
+      "loss": 0.8506,
+      "step": 436
+    },
+    {
+      "epoch": 0.07170546610604041,
+      "grad_norm": 0.3113965690135956,
+      "learning_rate": 0.00019973062440317664,
+      "loss": 0.8534,
+      "step": 437
+    },
+    {
+      "epoch": 0.07186955184083683,
+      "grad_norm": 0.39697837829589844,
+      "learning_rate": 0.00019972936178345134,
+      "loss": 0.8224,
+      "step": 438
+    },
+    {
+      "epoch": 0.07203363757563327,
+      "grad_norm": 0.30266883969306946,
+      "learning_rate": 0.00019972809621556902,
+      "loss": 0.7729,
+      "step": 439
+    },
+    {
+      "epoch": 0.0721977233104297,
+      "grad_norm": 0.3997204601764679,
+      "learning_rate": 0.0001997268276995671,
+      "loss": 0.7422,
+      "step": 440
+    },
+    {
+      "epoch": 0.07236180904522613,
+      "grad_norm": 0.4488981366157532,
+      "learning_rate": 0.00019972555623548306,
+      "loss": 0.898,
+      "step": 441
+    },
+    {
+      "epoch": 0.07252589478002257,
+      "grad_norm": 0.3362194299697876,
+      "learning_rate": 0.0001997242818233545,
+      "loss": 0.8179,
+      "step": 442
+    },
+    {
+      "epoch": 0.072689980514819,
+      "grad_norm": 0.4512924551963806,
+      "learning_rate": 0.00019972300446321909,
+      "loss": 0.8867,
+      "step": 443
+    },
+    {
+      "epoch": 0.07285406624961542,
+      "grad_norm": 0.5159667134284973,
+      "learning_rate": 0.00019972172415511457,
+      "loss": 0.7261,
+      "step": 444
+    },
+    {
+      "epoch": 0.07301815198441186,
+      "grad_norm": 0.46027064323425293,
+      "learning_rate": 0.00019972044089907882,
+      "loss": 1.0376,
+      "step": 445
+    },
+    {
+      "epoch": 0.07318223771920829,
+      "grad_norm": 0.4172539710998535,
+      "learning_rate": 0.00019971915469514973,
+      "loss": 0.7398,
+      "step": 446
+    },
+    {
+      "epoch": 0.07334632345400471,
+      "grad_norm": 0.3354978561401367,
+      "learning_rate": 0.0001997178655433654,
+      "loss": 0.9241,
+      "step": 447
+    },
+    {
+      "epoch": 0.07351040918880115,
+      "grad_norm": 0.48640313744544983,
+      "learning_rate": 0.0001997165734437639,
+      "loss": 0.9889,
+      "step": 448
+    },
+    {
+      "epoch": 0.07367449492359758,
+      "grad_norm": 0.36919528245925903,
+      "learning_rate": 0.0001997152783963834,
+      "loss": 0.9512,
+      "step": 449
+    },
+    {
+      "epoch": 0.073838580658394,
+      "grad_norm": 0.410256028175354,
+      "learning_rate": 0.0001997139804012622,
+      "loss": 0.9793,
+      "step": 450
+    },
+    {
+      "epoch": 0.07400266639319045,
+      "grad_norm": 0.25387945771217346,
+      "learning_rate": 0.00019971267945843865,
+      "loss": 0.4638,
+      "step": 451
+    },
+    {
+      "epoch": 0.07416675212798687,
+      "grad_norm": 0.2956909239292145,
+      "learning_rate": 0.0001997113755679513,
+      "loss": 0.6796,
+      "step": 452
+    },
+    {
+      "epoch": 0.0743308378627833,
+      "grad_norm": 0.26147258281707764,
+      "learning_rate": 0.00019971006872983858,
+      "loss": 0.6444,
+      "step": 453
+    },
+    {
+      "epoch": 0.07449492359757974,
+      "grad_norm": 0.29692378640174866,
+      "learning_rate": 0.00019970875894413916,
+      "loss": 0.7089,
+      "step": 454
+    },
+    {
+      "epoch": 0.07465900933237617,
+      "grad_norm": 0.3056322932243347,
+      "learning_rate": 0.00019970744621089178,
+      "loss": 0.6574,
+      "step": 455
+    },
+    {
+      "epoch": 0.0748230950671726,
+      "grad_norm": 0.290412962436676,
+      "learning_rate": 0.00019970613053013527,
+      "loss": 0.7592,
+      "step": 456
+    },
+    {
+      "epoch": 0.07498718080196903,
+      "grad_norm": 0.2817339301109314,
+      "learning_rate": 0.00019970481190190843,
+      "loss": 0.8211,
+      "step": 457
+    },
+    {
+      "epoch": 0.07515126653676546,
+      "grad_norm": 0.2601938545703888,
+      "learning_rate": 0.00019970349032625035,
+      "loss": 0.7817,
+      "step": 458
+    },
+    {
+      "epoch": 0.07531535227156189,
+      "grad_norm": 0.30698585510253906,
+      "learning_rate": 0.00019970216580320003,
+      "loss": 0.6768,
+      "step": 459
+    },
+    {
+      "epoch": 0.07547943800635833,
+      "grad_norm": 0.27613088488578796,
+      "learning_rate": 0.00019970083833279666,
+      "loss": 0.7026,
+      "step": 460
+    },
+    {
+      "epoch": 0.07564352374115475,
+      "grad_norm": 0.2695368826389313,
+      "learning_rate": 0.00019969950791507942,
+      "loss": 0.7914,
+      "step": 461
+    },
+    {
+      "epoch": 0.07580760947595118,
+      "grad_norm": 0.2841225266456604,
+      "learning_rate": 0.0001996981745500877,
+      "loss": 0.7972,
+      "step": 462
+    },
+    {
+      "epoch": 0.07597169521074762,
+      "grad_norm": 0.25151097774505615,
+      "learning_rate": 0.00019969683823786093,
+      "loss": 0.6287,
+      "step": 463
+    },
+    {
+      "epoch": 0.07613578094554405,
+      "grad_norm": 0.2680940330028534,
+      "learning_rate": 0.00019969549897843857,
+      "loss": 0.6865,
+      "step": 464
+    },
+    {
+      "epoch": 0.07629986668034047,
+      "grad_norm": 0.2942931354045868,
+      "learning_rate": 0.0001996941567718602,
+      "loss": 0.7175,
+      "step": 465
+    },
+    {
+      "epoch": 0.07646395241513691,
+      "grad_norm": 0.33051207661628723,
+      "learning_rate": 0.00019969281161816556,
+      "loss": 0.8518,
+      "step": 466
+    },
+    {
+      "epoch": 0.07662803814993334,
+      "grad_norm": 0.28793030977249146,
+      "learning_rate": 0.00019969146351739436,
+      "loss": 0.806,
+      "step": 467
+    },
+    {
+      "epoch": 0.07679212388472977,
+      "grad_norm": 0.27401405572891235,
+      "learning_rate": 0.00019969011246958647,
+      "loss": 0.7563,
+      "step": 468
+    },
+    {
+      "epoch": 0.0769562096195262,
+      "grad_norm": 0.28435268998146057,
+      "learning_rate": 0.00019968875847478184,
+      "loss": 0.8319,
+      "step": 469
+    },
+    {
+      "epoch": 0.07712029535432263,
+      "grad_norm": 0.37528687715530396,
+      "learning_rate": 0.00019968740153302047,
+      "loss": 0.7616,
+      "step": 470
+    },
+    {
+      "epoch": 0.07728438108911906,
+      "grad_norm": 0.3924008309841156,
+      "learning_rate": 0.00019968604164434246,
+      "loss": 0.8162,
+      "step": 471
+    },
+    {
+      "epoch": 0.0774484668239155,
+      "grad_norm": 0.3189226984977722,
+      "learning_rate": 0.0001996846788087881,
+      "loss": 0.718,
+      "step": 472
+    },
+    {
+      "epoch": 0.07761255255871193,
+      "grad_norm": 0.2787996828556061,
+      "learning_rate": 0.0001996833130263976,
+      "loss": 0.7551,
+      "step": 473
+    },
+    {
+      "epoch": 0.07777663829350835,
+      "grad_norm": 0.353397935628891,
+      "learning_rate": 0.0001996819442972113,
+      "loss": 0.6889,
+      "step": 474
+    },
+    {
+      "epoch": 0.07794072402830479,
+      "grad_norm": 0.28389084339141846,
+      "learning_rate": 0.00019968057262126972,
+      "loss": 0.8333,
+      "step": 475
+    },
+    {
+      "epoch": 0.07810480976310122,
+      "grad_norm": 0.2652452886104584,
+      "learning_rate": 0.00019967919799861346,
+      "loss": 0.7011,
+      "step": 476
+    },
+    {
+      "epoch": 0.07826889549789764,
+      "grad_norm": 0.2802514135837555,
+      "learning_rate": 0.00019967782042928307,
+      "loss": 0.7594,
+      "step": 477
+    },
+    {
+      "epoch": 0.07843298123269409,
+      "grad_norm": 0.4574797451496124,
+      "learning_rate": 0.0001996764399133193,
+      "loss": 0.7706,
+      "step": 478
+    },
+    {
+      "epoch": 0.07859706696749051,
+      "grad_norm": 0.3003863990306854,
+      "learning_rate": 0.00019967505645076294,
+      "loss": 0.7685,
+      "step": 479
+    },
+    {
+      "epoch": 0.07876115270228695,
+      "grad_norm": 0.3658941388130188,
+      "learning_rate": 0.0001996736700416549,
+      "loss": 0.7536,
+      "step": 480
+    },
+    {
+      "epoch": 0.07892523843708338,
+      "grad_norm": 0.3780055344104767,
+      "learning_rate": 0.00019967228068603623,
+      "loss": 0.7793,
+      "step": 481
+    },
+    {
+      "epoch": 0.0790893241718798,
+      "grad_norm": 0.3696160316467285,
+      "learning_rate": 0.00019967088838394789,
+      "loss": 0.8083,
+      "step": 482
+    },
+    {
+      "epoch": 0.07925340990667624,
+      "grad_norm": 0.3387402296066284,
+      "learning_rate": 0.00019966949313543112,
+      "loss": 0.8436,
+      "step": 483
+    },
+    {
+      "epoch": 0.07941749564147267,
+      "grad_norm": 0.3094363212585449,
+      "learning_rate": 0.00019966809494052713,
+      "loss": 0.7768,
+      "step": 484
+    },
+    {
+      "epoch": 0.0795815813762691,
+      "grad_norm": 0.3164433240890503,
+      "learning_rate": 0.00019966669379927726,
+      "loss": 0.8517,
+      "step": 485
+    },
+    {
+      "epoch": 0.07974566711106554,
+      "grad_norm": 0.3255369961261749,
+      "learning_rate": 0.00019966528971172295,
+      "loss": 0.8573,
+      "step": 486
+    },
+    {
+      "epoch": 0.07990975284586196,
+      "grad_norm": 0.31875723600387573,
+      "learning_rate": 0.00019966388267790566,
+      "loss": 0.8253,
+      "step": 487
+    },
+    {
+      "epoch": 0.08007383858065839,
+      "grad_norm": 0.32112181186676025,
+      "learning_rate": 0.00019966247269786701,
+      "loss": 0.9794,
+      "step": 488
+    },
+    {
+      "epoch": 0.08023792431545483,
+      "grad_norm": 0.34695595502853394,
+      "learning_rate": 0.00019966105977164872,
+      "loss": 0.7167,
+      "step": 489
+    },
+    {
+      "epoch": 0.08040201005025126,
+      "grad_norm": 0.33684611320495605,
+      "learning_rate": 0.0001996596438992925,
+      "loss": 0.8618,
+      "step": 490
+    },
+    {
+      "epoch": 0.08056609578504768,
+      "grad_norm": 0.3515377342700958,
+      "learning_rate": 0.00019965822508084022,
+      "loss": 0.7417,
+      "step": 491
+    },
+    {
+      "epoch": 0.08073018151984412,
+      "grad_norm": 0.45770692825317383,
+      "learning_rate": 0.00019965680331633382,
+      "loss": 0.7696,
+      "step": 492
+    },
+    {
+      "epoch": 0.08089426725464055,
+      "grad_norm": 0.35616007447242737,
+      "learning_rate": 0.00019965537860581537,
+      "loss": 0.9678,
+      "step": 493
+    },
+    {
+      "epoch": 0.08105835298943698,
+      "grad_norm": 0.5278313159942627,
+      "learning_rate": 0.00019965395094932693,
+      "loss": 0.8417,
+      "step": 494
+    },
+    {
+      "epoch": 0.08122243872423342,
+      "grad_norm": 0.40865620970726013,
+      "learning_rate": 0.00019965252034691075,
+      "loss": 0.7845,
+      "step": 495
+    },
+    {
+      "epoch": 0.08138652445902984,
+      "grad_norm": 0.37308233976364136,
+      "learning_rate": 0.0001996510867986091,
+      "loss": 0.8607,
+      "step": 496
+    },
+    {
+      "epoch": 0.08155061019382627,
+      "grad_norm": 0.5070668458938599,
+      "learning_rate": 0.00019964965030446434,
+      "loss": 0.8668,
+      "step": 497
+    },
+    {
+      "epoch": 0.08171469592862271,
+      "grad_norm": 0.40923750400543213,
+      "learning_rate": 0.00019964821086451896,
+      "loss": 0.9865,
+      "step": 498
+    },
+    {
+      "epoch": 0.08187878166341914,
+      "grad_norm": 0.42455339431762695,
+      "learning_rate": 0.00019964676847881551,
+      "loss": 1.0331,
+      "step": 499
+    },
+    {
+      "epoch": 0.08204286739821556,
+      "grad_norm": 0.5710748434066772,
+      "learning_rate": 0.00019964532314739662,
+      "loss": 0.9817,
+      "step": 500
+    },
+    {
+      "epoch": 0.08204286739821556,
+      "eval_loss": 0.7902427911758423,
+      "eval_runtime": 70.7583,
+      "eval_samples_per_second": 3.462,
+      "eval_steps_per_second": 3.462,
+      "step": 500
+    },
+    {
+      "epoch": 0.082206953133012,
+      "grad_norm": 0.2542221248149872,
+      "learning_rate": 0.00019964387487030503,
+      "loss": 0.544,
+      "step": 501
+    },
+    {
+      "epoch": 0.08237103886780843,
+      "grad_norm": 0.2645345628261566,
+      "learning_rate": 0.00019964242364758355,
+      "loss": 0.5941,
+      "step": 502
+    },
+    {
+      "epoch": 0.08253512460260486,
+      "grad_norm": 0.24813871085643768,
+      "learning_rate": 0.00019964096947927508,
+      "loss": 0.6608,
+      "step": 503
+    },
+    {
+      "epoch": 0.0826992103374013,
+      "grad_norm": 0.24130459129810333,
+      "learning_rate": 0.00019963951236542257,
+      "loss": 0.637,
+      "step": 504
+    },
+    {
+      "epoch": 0.08286329607219772,
+      "grad_norm": 0.3400072157382965,
+      "learning_rate": 0.00019963805230606915,
+      "loss": 0.638,
+      "step": 505
+    },
+    {
+      "epoch": 0.08302738180699415,
+      "grad_norm": 0.2924500107765198,
+      "learning_rate": 0.00019963658930125794,
+      "loss": 0.7683,
+      "step": 506
+    },
+    {
+      "epoch": 0.08319146754179059,
+      "grad_norm": 0.47059324383735657,
+      "learning_rate": 0.00019963512335103222,
+      "loss": 0.7232,
+      "step": 507
+    },
+    {
+      "epoch": 0.08335555327658702,
+      "grad_norm": 0.29273051023483276,
+      "learning_rate": 0.00019963365445543532,
+      "loss": 0.7313,
+      "step": 508
+    },
+    {
+      "epoch": 0.08351963901138344,
+      "grad_norm": 0.30427420139312744,
+      "learning_rate": 0.00019963218261451066,
+      "loss": 0.7675,
+      "step": 509
+    },
+    {
+      "epoch": 0.08368372474617988,
+      "grad_norm": 0.27146315574645996,
+      "learning_rate": 0.00019963070782830173,
+      "loss": 0.6338,
+      "step": 510
+    },
+    {
+      "epoch": 0.08384781048097631,
+      "grad_norm": 0.26816409826278687,
+      "learning_rate": 0.00019962923009685216,
+      "loss": 0.692,
+      "step": 511
+    },
+    {
+      "epoch": 0.08401189621577274,
+      "grad_norm": 0.2671113610267639,
+      "learning_rate": 0.0001996277494202056,
+      "loss": 0.7029,
+      "step": 512
+    },
+    {
+      "epoch": 0.08417598195056918,
+      "grad_norm": 0.26612916588783264,
+      "learning_rate": 0.00019962626579840583,
+      "loss": 0.8666,
+      "step": 513
+    },
+    {
+      "epoch": 0.0843400676853656,
+      "grad_norm": 0.2911565601825714,
+      "learning_rate": 0.00019962477923149674,
+      "loss": 0.6847,
+      "step": 514
+    },
+    {
+      "epoch": 0.08450415342016203,
+      "grad_norm": 0.2518673241138458,
+      "learning_rate": 0.00019962328971952225,
+      "loss": 0.6946,
+      "step": 515
+    },
+    {
+      "epoch": 0.08466823915495847,
+      "grad_norm": 0.32353517413139343,
+      "learning_rate": 0.0001996217972625264,
+      "loss": 0.7648,
+      "step": 516
+    },
+    {
+      "epoch": 0.0848323248897549,
+      "grad_norm": 0.33887815475463867,
+      "learning_rate": 0.00019962030186055328,
+      "loss": 0.74,
+      "step": 517
+    },
+    {
+      "epoch": 0.08499641062455132,
+      "grad_norm": 0.30510279536247253,
+      "learning_rate": 0.00019961880351364712,
+      "loss": 0.8442,
+      "step": 518
+    },
+    {
+      "epoch": 0.08516049635934776,
+      "grad_norm": 0.4953973889350891,
+      "learning_rate": 0.00019961730222185225,
+      "loss": 0.7548,
+      "step": 519
+    },
+    {
+      "epoch": 0.08532458209414419,
+      "grad_norm": 0.3338676393032074,
+      "learning_rate": 0.00019961579798521297,
+      "loss": 0.7259,
+      "step": 520
+    },
+    {
+      "epoch": 0.08548866782894061,
+      "grad_norm": 0.29042428731918335,
+      "learning_rate": 0.0001996142908037738,
+      "loss": 0.8281,
+      "step": 521
+    },
+    {
+      "epoch": 0.08565275356373706,
+      "grad_norm": 0.30394816398620605,
+      "learning_rate": 0.0001996127806775793,
+      "loss": 0.6939,
+      "step": 522
+    },
+    {
+      "epoch": 0.08581683929853348,
+      "grad_norm": 0.2836878001689911,
+      "learning_rate": 0.0001996112676066741,
+      "loss": 0.6618,
+      "step": 523
+    },
+    {
+      "epoch": 0.08598092503332991,
+      "grad_norm": 0.33087509870529175,
+      "learning_rate": 0.00019960975159110295,
+      "loss": 0.7448,
+      "step": 524
+    },
+    {
+      "epoch": 0.08614501076812635,
+      "grad_norm": 0.26727747917175293,
+      "learning_rate": 0.0001996082326309106,
+      "loss": 0.693,
+      "step": 525
+    },
+    {
+      "epoch": 0.08630909650292277,
+      "grad_norm": 0.2853117287158966,
+      "learning_rate": 0.000199606710726142,
+      "loss": 0.7681,
+      "step": 526
+    },
+    {
+      "epoch": 0.08647318223771921,
+      "grad_norm": 0.5314649343490601,
+      "learning_rate": 0.00019960518587684213,
+      "loss": 0.6646,
+      "step": 527
+    },
+    {
+      "epoch": 0.08663726797251564,
+      "grad_norm": 0.33825939893722534,
+      "learning_rate": 0.00019960365808305609,
+      "loss": 0.7447,
+      "step": 528
+    },
+    {
+      "epoch": 0.08680135370731207,
+      "grad_norm": 0.3486720323562622,
+      "learning_rate": 0.00019960212734482902,
+      "loss": 0.7324,
+      "step": 529
+    },
+    {
+      "epoch": 0.08696543944210851,
+      "grad_norm": 0.44422647356987,
+      "learning_rate": 0.00019960059366220617,
+      "loss": 0.798,
+      "step": 530
+    },
+    {
+      "epoch": 0.08712952517690493,
+      "grad_norm": 0.2957271635532379,
+      "learning_rate": 0.00019959905703523288,
+      "loss": 0.7875,
+      "step": 531
+    },
+    {
+      "epoch": 0.08729361091170136,
+      "grad_norm": 0.32711222767829895,
+      "learning_rate": 0.00019959751746395461,
+      "loss": 0.8314,
+      "step": 532
+    },
+    {
+      "epoch": 0.0874576966464978,
+      "grad_norm": 0.3585834205150604,
+      "learning_rate": 0.00019959597494841681,
+      "loss": 0.81,
+      "step": 533
+    },
+    {
+      "epoch": 0.08762178238129423,
+      "grad_norm": 0.4582952857017517,
+      "learning_rate": 0.00019959442948866513,
+      "loss": 0.8199,
+      "step": 534
+    },
+    {
+      "epoch": 0.08778586811609065,
+      "grad_norm": 0.3768620491027832,
+      "learning_rate": 0.00019959288108474527,
+      "loss": 0.7852,
+      "step": 535
+    },
+    {
+      "epoch": 0.0879499538508871,
+      "grad_norm": 0.37209224700927734,
+      "learning_rate": 0.00019959132973670292,
+      "loss": 0.8902,
+      "step": 536
+    },
+    {
+      "epoch": 0.08811403958568352,
+      "grad_norm": 0.3185194730758667,
+      "learning_rate": 0.00019958977544458402,
+      "loss": 0.7754,
+      "step": 537
+    },
+    {
+      "epoch": 0.08827812532047995,
+      "grad_norm": 0.32522639632225037,
+      "learning_rate": 0.00019958821820843448,
+      "loss": 0.8702,
+      "step": 538
+    },
+    {
+      "epoch": 0.08844221105527639,
+      "grad_norm": 0.33792081475257874,
+      "learning_rate": 0.00019958665802830036,
+      "loss": 0.7749,
+      "step": 539
+    },
+    {
+      "epoch": 0.08860629679007281,
+      "grad_norm": 0.3337690830230713,
+      "learning_rate": 0.0001995850949042277,
+      "loss": 0.8281,
+      "step": 540
+    },
+    {
+      "epoch": 0.08877038252486924,
+      "grad_norm": 0.3441987931728363,
+      "learning_rate": 0.00019958352883626284,
+      "loss": 0.7142,
+      "step": 541
+    },
+    {
+      "epoch": 0.08893446825966568,
+      "grad_norm": 0.42077013850212097,
+      "learning_rate": 0.00019958195982445199,
+      "loss": 0.7891,
+      "step": 542
+    },
+    {
+      "epoch": 0.0890985539944621,
+      "grad_norm": 0.4405931830406189,
+      "learning_rate": 0.00019958038786884155,
+      "loss": 0.9158,
+      "step": 543
+    },
+    {
+      "epoch": 0.08926263972925853,
+      "grad_norm": 0.3358551859855652,
+      "learning_rate": 0.00019957881296947798,
+      "loss": 0.6941,
+      "step": 544
+    },
+    {
+      "epoch": 0.08942672546405497,
+      "grad_norm": 0.39642512798309326,
+      "learning_rate": 0.00019957723512640784,
+      "loss": 0.7531,
+      "step": 545
+    },
+    {
+      "epoch": 0.0895908111988514,
+      "grad_norm": 0.428423672914505,
+      "learning_rate": 0.0001995756543396778,
+      "loss": 0.8928,
+      "step": 546
+    },
+    {
+      "epoch": 0.08975489693364783,
+      "grad_norm": 0.49737995862960815,
+      "learning_rate": 0.00019957407060933457,
+      "loss": 0.8893,
+      "step": 547
+    },
+    {
+      "epoch": 0.08991898266844427,
+      "grad_norm": 0.45987364649772644,
+      "learning_rate": 0.00019957248393542498,
+      "loss": 0.6813,
+      "step": 548
+    },
+    {
+      "epoch": 0.0900830684032407,
+      "grad_norm": 0.5005274415016174,
+      "learning_rate": 0.0001995708943179959,
+      "loss": 0.7574,
+      "step": 549
+    },
+    {
+      "epoch": 0.09024715413803712,
+      "grad_norm": 0.6577406525611877,
+      "learning_rate": 0.00019956930175709436,
+      "loss": 0.9468,
+      "step": 550
+    },
+    {
+      "epoch": 0.09041123987283356,
+      "grad_norm": 0.30720874667167664,
+      "learning_rate": 0.0001995677062527674,
+      "loss": 0.6136,
+      "step": 551
+    },
+    {
+      "epoch": 0.09057532560762999,
+      "grad_norm": 0.24331681430339813,
+      "learning_rate": 0.00019956610780506222,
+      "loss": 0.6703,
+      "step": 552
+    },
+    {
+      "epoch": 0.09073941134242641,
+      "grad_norm": 0.24991828203201294,
+      "learning_rate": 0.00019956450641402609,
+      "loss": 0.6179,
+      "step": 553
+    },
+    {
+      "epoch": 0.09090349707722285,
+      "grad_norm": 0.26630428433418274,
+      "learning_rate": 0.0001995629020797063,
+      "loss": 0.9014,
+      "step": 554
+    },
+    {
+      "epoch": 0.09106758281201928,
+      "grad_norm": 0.2855633795261383,
+      "learning_rate": 0.00019956129480215026,
+      "loss": 0.6828,
+      "step": 555
+    },
+    {
+      "epoch": 0.0912316685468157,
+      "grad_norm": 0.292644739151001,
+      "learning_rate": 0.00019955968458140557,
+      "loss": 0.6975,
+      "step": 556
+    },
+    {
+      "epoch": 0.09139575428161215,
+      "grad_norm": 0.3429616689682007,
+      "learning_rate": 0.00019955807141751975,
+      "loss": 0.7558,
+      "step": 557
+    },
+    {
+      "epoch": 0.09155984001640857,
+      "grad_norm": 0.3144456744194031,
+      "learning_rate": 0.00019955645531054056,
+      "loss": 0.6571,
+      "step": 558
+    },
+    {
+      "epoch": 0.091723925751205,
+      "grad_norm": 0.2671104669570923,
+      "learning_rate": 0.0001995548362605157,
+      "loss": 0.7479,
+      "step": 559
+    },
+    {
+      "epoch": 0.09188801148600144,
+      "grad_norm": 0.31418734788894653,
+      "learning_rate": 0.0001995532142674931,
+      "loss": 0.6673,
+      "step": 560
+    },
+    {
+      "epoch": 0.09205209722079787,
+      "grad_norm": 0.3422031104564667,
+      "learning_rate": 0.00019955158933152064,
+      "loss": 0.7965,
+      "step": 561
+    },
+    {
+      "epoch": 0.09221618295559429,
+      "grad_norm": 0.2794967591762543,
+      "learning_rate": 0.00019954996145264643,
+      "loss": 0.716,
+      "step": 562
+    },
+    {
+      "epoch": 0.09238026869039073,
+      "grad_norm": 0.39105239510536194,
+      "learning_rate": 0.00019954833063091853,
+      "loss": 0.6283,
+      "step": 563
+    },
+    {
+      "epoch": 0.09254435442518716,
+      "grad_norm": 0.3515319526195526,
+      "learning_rate": 0.00019954669686638514,
+      "loss": 0.7045,
+      "step": 564
+    },
+    {
+      "epoch": 0.09270844015998359,
+      "grad_norm": 0.32754653692245483,
+      "learning_rate": 0.00019954506015909464,
+      "loss": 0.7455,
+      "step": 565
+    },
+    {
+      "epoch": 0.09287252589478003,
+      "grad_norm": 0.38944128155708313,
+      "learning_rate": 0.00019954342050909534,
+      "loss": 0.7544,
+      "step": 566
+    },
+    {
+      "epoch": 0.09303661162957645,
+      "grad_norm": 0.32821983098983765,
+      "learning_rate": 0.00019954177791643574,
+      "loss": 0.6731,
+      "step": 567
+    },
+    {
+      "epoch": 0.09320069736437288,
+      "grad_norm": 0.34350642561912537,
+      "learning_rate": 0.00019954013238116438,
+      "loss": 0.8399,
+      "step": 568
+    },
+    {
+      "epoch": 0.09336478309916932,
+      "grad_norm": 0.33525383472442627,
+      "learning_rate": 0.00019953848390332992,
+      "loss": 0.8189,
+      "step": 569
+    },
+    {
+      "epoch": 0.09352886883396574,
+      "grad_norm": 0.35323095321655273,
+      "learning_rate": 0.00019953683248298106,
+      "loss": 0.8978,
+      "step": 570
+    },
+    {
+      "epoch": 0.09369295456876219,
+      "grad_norm": 0.34625014662742615,
+      "learning_rate": 0.0001995351781201667,
+      "loss": 0.9358,
+      "step": 571
+    },
+    {
+      "epoch": 0.09385704030355861,
+      "grad_norm": 0.34705492854118347,
+      "learning_rate": 0.00019953352081493567,
+      "loss": 0.6916,
+      "step": 572
+    },
+    {
+      "epoch": 0.09402112603835504,
+      "grad_norm": 0.3524523675441742,
+      "learning_rate": 0.00019953186056733698,
+      "loss": 0.8914,
+      "step": 573
+    },
+    {
+      "epoch": 0.09418521177315148,
+      "grad_norm": 0.3258850574493408,
+      "learning_rate": 0.00019953019737741973,
+      "loss": 0.798,
+      "step": 574
+    },
+    {
+      "epoch": 0.0943492975079479,
+      "grad_norm": 0.34496602416038513,
+      "learning_rate": 0.00019952853124523306,
+      "loss": 0.7979,
+      "step": 575
+    },
+    {
+      "epoch": 0.09451338324274433,
+      "grad_norm": 0.3083381652832031,
+      "learning_rate": 0.00019952686217082621,
+      "loss": 0.8979,
+      "step": 576
+    },
+    {
+      "epoch": 0.09467746897754077,
+      "grad_norm": 0.3396521508693695,
+      "learning_rate": 0.0001995251901542486,
+      "loss": 0.9223,
+      "step": 577
+    },
+    {
+      "epoch": 0.0948415547123372,
+      "grad_norm": 0.30544814467430115,
+      "learning_rate": 0.00019952351519554956,
+      "loss": 0.9028,
+      "step": 578
+    },
+    {
+      "epoch": 0.09500564044713362,
+      "grad_norm": 0.4109005033969879,
+      "learning_rate": 0.00019952183729477865,
+      "loss": 0.7769,
+      "step": 579
+    },
+    {
+      "epoch": 0.09516972618193006,
+      "grad_norm": 0.40897905826568604,
+      "learning_rate": 0.00019952015645198547,
+      "loss": 0.6695,
+      "step": 580
+    },
+    {
+      "epoch": 0.09533381191672649,
+      "grad_norm": 0.4280332624912262,
+      "learning_rate": 0.0001995184726672197,
+      "loss": 0.8043,
+      "step": 581
+    },
+    {
+      "epoch": 0.09549789765152292,
+      "grad_norm": 0.38212618231773376,
+      "learning_rate": 0.00019951678594053114,
+      "loss": 0.8747,
+      "step": 582
+    },
+    {
+      "epoch": 0.09566198338631936,
+      "grad_norm": 0.3472382128238678,
+      "learning_rate": 0.0001995150962719696,
+      "loss": 0.8115,
+      "step": 583
+    },
+    {
+      "epoch": 0.09582606912111578,
+      "grad_norm": 0.43354499340057373,
+      "learning_rate": 0.0001995134036615851,
+      "loss": 0.7358,
+      "step": 584
+    },
+    {
+      "epoch": 0.09599015485591221,
+      "grad_norm": 0.40974995493888855,
+      "learning_rate": 0.00019951170810942763,
+      "loss": 0.8083,
+      "step": 585
+    },
+    {
+      "epoch": 0.09615424059070865,
+      "grad_norm": 0.33038821816444397,
+      "learning_rate": 0.00019951000961554728,
+      "loss": 0.8238,
+      "step": 586
+    },
+    {
+      "epoch": 0.09631832632550508,
+      "grad_norm": 0.32848745584487915,
+      "learning_rate": 0.00019950830817999434,
+      "loss": 0.7763,
+      "step": 587
+    },
+    {
+      "epoch": 0.0964824120603015,
+      "grad_norm": 0.434444397687912,
+      "learning_rate": 0.0001995066038028191,
+      "loss": 0.7627,
+      "step": 588
+    },
+    {
+      "epoch": 0.09664649779509794,
+      "grad_norm": 0.5674214959144592,
+      "learning_rate": 0.00019950489648407188,
+      "loss": 0.8407,
+      "step": 589
+    },
+    {
+      "epoch": 0.09681058352989437,
+      "grad_norm": 0.49122345447540283,
+      "learning_rate": 0.00019950318622380316,
+      "loss": 0.8482,
+      "step": 590
+    },
+    {
+      "epoch": 0.0969746692646908,
+      "grad_norm": 0.40209364891052246,
+      "learning_rate": 0.00019950147302206353,
+      "loss": 0.8458,
+      "step": 591
+    },
+    {
+      "epoch": 0.09713875499948724,
+      "grad_norm": 0.4982354938983917,
+      "learning_rate": 0.00019949975687890366,
+      "loss": 0.9152,
+      "step": 592
+    },
+    {
+      "epoch": 0.09730284073428366,
+      "grad_norm": 0.4886113703250885,
+      "learning_rate": 0.00019949803779437426,
+      "loss": 0.8261,
+      "step": 593
+    },
+    {
+      "epoch": 0.09746692646908009,
+      "grad_norm": 0.3699701130390167,
+      "learning_rate": 0.0001994963157685261,
+      "loss": 0.8339,
+      "step": 594
+    },
+    {
+      "epoch": 0.09763101220387653,
+      "grad_norm": 0.3579908013343811,
+      "learning_rate": 0.00019949459080141014,
+      "loss": 0.7867,
+      "step": 595
+    },
+    {
+      "epoch": 0.09779509793867296,
+      "grad_norm": 0.3716687262058258,
+      "learning_rate": 0.00019949286289307737,
+      "loss": 0.7762,
+      "step": 596
+    },
+    {
+      "epoch": 0.09795918367346938,
+      "grad_norm": 0.5843008160591125,
+      "learning_rate": 0.00019949113204357883,
+      "loss": 0.9435,
+      "step": 597
+    },
+    {
+      "epoch": 0.09812326940826582,
+      "grad_norm": 0.5935463309288025,
+      "learning_rate": 0.00019948939825296572,
+      "loss": 0.7525,
+      "step": 598
+    },
+    {
+      "epoch": 0.09828735514306225,
+      "grad_norm": 0.6296449303627014,
+      "learning_rate": 0.00019948766152128928,
+      "loss": 0.8801,
+      "step": 599
+    },
+    {
+      "epoch": 0.09845144087785868,
+      "grad_norm": 0.7150892615318298,
+      "learning_rate": 0.00019948592184860088,
+      "loss": 0.7897,
+      "step": 600
+    },
+    {
+      "epoch": 0.09845144087785868,
+      "eval_loss": 0.7834404110908508,
+      "eval_runtime": 70.8563,
+      "eval_samples_per_second": 3.458,
+      "eval_steps_per_second": 3.458,
+      "step": 600
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 8.361437762578022e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null