Training in progress, step 18000, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/optimizer.pt +1 -1
last-checkpoint/pytorch_model.bin +1 -1
last-checkpoint/rng_state_0.pth +1 -1
last-checkpoint/rng_state_1.pth +1 -1
last-checkpoint/rng_state_2.pth +1 -1
last-checkpoint/rng_state_3.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1403 -3

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:812fcda5061d7de4a7da92cdd6711ec2afbf835c8020238eba1a87fa2647c8b6
 size 487156538

 version https://git-lfs.github.com/spec/v1
+oid sha256:927ded2136161debbd96279849965288bf7431b685070e843239f647821d61f6
 size 487156538

last-checkpoint/pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c224c261add98fe0339d147148fc0a76e42c8e24310c6dd432321ae3f1dadb74
 size 1059459406

 version https://git-lfs.github.com/spec/v1
+oid sha256:5d903e8e590da474361b4050df7262a9b6e838c971d4765f118a1f9c5c121e79
 size 1059459406

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4c704a47dcf46cac382d90ff8e0624392a2b656e11bcc3992f398d84d0ebb82
 size 14960

 version https://git-lfs.github.com/spec/v1
+oid sha256:9f63df0717d7d403241658b9d1ef68a022304bfea4ed08ee6b9ae2a0e774deb9
 size 14960

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d9cda69328bbe0d0997fad8adf0fffaeeb59070f39dc23aab395cd9ad7c9d13
 size 14960

 version https://git-lfs.github.com/spec/v1
+oid sha256:3124bc54b5cf99f9af69a72fc0ea506085633799107751230612ddfb52753447
 size 14960

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:123869f1503646ce31a03e53b8dd8e7f653031cf2bb8e77517beb9d61807080c
 size 14960

 version https://git-lfs.github.com/spec/v1
+oid sha256:3754b63410489156d62387b5a35a069ce6d7e085a76b453cde9b242ce0aa0610
 size 14960

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b87cc1de1a8a392429e07dce46be36f67138e50a910364574606aaf5bb967b69
 size 14960

 version https://git-lfs.github.com/spec/v1
+oid sha256:93cbb7d637d48f06226dfd537b7544f80d61c3f2631cc41c955ddf30bc5b0a70
 size 14960

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ccc2a52ae0327def30cc40f7f273a4a1537961b9b580753fe57ec7ecdab69b35
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.031199574905791908,
   "eval_steps": 500,
-  "global_step": 16000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -11208,6 +11208,1406 @@
       "learning_rate": 0.0004949612511467957,
       "loss": 16.6007,
       "step": 16000
     }
   ],
   "logging_steps": 10,
@@ -11227,7 +12627,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.559853212372566e+19,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.035099521769015894,
   "eval_steps": 500,
+  "global_step": 18000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.0004949612511467957,
       "loss": 16.6007,
       "step": 16000
+    },
+    {
+      "epoch": 0.03121907464010803,
+      "grad_norm": 7.5,
+      "learning_rate": 0.000494958000131341,
+      "loss": 16.6753,
+      "step": 16010
+    },
+    {
+      "epoch": 0.03123857437442415,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.0004949547491158863,
+      "loss": 16.6359,
+      "step": 16020
+    },
+    {
+      "epoch": 0.03125807410874027,
+      "grad_norm": 7.09375,
+      "learning_rate": 0.0004949514981004317,
+      "loss": 16.5558,
+      "step": 16030
+    },
+    {
+      "epoch": 0.03127757384305639,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.000494948247084977,
+      "loss": 16.643,
+      "step": 16040
+    },
+    {
+      "epoch": 0.031297073577372506,
+      "grad_norm": 7.21875,
+      "learning_rate": 0.0004949449960695223,
+      "loss": 16.7139,
+      "step": 16050
+    },
+    {
+      "epoch": 0.03131657331168863,
+      "grad_norm": 10.125,
+      "learning_rate": 0.0004949417450540676,
+      "loss": 16.5427,
+      "step": 16060
+    },
+    {
+      "epoch": 0.03133607304600475,
+      "grad_norm": 101.5,
+      "learning_rate": 0.000494938494038613,
+      "loss": 16.6593,
+      "step": 16070
+    },
+    {
+      "epoch": 0.03135557278032087,
+      "grad_norm": 8.25,
+      "learning_rate": 0.0004949352430231583,
+      "loss": 16.719,
+      "step": 16080
+    },
+    {
+      "epoch": 0.03137507251463699,
+      "grad_norm": 13.9375,
+      "learning_rate": 0.0004949319920077036,
+      "loss": 16.6558,
+      "step": 16090
+    },
+    {
+      "epoch": 0.03139457224895311,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.000494928740992249,
+      "loss": 16.4561,
+      "step": 16100
+    },
+    {
+      "epoch": 0.031414071983269225,
+      "grad_norm": 9.0625,
+      "learning_rate": 0.0004949254899767943,
+      "loss": 16.4558,
+      "step": 16110
+    },
+    {
+      "epoch": 0.031433571717585346,
+      "grad_norm": 8.6875,
+      "learning_rate": 0.0004949222389613396,
+      "loss": 16.5572,
+      "step": 16120
+    },
+    {
+      "epoch": 0.03145307145190147,
+      "grad_norm": 9.9375,
+      "learning_rate": 0.0004949189879458849,
+      "loss": 16.6075,
+      "step": 16130
+    },
+    {
+      "epoch": 0.03147257118621759,
+      "grad_norm": 13.0,
+      "learning_rate": 0.0004949157369304303,
+      "loss": 16.552,
+      "step": 16140
+    },
+    {
+      "epoch": 0.03149207092053371,
+      "grad_norm": 6.5,
+      "learning_rate": 0.0004949124859149756,
+      "loss": 16.5597,
+      "step": 16150
+    },
+    {
+      "epoch": 0.03151157065484983,
+      "grad_norm": 8.3125,
+      "learning_rate": 0.0004949092348995209,
+      "loss": 16.758,
+      "step": 16160
+    },
+    {
+      "epoch": 0.03153107038916595,
+      "grad_norm": 6.875,
+      "learning_rate": 0.0004949059838840663,
+      "loss": 16.614,
+      "step": 16170
+    },
+    {
+      "epoch": 0.031550570123482065,
+      "grad_norm": 6.875,
+      "learning_rate": 0.0004949027328686116,
+      "loss": 16.6196,
+      "step": 16180
+    },
+    {
+      "epoch": 0.031570069857798186,
+      "grad_norm": 6.75,
+      "learning_rate": 0.0004948994818531569,
+      "loss": 16.6579,
+      "step": 16190
+    },
+    {
+      "epoch": 0.03158956959211431,
+      "grad_norm": 8.75,
+      "learning_rate": 0.0004948962308377022,
+      "loss": 16.7175,
+      "step": 16200
+    },
+    {
+      "epoch": 0.03160906932643043,
+      "grad_norm": 8.25,
+      "learning_rate": 0.0004948929798222476,
+      "loss": 16.6421,
+      "step": 16210
+    },
+    {
+      "epoch": 0.03162856906074655,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.0004948897288067928,
+      "loss": 16.659,
+      "step": 16220
+    },
+    {
+      "epoch": 0.03164806879506267,
+      "grad_norm": 6.625,
+      "learning_rate": 0.0004948864777913381,
+      "loss": 16.5863,
+      "step": 16230
+    },
+    {
+      "epoch": 0.031667568529378784,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004948832267758834,
+      "loss": 16.5491,
+      "step": 16240
+    },
+    {
+      "epoch": 0.031687068263694905,
+      "grad_norm": 9.3125,
+      "learning_rate": 0.0004948799757604288,
+      "loss": 16.6378,
+      "step": 16250
+    },
+    {
+      "epoch": 0.031706567998011026,
+      "grad_norm": 8.4375,
+      "learning_rate": 0.0004948767247449741,
+      "loss": 16.6742,
+      "step": 16260
+    },
+    {
+      "epoch": 0.03172606773232715,
+      "grad_norm": 6.90625,
+      "learning_rate": 0.0004948734737295194,
+      "loss": 16.6117,
+      "step": 16270
+    },
+    {
+      "epoch": 0.03174556746664327,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.0004948702227140648,
+      "loss": 16.688,
+      "step": 16280
+    },
+    {
+      "epoch": 0.03176506720095939,
+      "grad_norm": 7.375,
+      "learning_rate": 0.0004948669716986101,
+      "loss": 16.6857,
+      "step": 16290
+    },
+    {
+      "epoch": 0.03178456693527551,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.0004948637206831554,
+      "loss": 16.7109,
+      "step": 16300
+    },
+    {
+      "epoch": 0.031804066669591624,
+      "grad_norm": 8.375,
+      "learning_rate": 0.0004948604696677007,
+      "loss": 16.6176,
+      "step": 16310
+    },
+    {
+      "epoch": 0.031823566403907745,
+      "grad_norm": 7.25,
+      "learning_rate": 0.0004948572186522461,
+      "loss": 16.7202,
+      "step": 16320
+    },
+    {
+      "epoch": 0.031843066138223866,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.0004948539676367914,
+      "loss": 16.5284,
+      "step": 16330
+    },
+    {
+      "epoch": 0.03186256587253999,
+      "grad_norm": 8.625,
+      "learning_rate": 0.0004948507166213367,
+      "loss": 16.6153,
+      "step": 16340
+    },
+    {
+      "epoch": 0.03188206560685611,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.0004948474656058821,
+      "loss": 16.5773,
+      "step": 16350
+    },
+    {
+      "epoch": 0.03190156534117223,
+      "grad_norm": 268.0,
+      "learning_rate": 0.0004948442145904274,
+      "loss": 16.6544,
+      "step": 16360
+    },
+    {
+      "epoch": 0.03192106507548834,
+      "grad_norm": 9.4375,
+      "learning_rate": 0.0004948409635749726,
+      "loss": 16.7142,
+      "step": 16370
+    },
+    {
+      "epoch": 0.031940564809804464,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.0004948377125595179,
+      "loss": 16.7066,
+      "step": 16380
+    },
+    {
+      "epoch": 0.031960064544120585,
+      "grad_norm": 9.0,
+      "learning_rate": 0.0004948344615440633,
+      "loss": 16.6738,
+      "step": 16390
+    },
+    {
+      "epoch": 0.031979564278436706,
+      "grad_norm": 7.15625,
+      "learning_rate": 0.0004948312105286086,
+      "loss": 16.5181,
+      "step": 16400
+    },
+    {
+      "epoch": 0.03199906401275283,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.0004948279595131539,
+      "loss": 16.6785,
+      "step": 16410
+    },
+    {
+      "epoch": 0.03201856374706895,
+      "grad_norm": 6.5625,
+      "learning_rate": 0.0004948247084976992,
+      "loss": 16.5873,
+      "step": 16420
+    },
+    {
+      "epoch": 0.03203806348138507,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.0004948214574822446,
+      "loss": 16.5564,
+      "step": 16430
+    },
+    {
+      "epoch": 0.03205756321570118,
+      "grad_norm": 6.90625,
+      "learning_rate": 0.0004948182064667899,
+      "loss": 16.5584,
+      "step": 16440
+    },
+    {
+      "epoch": 0.032077062950017304,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004948149554513352,
+      "loss": 16.5856,
+      "step": 16450
+    },
+    {
+      "epoch": 0.032096562684333425,
+      "grad_norm": 8.125,
+      "learning_rate": 0.0004948117044358806,
+      "loss": 16.6783,
+      "step": 16460
+    },
+    {
+      "epoch": 0.032116062418649546,
+      "grad_norm": 8.875,
+      "learning_rate": 0.0004948084534204259,
+      "loss": 16.5922,
+      "step": 16470
+    },
+    {
+      "epoch": 0.03213556215296567,
+      "grad_norm": 9.5625,
+      "learning_rate": 0.0004948052024049712,
+      "loss": 16.541,
+      "step": 16480
+    },
+    {
+      "epoch": 0.03215506188728179,
+      "grad_norm": 9.25,
+      "learning_rate": 0.0004948019513895165,
+      "loss": 16.4979,
+      "step": 16490
+    },
+    {
+      "epoch": 0.0321745616215979,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.0004947987003740619,
+      "loss": 16.5768,
+      "step": 16500
+    },
+    {
+      "epoch": 0.03219406135591402,
+      "grad_norm": 6.8125,
+      "learning_rate": 0.0004947954493586072,
+      "loss": 16.6362,
+      "step": 16510
+    },
+    {
+      "epoch": 0.032213561090230144,
+      "grad_norm": 7.25,
+      "learning_rate": 0.0004947921983431524,
+      "loss": 16.6625,
+      "step": 16520
+    },
+    {
+      "epoch": 0.032233060824546265,
+      "grad_norm": 7.0625,
+      "learning_rate": 0.0004947889473276978,
+      "loss": 16.4659,
+      "step": 16530
+    },
+    {
+      "epoch": 0.032252560558862386,
+      "grad_norm": 8.5625,
+      "learning_rate": 0.0004947856963122431,
+      "loss": 16.5525,
+      "step": 16540
+    },
+    {
+      "epoch": 0.03227206029317851,
+      "grad_norm": 7.46875,
+      "learning_rate": 0.0004947824452967884,
+      "loss": 16.6808,
+      "step": 16550
+    },
+    {
+      "epoch": 0.03229156002749463,
+      "grad_norm": 7.25,
+      "learning_rate": 0.0004947791942813337,
+      "loss": 16.6119,
+      "step": 16560
+    },
+    {
+      "epoch": 0.03231105976181074,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004947759432658791,
+      "loss": 16.623,
+      "step": 16570
+    },
+    {
+      "epoch": 0.032330559496126864,
+      "grad_norm": 8.125,
+      "learning_rate": 0.0004947726922504244,
+      "loss": 16.5381,
+      "step": 16580
+    },
+    {
+      "epoch": 0.032350059230442985,
+      "grad_norm": 7.40625,
+      "learning_rate": 0.0004947694412349697,
+      "loss": 16.7365,
+      "step": 16590
+    },
+    {
+      "epoch": 0.032369558964759106,
+      "grad_norm": 6.5625,
+      "learning_rate": 0.000494766190219515,
+      "loss": 16.6286,
+      "step": 16600
+    },
+    {
+      "epoch": 0.03238905869907523,
+      "grad_norm": 11.875,
+      "learning_rate": 0.0004947629392040604,
+      "loss": 16.5944,
+      "step": 16610
+    },
+    {
+      "epoch": 0.03240855843339135,
+      "grad_norm": 7.0625,
+      "learning_rate": 0.0004947596881886057,
+      "loss": 16.6532,
+      "step": 16620
+    },
+    {
+      "epoch": 0.03242805816770746,
+      "grad_norm": 7.03125,
+      "learning_rate": 0.000494756437173151,
+      "loss": 16.6551,
+      "step": 16630
+    },
+    {
+      "epoch": 0.03244755790202358,
+      "grad_norm": 8.6875,
+      "learning_rate": 0.0004947531861576964,
+      "loss": 16.5883,
+      "step": 16640
+    },
+    {
+      "epoch": 0.032467057636339704,
+      "grad_norm": 7.15625,
+      "learning_rate": 0.0004947499351422417,
+      "loss": 16.6404,
+      "step": 16650
+    },
+    {
+      "epoch": 0.032486557370655825,
+      "grad_norm": 10.8125,
+      "learning_rate": 0.000494746684126787,
+      "loss": 16.6752,
+      "step": 16660
+    },
+    {
+      "epoch": 0.032506057104971946,
+      "grad_norm": 15.4375,
+      "learning_rate": 0.0004947434331113324,
+      "loss": 16.7058,
+      "step": 16670
+    },
+    {
+      "epoch": 0.03252555683928807,
+      "grad_norm": 8.9375,
+      "learning_rate": 0.0004947401820958777,
+      "loss": 16.6456,
+      "step": 16680
+    },
+    {
+      "epoch": 0.03254505657360419,
+      "grad_norm": 7.5625,
+      "learning_rate": 0.000494736931080423,
+      "loss": 16.4961,
+      "step": 16690
+    },
+    {
+      "epoch": 0.0325645563079203,
+      "grad_norm": 8.4375,
+      "learning_rate": 0.0004947336800649683,
+      "loss": 16.6947,
+      "step": 16700
+    },
+    {
+      "epoch": 0.03258405604223642,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.0004947304290495137,
+      "loss": 16.6582,
+      "step": 16710
+    },
+    {
+      "epoch": 0.032603555776552544,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.0004947271780340589,
+      "loss": 16.5912,
+      "step": 16720
+    },
+    {
+      "epoch": 0.032623055510868665,
+      "grad_norm": 7.3125,
+      "learning_rate": 0.0004947239270186042,
+      "loss": 16.5378,
+      "step": 16730
+    },
+    {
+      "epoch": 0.032642555245184786,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.0004947206760031495,
+      "loss": 16.6431,
+      "step": 16740
+    },
+    {
+      "epoch": 0.03266205497950091,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.0004947174249876949,
+      "loss": 16.5495,
+      "step": 16750
+    },
+    {
+      "epoch": 0.03268155471381702,
+      "grad_norm": 6.6875,
+      "learning_rate": 0.0004947141739722402,
+      "loss": 16.5369,
+      "step": 16760
+    },
+    {
+      "epoch": 0.03270105444813314,
+      "grad_norm": 10.5,
+      "learning_rate": 0.0004947109229567855,
+      "loss": 16.6108,
+      "step": 16770
+    },
+    {
+      "epoch": 0.03272055418244926,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.0004947076719413309,
+      "loss": 16.7242,
+      "step": 16780
+    },
+    {
+      "epoch": 0.032740053916765384,
+      "grad_norm": 10.4375,
+      "learning_rate": 0.0004947044209258762,
+      "loss": 16.5792,
+      "step": 16790
+    },
+    {
+      "epoch": 0.032759553651081505,
+      "grad_norm": 7.84375,
+      "learning_rate": 0.0004947011699104215,
+      "loss": 16.6258,
+      "step": 16800
+    },
+    {
+      "epoch": 0.032779053385397626,
+      "grad_norm": 9.625,
+      "learning_rate": 0.0004946979188949668,
+      "loss": 16.5363,
+      "step": 16810
+    },
+    {
+      "epoch": 0.03279855311971375,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004946946678795122,
+      "loss": 16.6006,
+      "step": 16820
+    },
+    {
+      "epoch": 0.03281805285402986,
+      "grad_norm": 6.03125,
+      "learning_rate": 0.0004946914168640575,
+      "loss": 16.6048,
+      "step": 16830
+    },
+    {
+      "epoch": 0.03283755258834598,
+      "grad_norm": 7.5625,
+      "learning_rate": 0.0004946881658486028,
+      "loss": 16.4615,
+      "step": 16840
+    },
+    {
+      "epoch": 0.0328570523226621,
+      "grad_norm": 8.75,
+      "learning_rate": 0.0004946849148331482,
+      "loss": 16.6284,
+      "step": 16850
+    },
+    {
+      "epoch": 0.032876552056978224,
+      "grad_norm": 7.46875,
+      "learning_rate": 0.0004946816638176935,
+      "loss": 16.6879,
+      "step": 16860
+    },
+    {
+      "epoch": 0.032896051791294345,
+      "grad_norm": 7.96875,
+      "learning_rate": 0.0004946784128022388,
+      "loss": 16.5871,
+      "step": 16870
+    },
+    {
+      "epoch": 0.032915551525610466,
+      "grad_norm": 8.75,
+      "learning_rate": 0.0004946751617867841,
+      "loss": 16.5434,
+      "step": 16880
+    },
+    {
+      "epoch": 0.03293505125992658,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.0004946719107713295,
+      "loss": 16.5516,
+      "step": 16890
+    },
+    {
+      "epoch": 0.0329545509942427,
+      "grad_norm": 7.96875,
+      "learning_rate": 0.0004946686597558748,
+      "loss": 16.5549,
+      "step": 16900
+    },
+    {
+      "epoch": 0.03297405072855882,
+      "grad_norm": 9.25,
+      "learning_rate": 0.0004946654087404201,
+      "loss": 16.5084,
+      "step": 16910
+    },
+    {
+      "epoch": 0.03299355046287494,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.0004946621577249655,
+      "loss": 16.5905,
+      "step": 16920
+    },
+    {
+      "epoch": 0.033013050197191064,
+      "grad_norm": 7.0,
+      "learning_rate": 0.0004946589067095108,
+      "loss": 16.5883,
+      "step": 16930
+    },
+    {
+      "epoch": 0.033032549931507185,
+      "grad_norm": 10.6875,
+      "learning_rate": 0.0004946556556940561,
+      "loss": 16.5743,
+      "step": 16940
+    },
+    {
+      "epoch": 0.033052049665823306,
+      "grad_norm": 8.4375,
+      "learning_rate": 0.0004946524046786014,
+      "loss": 16.5847,
+      "step": 16950
+    },
+    {
+      "epoch": 0.03307154940013942,
+      "grad_norm": 7.90625,
+      "learning_rate": 0.0004946491536631467,
+      "loss": 16.5937,
+      "step": 16960
+    },
+    {
+      "epoch": 0.03309104913445554,
+      "grad_norm": 6.875,
+      "learning_rate": 0.000494645902647692,
+      "loss": 16.5317,
+      "step": 16970
+    },
+    {
+      "epoch": 0.03311054886877166,
+      "grad_norm": 9.8125,
+      "learning_rate": 0.0004946426516322373,
+      "loss": 16.5498,
+      "step": 16980
+    },
+    {
+      "epoch": 0.03313004860308778,
+      "grad_norm": 7.0625,
+      "learning_rate": 0.0004946394006167826,
+      "loss": 16.6557,
+      "step": 16990
+    },
+    {
+      "epoch": 0.033149548337403904,
+      "grad_norm": 7.25,
+      "learning_rate": 0.000494636149601328,
+      "loss": 16.5932,
+      "step": 17000
+    },
+    {
+      "epoch": 0.033169048071720025,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.0004946328985858733,
+      "loss": 16.5905,
+      "step": 17010
+    },
+    {
+      "epoch": 0.03318854780603614,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004946296475704186,
+      "loss": 16.6248,
+      "step": 17020
+    },
+    {
+      "epoch": 0.03320804754035226,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.000494626396554964,
+      "loss": 16.6214,
+      "step": 17030
+    },
+    {
+      "epoch": 0.03322754727466838,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.0004946231455395093,
+      "loss": 16.656,
+      "step": 17040
+    },
+    {
+      "epoch": 0.0332470470089845,
+      "grad_norm": 8.75,
+      "learning_rate": 0.0004946198945240546,
+      "loss": 16.5983,
+      "step": 17050
+    },
+    {
+      "epoch": 0.03326654674330062,
+      "grad_norm": 9.125,
+      "learning_rate": 0.0004946166435085999,
+      "loss": 16.501,
+      "step": 17060
+    },
+    {
+      "epoch": 0.033286046477616744,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.0004946133924931453,
+      "loss": 16.6354,
+      "step": 17070
+    },
+    {
+      "epoch": 0.033305546211932865,
+      "grad_norm": 10.0,
+      "learning_rate": 0.0004946101414776906,
+      "loss": 16.5313,
+      "step": 17080
+    },
+    {
+      "epoch": 0.03332504594624898,
+      "grad_norm": 8.3125,
+      "learning_rate": 0.0004946068904622359,
+      "loss": 16.6214,
+      "step": 17090
+    },
+    {
+      "epoch": 0.0333445456805651,
+      "grad_norm": 6.84375,
+      "learning_rate": 0.0004946036394467813,
+      "loss": 16.5653,
+      "step": 17100
+    },
+    {
+      "epoch": 0.03336404541488122,
+      "grad_norm": 17.75,
+      "learning_rate": 0.0004946003884313266,
+      "loss": 16.6099,
+      "step": 17110
+    },
+    {
+      "epoch": 0.03338354514919734,
+      "grad_norm": 7.75,
+      "learning_rate": 0.0004945971374158719,
+      "loss": 16.5001,
+      "step": 17120
+    },
+    {
+      "epoch": 0.033403044883513464,
+      "grad_norm": 7.59375,
+      "learning_rate": 0.0004945938864004172,
+      "loss": 16.6556,
+      "step": 17130
+    },
+    {
+      "epoch": 0.033422544617829585,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.0004945906353849625,
+      "loss": 16.571,
+      "step": 17140
+    },
+    {
+      "epoch": 0.0334420443521457,
+      "grad_norm": 9.25,
+      "learning_rate": 0.0004945873843695078,
+      "loss": 16.6326,
+      "step": 17150
+    },
+    {
+      "epoch": 0.03346154408646182,
+      "grad_norm": 9.5625,
+      "learning_rate": 0.0004945841333540531,
+      "loss": 16.6164,
+      "step": 17160
+    },
+    {
+      "epoch": 0.03348104382077794,
+      "grad_norm": 7.875,
+      "learning_rate": 0.0004945808823385984,
+      "loss": 16.5917,
+      "step": 17170
+    },
+    {
+      "epoch": 0.03350054355509406,
+      "grad_norm": 6.5625,
+      "learning_rate": 0.0004945776313231438,
+      "loss": 16.6178,
+      "step": 17180
+    },
+    {
+      "epoch": 0.03352004328941018,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.0004945743803076891,
+      "loss": 16.5651,
+      "step": 17190
+    },
+    {
+      "epoch": 0.033539543023726304,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.0004945711292922344,
+      "loss": 16.7342,
+      "step": 17200
+    },
+    {
+      "epoch": 0.033559042758042425,
+      "grad_norm": 6.5,
+      "learning_rate": 0.0004945678782767798,
+      "loss": 16.6286,
+      "step": 17210
+    },
+    {
+      "epoch": 0.03357854249235854,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.0004945646272613251,
+      "loss": 16.602,
+      "step": 17220
+    },
+    {
+      "epoch": 0.03359804222667466,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.0004945613762458704,
+      "loss": 16.652,
+      "step": 17230
+    },
+    {
+      "epoch": 0.03361754196099078,
+      "grad_norm": 20.75,
+      "learning_rate": 0.0004945581252304157,
+      "loss": 16.5934,
+      "step": 17240
+    },
+    {
+      "epoch": 0.0336370416953069,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.0004945548742149611,
+      "loss": 16.5494,
+      "step": 17250
+    },
+    {
+      "epoch": 0.03365654142962302,
+      "grad_norm": 7.09375,
+      "learning_rate": 0.0004945516231995064,
+      "loss": 16.6584,
+      "step": 17260
+    },
+    {
+      "epoch": 0.033676041163939144,
+      "grad_norm": 8.625,
+      "learning_rate": 0.0004945483721840517,
+      "loss": 16.6333,
+      "step": 17270
+    },
+    {
+      "epoch": 0.03369554089825526,
+      "grad_norm": 6.75,
+      "learning_rate": 0.0004945451211685971,
+      "loss": 16.6222,
+      "step": 17280
+    },
+    {
+      "epoch": 0.03371504063257138,
+      "grad_norm": 7.125,
+      "learning_rate": 0.0004945418701531423,
+      "loss": 16.4551,
+      "step": 17290
+    },
+    {
+      "epoch": 0.0337345403668875,
+      "grad_norm": 7.5625,
+      "learning_rate": 0.0004945386191376876,
+      "loss": 16.4826,
+      "step": 17300
+    },
+    {
+      "epoch": 0.03375404010120362,
+      "grad_norm": 8.1875,
+      "learning_rate": 0.0004945353681222329,
+      "loss": 16.6638,
+      "step": 17310
+    },
+    {
+      "epoch": 0.03377353983551974,
+      "grad_norm": 6.96875,
+      "learning_rate": 0.0004945321171067783,
+      "loss": 16.5386,
+      "step": 17320
+    },
+    {
+      "epoch": 0.03379303956983586,
+      "grad_norm": 8.0,
+      "learning_rate": 0.0004945288660913236,
+      "loss": 16.6675,
+      "step": 17330
+    },
+    {
+      "epoch": 0.033812539304151984,
+      "grad_norm": 9.0625,
+      "learning_rate": 0.0004945256150758689,
+      "loss": 16.6567,
+      "step": 17340
+    },
+    {
+      "epoch": 0.0338320390384681,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.0004945223640604143,
+      "loss": 16.641,
+      "step": 17350
+    },
+    {
+      "epoch": 0.03385153877278422,
+      "grad_norm": 8.375,
+      "learning_rate": 0.0004945191130449596,
+      "loss": 16.5874,
+      "step": 17360
+    },
+    {
+      "epoch": 0.03387103850710034,
+      "grad_norm": 6.84375,
+      "learning_rate": 0.0004945158620295049,
+      "loss": 16.6083,
+      "step": 17370
+    },
+    {
+      "epoch": 0.03389053824141646,
+      "grad_norm": 7.3125,
+      "learning_rate": 0.0004945126110140502,
+      "loss": 16.5926,
+      "step": 17380
+    },
+    {
+      "epoch": 0.03391003797573258,
+      "grad_norm": 7.5625,
+      "learning_rate": 0.0004945093599985956,
+      "loss": 16.5794,
+      "step": 17390
+    },
+    {
+      "epoch": 0.0339295377100487,
+      "grad_norm": 7.40625,
+      "learning_rate": 0.0004945061089831409,
+      "loss": 16.5721,
+      "step": 17400
+    },
+    {
+      "epoch": 0.03394903744436482,
+      "grad_norm": 6.5,
+      "learning_rate": 0.0004945028579676862,
+      "loss": 16.6443,
+      "step": 17410
+    },
+    {
+      "epoch": 0.03396853717868094,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004944996069522316,
+      "loss": 16.6037,
+      "step": 17420
+    },
+    {
+      "epoch": 0.03398803691299706,
+      "grad_norm": 8.5,
+      "learning_rate": 0.0004944963559367769,
+      "loss": 16.5677,
+      "step": 17430
+    },
+    {
+      "epoch": 0.03400753664731318,
+      "grad_norm": 10.1875,
+      "learning_rate": 0.0004944931049213222,
+      "loss": 16.5652,
+      "step": 17440
+    },
+    {
+      "epoch": 0.0340270363816293,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.0004944898539058674,
+      "loss": 16.635,
+      "step": 17450
+    },
+    {
+      "epoch": 0.03404653611594542,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.0004944866028904128,
+      "loss": 16.6619,
+      "step": 17460
+    },
+    {
+      "epoch": 0.03406603585026154,
+      "grad_norm": 6.875,
+      "learning_rate": 0.0004944833518749581,
+      "loss": 16.6537,
+      "step": 17470
+    },
+    {
+      "epoch": 0.03408553558457766,
+      "grad_norm": 7.59375,
+      "learning_rate": 0.0004944801008595034,
+      "loss": 16.6335,
+      "step": 17480
+    },
+    {
+      "epoch": 0.03410503531889378,
+      "grad_norm": 6.6875,
+      "learning_rate": 0.0004944768498440487,
+      "loss": 16.6003,
+      "step": 17490
+    },
+    {
+      "epoch": 0.0341245350532099,
+      "grad_norm": 7.375,
+      "learning_rate": 0.0004944735988285941,
+      "loss": 16.6049,
+      "step": 17500
+    },
+    {
+      "epoch": 0.03414403478752602,
+      "grad_norm": 8.5625,
+      "learning_rate": 0.0004944703478131394,
+      "loss": 16.6644,
+      "step": 17510
+    },
+    {
+      "epoch": 0.03416353452184214,
+      "grad_norm": 7.59375,
+      "learning_rate": 0.0004944670967976847,
+      "loss": 16.5921,
+      "step": 17520
+    },
+    {
+      "epoch": 0.03418303425615826,
+      "grad_norm": 10.5625,
+      "learning_rate": 0.0004944638457822301,
+      "loss": 16.7162,
+      "step": 17530
+    },
+    {
+      "epoch": 0.034202533990474376,
+      "grad_norm": 12.375,
+      "learning_rate": 0.0004944605947667754,
+      "loss": 16.6499,
+      "step": 17540
+    },
+    {
+      "epoch": 0.0342220337247905,
+      "grad_norm": 6.625,
+      "learning_rate": 0.0004944573437513207,
+      "loss": 16.5904,
+      "step": 17550
+    },
+    {
+      "epoch": 0.03424153345910662,
+      "grad_norm": 8.875,
+      "learning_rate": 0.000494454092735866,
+      "loss": 16.5917,
+      "step": 17560
+    },
+    {
+      "epoch": 0.03426103319342274,
+      "grad_norm": 8.25,
+      "learning_rate": 0.0004944508417204114,
+      "loss": 16.5057,
+      "step": 17570
+    },
+    {
+      "epoch": 0.03428053292773886,
+      "grad_norm": 9.875,
+      "learning_rate": 0.0004944475907049567,
+      "loss": 16.4435,
+      "step": 17580
+    },
+    {
+      "epoch": 0.03430003266205498,
+      "grad_norm": 7.03125,
+      "learning_rate": 0.000494444339689502,
+      "loss": 16.5729,
+      "step": 17590
+    },
+    {
+      "epoch": 0.0343195323963711,
+      "grad_norm": 7.8125,
+      "learning_rate": 0.0004944410886740474,
+      "loss": 16.4975,
+      "step": 17600
+    },
+    {
+      "epoch": 0.034339032130687216,
+      "grad_norm": 8.625,
+      "learning_rate": 0.0004944378376585927,
+      "loss": 16.5803,
+      "step": 17610
+    },
+    {
+      "epoch": 0.03435853186500334,
+      "grad_norm": 7.0,
+      "learning_rate": 0.000494434586643138,
+      "loss": 16.6196,
+      "step": 17620
+    },
+    {
+      "epoch": 0.03437803159931946,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.0004944313356276833,
+      "loss": 16.5954,
+      "step": 17630
+    },
+    {
+      "epoch": 0.03439753133363558,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.0004944280846122287,
+      "loss": 16.6301,
+      "step": 17640
+    },
+    {
+      "epoch": 0.0344170310679517,
+      "grad_norm": 8.3125,
+      "learning_rate": 0.000494424833596774,
+      "loss": 16.5994,
+      "step": 17650
+    },
+    {
+      "epoch": 0.03443653080226782,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.0004944215825813193,
+      "loss": 16.584,
+      "step": 17660
+    },
+    {
+      "epoch": 0.034456030536583936,
+      "grad_norm": 8.8125,
+      "learning_rate": 0.0004944183315658647,
+      "loss": 16.5309,
+      "step": 17670
+    },
+    {
+      "epoch": 0.03447553027090006,
+      "grad_norm": 6.90625,
+      "learning_rate": 0.00049441508055041,
+      "loss": 16.6363,
+      "step": 17680
+    },
+    {
+      "epoch": 0.03449503000521618,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.0004944118295349552,
+      "loss": 16.6538,
+      "step": 17690
+    },
+    {
+      "epoch": 0.0345145297395323,
+      "grad_norm": 58.0,
+      "learning_rate": 0.0004944085785195005,
+      "loss": 16.5892,
+      "step": 17700
+    },
+    {
+      "epoch": 0.03453402947384842,
+      "grad_norm": 9.0,
+      "learning_rate": 0.0004944053275040459,
+      "loss": 16.5848,
+      "step": 17710
+    },
+    {
+      "epoch": 0.03455352920816454,
+      "grad_norm": 7.15625,
+      "learning_rate": 0.0004944020764885912,
+      "loss": 16.5623,
+      "step": 17720
+    },
+    {
+      "epoch": 0.03457302894248066,
+      "grad_norm": 8.5,
+      "learning_rate": 0.0004943988254731365,
+      "loss": 16.6047,
+      "step": 17730
+    },
+    {
+      "epoch": 0.034592528676796776,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.0004943955744576818,
+      "loss": 16.6139,
+      "step": 17740
+    },
+    {
+      "epoch": 0.0346120284111129,
+      "grad_norm": 9.25,
+      "learning_rate": 0.0004943923234422272,
+      "loss": 16.602,
+      "step": 17750
+    },
+    {
+      "epoch": 0.03463152814542902,
+      "grad_norm": 11.625,
+      "learning_rate": 0.0004943890724267725,
+      "loss": 16.5514,
+      "step": 17760
+    },
+    {
+      "epoch": 0.03465102787974514,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.0004943858214113178,
+      "loss": 16.5125,
+      "step": 17770
+    },
+    {
+      "epoch": 0.03467052761406126,
+      "grad_norm": 7.0,
+      "learning_rate": 0.0004943825703958632,
+      "loss": 16.575,
+      "step": 17780
+    },
+    {
+      "epoch": 0.03469002734837738,
+      "grad_norm": 8.125,
+      "learning_rate": 0.0004943793193804085,
+      "loss": 16.6736,
+      "step": 17790
+    },
+    {
+      "epoch": 0.034709527082693495,
+      "grad_norm": 12.875,
+      "learning_rate": 0.0004943760683649538,
+      "loss": 16.6935,
+      "step": 17800
+    },
+    {
+      "epoch": 0.034729026817009616,
+      "grad_norm": 9.4375,
+      "learning_rate": 0.0004943728173494991,
+      "loss": 16.5395,
+      "step": 17810
+    },
+    {
+      "epoch": 0.03474852655132574,
+      "grad_norm": 7.25,
+      "learning_rate": 0.0004943695663340445,
+      "loss": 16.4912,
+      "step": 17820
+    },
+    {
+      "epoch": 0.03476802628564186,
+      "grad_norm": 6.46875,
+      "learning_rate": 0.0004943663153185898,
+      "loss": 16.5062,
+      "step": 17830
+    },
+    {
+      "epoch": 0.03478752601995798,
+      "grad_norm": 6.5,
+      "learning_rate": 0.0004943630643031351,
+      "loss": 16.5508,
+      "step": 17840
+    },
+    {
+      "epoch": 0.0348070257542741,
+      "grad_norm": 9.375,
+      "learning_rate": 0.0004943598132876805,
+      "loss": 16.7509,
+      "step": 17850
+    },
+    {
+      "epoch": 0.03482652548859022,
+      "grad_norm": 8.3125,
+      "learning_rate": 0.0004943565622722258,
+      "loss": 16.6271,
+      "step": 17860
+    },
+    {
+      "epoch": 0.034846025222906335,
+      "grad_norm": 8.6875,
+      "learning_rate": 0.0004943533112567711,
+      "loss": 16.5867,
+      "step": 17870
+    },
+    {
+      "epoch": 0.034865524957222456,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.0004943500602413164,
+      "loss": 16.6566,
+      "step": 17880
+    },
+    {
+      "epoch": 0.03488502469153858,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.0004943468092258618,
+      "loss": 16.573,
+      "step": 17890
+    },
+    {
+      "epoch": 0.0349045244258547,
+      "grad_norm": 7.21875,
+      "learning_rate": 0.0004943435582104071,
+      "loss": 16.6032,
+      "step": 17900
+    },
+    {
+      "epoch": 0.03492402416017082,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.0004943403071949523,
+      "loss": 16.6089,
+      "step": 17910
+    },
+    {
+      "epoch": 0.03494352389448694,
+      "grad_norm": 7.46875,
+      "learning_rate": 0.0004943370561794977,
+      "loss": 16.6048,
+      "step": 17920
+    },
+    {
+      "epoch": 0.03496302362880306,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.000494333805164043,
+      "loss": 16.608,
+      "step": 17930
+    },
+    {
+      "epoch": 0.034982523363119175,
+      "grad_norm": 8.0,
+      "learning_rate": 0.0004943305541485883,
+      "loss": 16.6105,
+      "step": 17940
+    },
+    {
+      "epoch": 0.035002023097435296,
+      "grad_norm": 8.0,
+      "learning_rate": 0.0004943273031331336,
+      "loss": 16.5891,
+      "step": 17950
+    },
+    {
+      "epoch": 0.03502152283175142,
+      "grad_norm": 9.1875,
+      "learning_rate": 0.000494324052117679,
+      "loss": 16.5736,
+      "step": 17960
+    },
+    {
+      "epoch": 0.03504102256606754,
+      "grad_norm": 7.5,
+      "learning_rate": 0.0004943208011022243,
+      "loss": 16.6407,
+      "step": 17970
+    },
+    {
+      "epoch": 0.03506052230038366,
+      "grad_norm": 9.625,
+      "learning_rate": 0.0004943175500867696,
+      "loss": 16.5699,
+      "step": 17980
+    },
+    {
+      "epoch": 0.03508002203469978,
+      "grad_norm": 6.875,
+      "learning_rate": 0.000494314299071315,
+      "loss": 16.5092,
+      "step": 17990
+    },
+    {
+      "epoch": 0.035099521769015894,
+      "grad_norm": 27.625,
+      "learning_rate": 0.0004943110480558603,
+      "loss": 16.482,
+      "step": 18000
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 4.004838781385651e+19,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null