9b-106 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
9756021 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 2844,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004219409282700422,
"grad_norm": 59.86221694946289,
"learning_rate": 5.594405594405594e-08,
"loss": 2.1981945037841797,
"step": 2
},
{
"epoch": 0.008438818565400843,
"grad_norm": 12.374919891357422,
"learning_rate": 1.6783216783216782e-07,
"loss": 1.7811565399169922,
"step": 4
},
{
"epoch": 0.012658227848101266,
"grad_norm": 2.8306868076324463,
"learning_rate": 2.7972027972027973e-07,
"loss": 1.9376487731933594,
"step": 6
},
{
"epoch": 0.016877637130801686,
"grad_norm": 5.625478267669678,
"learning_rate": 3.916083916083916e-07,
"loss": 1.9494853019714355,
"step": 8
},
{
"epoch": 0.02109704641350211,
"grad_norm": 15.797261238098145,
"learning_rate": 5.034965034965036e-07,
"loss": 1.849827766418457,
"step": 10
},
{
"epoch": 0.02531645569620253,
"grad_norm": 2.746943950653076,
"learning_rate": 6.153846153846154e-07,
"loss": 1.3138155937194824,
"step": 12
},
{
"epoch": 0.029535864978902954,
"grad_norm": 7.311520576477051,
"learning_rate": 7.272727272727272e-07,
"loss": 1.650458574295044,
"step": 14
},
{
"epoch": 0.03375527426160337,
"grad_norm": 10.263240814208984,
"learning_rate": 8.391608391608391e-07,
"loss": 2.136387825012207,
"step": 16
},
{
"epoch": 0.0379746835443038,
"grad_norm": 1.834839940071106,
"learning_rate": 9.51048951048951e-07,
"loss": 1.8011322021484375,
"step": 18
},
{
"epoch": 0.04219409282700422,
"grad_norm": 3.437499761581421,
"learning_rate": 1.0629370629370628e-06,
"loss": 1.872532606124878,
"step": 20
},
{
"epoch": 0.046413502109704644,
"grad_norm": 2.469942808151245,
"learning_rate": 1.1748251748251746e-06,
"loss": 1.5344078540802002,
"step": 22
},
{
"epoch": 0.05063291139240506,
"grad_norm": 3.802064895629883,
"learning_rate": 1.2867132867132867e-06,
"loss": 1.7150638103485107,
"step": 24
},
{
"epoch": 0.05485232067510549,
"grad_norm": 3.20348858833313,
"learning_rate": 1.3986013986013985e-06,
"loss": 1.5234665870666504,
"step": 26
},
{
"epoch": 0.05907172995780591,
"grad_norm": 3.4241185188293457,
"learning_rate": 1.5104895104895103e-06,
"loss": 1.8226149082183838,
"step": 28
},
{
"epoch": 0.06329113924050633,
"grad_norm": 9.308449745178223,
"learning_rate": 1.6223776223776222e-06,
"loss": 1.499394178390503,
"step": 30
},
{
"epoch": 0.06751054852320675,
"grad_norm": 2.2547056674957275,
"learning_rate": 1.734265734265734e-06,
"loss": 1.6274735927581787,
"step": 32
},
{
"epoch": 0.07172995780590717,
"grad_norm": 43.91905212402344,
"learning_rate": 1.8461538461538462e-06,
"loss": 1.1456708908081055,
"step": 34
},
{
"epoch": 0.0759493670886076,
"grad_norm": 3.9160234928131104,
"learning_rate": 1.958041958041958e-06,
"loss": 1.6436142921447754,
"step": 36
},
{
"epoch": 0.08016877637130802,
"grad_norm": 4.995796203613281,
"learning_rate": 2.06993006993007e-06,
"loss": 1.1828241348266602,
"step": 38
},
{
"epoch": 0.08438818565400844,
"grad_norm": 1.9018964767456055,
"learning_rate": 2.1818181818181815e-06,
"loss": 1.6038843393325806,
"step": 40
},
{
"epoch": 0.08860759493670886,
"grad_norm": 70.95392608642578,
"learning_rate": 2.2937062937062938e-06,
"loss": 1.1521950960159302,
"step": 42
},
{
"epoch": 0.09282700421940929,
"grad_norm": 5.062403202056885,
"learning_rate": 2.405594405594405e-06,
"loss": 1.8195090293884277,
"step": 44
},
{
"epoch": 0.0970464135021097,
"grad_norm": 7.01928186416626,
"learning_rate": 2.5174825174825174e-06,
"loss": 1.6717772483825684,
"step": 46
},
{
"epoch": 0.10126582278481013,
"grad_norm": 2.7374989986419678,
"learning_rate": 2.629370629370629e-06,
"loss": 1.5755090713500977,
"step": 48
},
{
"epoch": 0.10548523206751055,
"grad_norm": 3.8747036457061768,
"learning_rate": 2.741258741258741e-06,
"loss": 0.8314967751502991,
"step": 50
},
{
"epoch": 0.10970464135021098,
"grad_norm": 5.7753753662109375,
"learning_rate": 2.8531468531468534e-06,
"loss": 0.8825576305389404,
"step": 52
},
{
"epoch": 0.11392405063291139,
"grad_norm": 2.804755449295044,
"learning_rate": 2.9650349650349648e-06,
"loss": 0.8038457632064819,
"step": 54
},
{
"epoch": 0.11814345991561181,
"grad_norm": 2.4263148307800293,
"learning_rate": 3.076923076923077e-06,
"loss": 1.4315626621246338,
"step": 56
},
{
"epoch": 0.12236286919831224,
"grad_norm": 8.127001762390137,
"learning_rate": 3.1888111888111884e-06,
"loss": 0.7465399503707886,
"step": 58
},
{
"epoch": 0.12658227848101267,
"grad_norm": 2.8598272800445557,
"learning_rate": 3.3006993006993007e-06,
"loss": 1.4817099571228027,
"step": 60
},
{
"epoch": 0.1308016877637131,
"grad_norm": 3.184314489364624,
"learning_rate": 3.4125874125874125e-06,
"loss": 1.5089186429977417,
"step": 62
},
{
"epoch": 0.1350210970464135,
"grad_norm": 1.7058652639389038,
"learning_rate": 3.5244755244755243e-06,
"loss": 1.0980231761932373,
"step": 64
},
{
"epoch": 0.13924050632911392,
"grad_norm": 4.485511779785156,
"learning_rate": 3.636363636363636e-06,
"loss": 0.8224247694015503,
"step": 66
},
{
"epoch": 0.14345991561181434,
"grad_norm": 2.326599359512329,
"learning_rate": 3.748251748251748e-06,
"loss": 1.1156786680221558,
"step": 68
},
{
"epoch": 0.14767932489451477,
"grad_norm": 5.480278491973877,
"learning_rate": 3.860139860139859e-06,
"loss": 0.9959129095077515,
"step": 70
},
{
"epoch": 0.1518987341772152,
"grad_norm": 2.004271984100342,
"learning_rate": 3.972027972027972e-06,
"loss": 1.2949004173278809,
"step": 72
},
{
"epoch": 0.15611814345991562,
"grad_norm": 5.011590957641602,
"learning_rate": 4.083916083916084e-06,
"loss": 1.046140193939209,
"step": 74
},
{
"epoch": 0.16033755274261605,
"grad_norm": 5.78662633895874,
"learning_rate": 4.195804195804196e-06,
"loss": 0.86857008934021,
"step": 76
},
{
"epoch": 0.16455696202531644,
"grad_norm": 2.636563539505005,
"learning_rate": 4.3076923076923076e-06,
"loss": 1.3720424175262451,
"step": 78
},
{
"epoch": 0.16877637130801687,
"grad_norm": 2.0765841007232666,
"learning_rate": 4.4195804195804185e-06,
"loss": 1.3632028102874756,
"step": 80
},
{
"epoch": 0.1729957805907173,
"grad_norm": 2.3837273120880127,
"learning_rate": 4.531468531468531e-06,
"loss": 1.4058136940002441,
"step": 82
},
{
"epoch": 0.17721518987341772,
"grad_norm": 4.226639747619629,
"learning_rate": 4.643356643356643e-06,
"loss": 0.7853094339370728,
"step": 84
},
{
"epoch": 0.18143459915611815,
"grad_norm": 4.09359884262085,
"learning_rate": 4.755244755244755e-06,
"loss": 1.1315921545028687,
"step": 86
},
{
"epoch": 0.18565400843881857,
"grad_norm": 1.9499998092651367,
"learning_rate": 4.8671328671328676e-06,
"loss": 1.323297381401062,
"step": 88
},
{
"epoch": 0.189873417721519,
"grad_norm": 7.248386383056641,
"learning_rate": 4.9790209790209785e-06,
"loss": 0.6489843726158142,
"step": 90
},
{
"epoch": 0.1940928270042194,
"grad_norm": 3.945362091064453,
"learning_rate": 5.09090909090909e-06,
"loss": 1.7082250118255615,
"step": 92
},
{
"epoch": 0.19831223628691982,
"grad_norm": 9.237424850463867,
"learning_rate": 5.202797202797202e-06,
"loss": 0.9587538242340088,
"step": 94
},
{
"epoch": 0.20253164556962025,
"grad_norm": 2.5106499195098877,
"learning_rate": 5.314685314685315e-06,
"loss": 1.4145572185516357,
"step": 96
},
{
"epoch": 0.20675105485232068,
"grad_norm": 2.280298948287964,
"learning_rate": 5.426573426573427e-06,
"loss": 1.4730861186981201,
"step": 98
},
{
"epoch": 0.2109704641350211,
"grad_norm": 4.468693256378174,
"learning_rate": 5.538461538461538e-06,
"loss": 1.24980628490448,
"step": 100
},
{
"epoch": 0.21518987341772153,
"grad_norm": 2.57384991645813,
"learning_rate": 5.6503496503496495e-06,
"loss": 1.0641834735870361,
"step": 102
},
{
"epoch": 0.21940928270042195,
"grad_norm": 2.2377758026123047,
"learning_rate": 5.762237762237762e-06,
"loss": 0.9983944892883301,
"step": 104
},
{
"epoch": 0.22362869198312235,
"grad_norm": 7.83008337020874,
"learning_rate": 5.874125874125874e-06,
"loss": 0.9789789319038391,
"step": 106
},
{
"epoch": 0.22784810126582278,
"grad_norm": 2.1825568675994873,
"learning_rate": 5.986013986013986e-06,
"loss": 0.9948168992996216,
"step": 108
},
{
"epoch": 0.2320675105485232,
"grad_norm": 1.7740533351898193,
"learning_rate": 6.097902097902097e-06,
"loss": 1.0290191173553467,
"step": 110
},
{
"epoch": 0.23628691983122363,
"grad_norm": 1.6431820392608643,
"learning_rate": 6.2097902097902095e-06,
"loss": 1.3816218376159668,
"step": 112
},
{
"epoch": 0.24050632911392406,
"grad_norm": 4.050329208374023,
"learning_rate": 6.321678321678321e-06,
"loss": 1.2858781814575195,
"step": 114
},
{
"epoch": 0.24472573839662448,
"grad_norm": 4.519939422607422,
"learning_rate": 6.433566433566433e-06,
"loss": 1.6122548580169678,
"step": 116
},
{
"epoch": 0.2489451476793249,
"grad_norm": 1.7163703441619873,
"learning_rate": 6.545454545454546e-06,
"loss": 1.2705044746398926,
"step": 118
},
{
"epoch": 0.25316455696202533,
"grad_norm": 1.1608729362487793,
"learning_rate": 6.657342657342657e-06,
"loss": 1.0020270347595215,
"step": 120
},
{
"epoch": 0.25738396624472576,
"grad_norm": 4.328707695007324,
"learning_rate": 6.769230769230769e-06,
"loss": 1.2712280750274658,
"step": 122
},
{
"epoch": 0.2616033755274262,
"grad_norm": 1.8052810430526733,
"learning_rate": 6.8811188811188805e-06,
"loss": 1.2797789573669434,
"step": 124
},
{
"epoch": 0.26582278481012656,
"grad_norm": 2.120347023010254,
"learning_rate": 6.993006993006993e-06,
"loss": 1.3641468286514282,
"step": 126
},
{
"epoch": 0.270042194092827,
"grad_norm": 7.924063682556152,
"learning_rate": 7.104895104895105e-06,
"loss": 0.8769274950027466,
"step": 128
},
{
"epoch": 0.2742616033755274,
"grad_norm": 2.9971201419830322,
"learning_rate": 7.216783216783216e-06,
"loss": 1.1519945859909058,
"step": 130
},
{
"epoch": 0.27848101265822783,
"grad_norm": 4.976275444030762,
"learning_rate": 7.328671328671328e-06,
"loss": 1.217698335647583,
"step": 132
},
{
"epoch": 0.28270042194092826,
"grad_norm": 3.9333672523498535,
"learning_rate": 7.4405594405594405e-06,
"loss": 0.6807541847229004,
"step": 134
},
{
"epoch": 0.2869198312236287,
"grad_norm": 4.12578821182251,
"learning_rate": 7.552447552447552e-06,
"loss": 0.8635811805725098,
"step": 136
},
{
"epoch": 0.2911392405063291,
"grad_norm": 4.128167629241943,
"learning_rate": 7.664335664335663e-06,
"loss": 1.3738093376159668,
"step": 138
},
{
"epoch": 0.29535864978902954,
"grad_norm": 4.789083957672119,
"learning_rate": 7.776223776223776e-06,
"loss": 0.9322667717933655,
"step": 140
},
{
"epoch": 0.29957805907172996,
"grad_norm": 5.845694541931152,
"learning_rate": 7.888111888111889e-06,
"loss": 1.2719149589538574,
"step": 142
},
{
"epoch": 0.3037974683544304,
"grad_norm": 4.548868656158447,
"learning_rate": 8e-06,
"loss": 1.0755615234375,
"step": 144
},
{
"epoch": 0.3080168776371308,
"grad_norm": 2.8906826972961426,
"learning_rate": 7.99999025946351e-06,
"loss": 1.3829221725463867,
"step": 146
},
{
"epoch": 0.31223628691983124,
"grad_norm": 8.330571174621582,
"learning_rate": 7.999961037906754e-06,
"loss": 1.3621151447296143,
"step": 148
},
{
"epoch": 0.31645569620253167,
"grad_norm": 3.5093352794647217,
"learning_rate": 7.999912335487857e-06,
"loss": 1.2308037281036377,
"step": 150
},
{
"epoch": 0.3206751054852321,
"grad_norm": 6.054520606994629,
"learning_rate": 7.999844152470372e-06,
"loss": 1.2870557308197021,
"step": 152
},
{
"epoch": 0.32489451476793246,
"grad_norm": 1.853664755821228,
"learning_rate": 7.999756489223264e-06,
"loss": 1.465219259262085,
"step": 154
},
{
"epoch": 0.3291139240506329,
"grad_norm": 3.744748592376709,
"learning_rate": 7.999649346220915e-06,
"loss": 1.2533340454101562,
"step": 156
},
{
"epoch": 0.3333333333333333,
"grad_norm": 3.5540201663970947,
"learning_rate": 7.999522724043118e-06,
"loss": 1.3192243576049805,
"step": 158
},
{
"epoch": 0.33755274261603374,
"grad_norm": 1.734734058380127,
"learning_rate": 7.999376623375078e-06,
"loss": 1.302985429763794,
"step": 160
},
{
"epoch": 0.34177215189873417,
"grad_norm": 4.0876383781433105,
"learning_rate": 7.999211045007407e-06,
"loss": 0.8754786849021912,
"step": 162
},
{
"epoch": 0.3459915611814346,
"grad_norm": 4.771523952484131,
"learning_rate": 7.999025989836115e-06,
"loss": 1.2280066013336182,
"step": 164
},
{
"epoch": 0.350210970464135,
"grad_norm": 8.509572982788086,
"learning_rate": 7.998821458862613e-06,
"loss": 0.8188046813011169,
"step": 166
},
{
"epoch": 0.35443037974683544,
"grad_norm": 4.890702724456787,
"learning_rate": 7.998597453193701e-06,
"loss": 1.2213170528411865,
"step": 168
},
{
"epoch": 0.35864978902953587,
"grad_norm": 1.8209432363510132,
"learning_rate": 7.998353974041564e-06,
"loss": 1.312690019607544,
"step": 170
},
{
"epoch": 0.3628691983122363,
"grad_norm": 7.146384239196777,
"learning_rate": 7.998091022723772e-06,
"loss": 1.2072352170944214,
"step": 172
},
{
"epoch": 0.3670886075949367,
"grad_norm": 3.6562139987945557,
"learning_rate": 7.997808600663259e-06,
"loss": 1.0147764682769775,
"step": 174
},
{
"epoch": 0.37130801687763715,
"grad_norm": 1.787636637687683,
"learning_rate": 7.997506709388324e-06,
"loss": 1.3151808977127075,
"step": 176
},
{
"epoch": 0.3755274261603376,
"grad_norm": 3.019120454788208,
"learning_rate": 7.997185350532626e-06,
"loss": 1.3140928745269775,
"step": 178
},
{
"epoch": 0.379746835443038,
"grad_norm": 1.2904940843582153,
"learning_rate": 7.996844525835172e-06,
"loss": 1.0001540184020996,
"step": 180
},
{
"epoch": 0.38396624472573837,
"grad_norm": 1.6466349363327026,
"learning_rate": 7.9964842371403e-06,
"loss": 1.2761380672454834,
"step": 182
},
{
"epoch": 0.3881856540084388,
"grad_norm": 2.2190961837768555,
"learning_rate": 7.996104486397683e-06,
"loss": 1.272679090499878,
"step": 184
},
{
"epoch": 0.3924050632911392,
"grad_norm": 8.958863258361816,
"learning_rate": 7.995705275662305e-06,
"loss": 0.5206277966499329,
"step": 186
},
{
"epoch": 0.39662447257383965,
"grad_norm": 1.6779208183288574,
"learning_rate": 7.995286607094459e-06,
"loss": 0.9622843265533447,
"step": 188
},
{
"epoch": 0.4008438818565401,
"grad_norm": 1.6733808517456055,
"learning_rate": 7.994848482959734e-06,
"loss": 1.0529744625091553,
"step": 190
},
{
"epoch": 0.4050632911392405,
"grad_norm": 6.742412567138672,
"learning_rate": 7.994390905628996e-06,
"loss": 1.2907187938690186,
"step": 192
},
{
"epoch": 0.4092827004219409,
"grad_norm": 5.04363489151001,
"learning_rate": 7.993913877578386e-06,
"loss": 1.062695026397705,
"step": 194
},
{
"epoch": 0.41350210970464135,
"grad_norm": 1.8267443180084229,
"learning_rate": 7.993417401389293e-06,
"loss": 1.2746732234954834,
"step": 196
},
{
"epoch": 0.4177215189873418,
"grad_norm": 2.2780373096466064,
"learning_rate": 7.99290147974836e-06,
"loss": 1.0202131271362305,
"step": 198
},
{
"epoch": 0.4219409282700422,
"grad_norm": 2.0691215991973877,
"learning_rate": 7.992366115447445e-06,
"loss": 1.1179842948913574,
"step": 200
},
{
"epoch": 0.42616033755274263,
"grad_norm": 4.217780590057373,
"learning_rate": 7.991811311383625e-06,
"loss": 1.1258949041366577,
"step": 202
},
{
"epoch": 0.43037974683544306,
"grad_norm": 2.5018441677093506,
"learning_rate": 7.991237070559173e-06,
"loss": 0.8922556638717651,
"step": 204
},
{
"epoch": 0.4345991561181435,
"grad_norm": 2.525747776031494,
"learning_rate": 7.990643396081536e-06,
"loss": 1.4427449703216553,
"step": 206
},
{
"epoch": 0.4388185654008439,
"grad_norm": 2.50166916847229,
"learning_rate": 7.990030291163336e-06,
"loss": 0.723818838596344,
"step": 208
},
{
"epoch": 0.4430379746835443,
"grad_norm": 1.4910839796066284,
"learning_rate": 7.98939775912233e-06,
"loss": 1.2674278020858765,
"step": 210
},
{
"epoch": 0.4472573839662447,
"grad_norm": 4.068655490875244,
"learning_rate": 7.98874580338141e-06,
"loss": 0.7890869379043579,
"step": 212
},
{
"epoch": 0.45147679324894513,
"grad_norm": 2.0325798988342285,
"learning_rate": 7.988074427468575e-06,
"loss": 1.1955333948135376,
"step": 214
},
{
"epoch": 0.45569620253164556,
"grad_norm": 2.587684154510498,
"learning_rate": 7.987383635016914e-06,
"loss": 1.2449276447296143,
"step": 216
},
{
"epoch": 0.459915611814346,
"grad_norm": 2.6565630435943604,
"learning_rate": 7.986673429764587e-06,
"loss": 1.294593334197998,
"step": 218
},
{
"epoch": 0.4641350210970464,
"grad_norm": 3.8166139125823975,
"learning_rate": 7.985943815554808e-06,
"loss": 1.2401716709136963,
"step": 220
},
{
"epoch": 0.46835443037974683,
"grad_norm": 5.548577785491943,
"learning_rate": 7.985194796335814e-06,
"loss": 1.1999175548553467,
"step": 222
},
{
"epoch": 0.47257383966244726,
"grad_norm": 2.5439255237579346,
"learning_rate": 7.98442637616086e-06,
"loss": 1.0728691816329956,
"step": 224
},
{
"epoch": 0.4767932489451477,
"grad_norm": 12.275174140930176,
"learning_rate": 7.983638559188175e-06,
"loss": 1.2714494466781616,
"step": 226
},
{
"epoch": 0.4810126582278481,
"grad_norm": 23.638010025024414,
"learning_rate": 7.982831349680965e-06,
"loss": 0.7866320610046387,
"step": 228
},
{
"epoch": 0.48523206751054854,
"grad_norm": 4.956063270568848,
"learning_rate": 7.982004752007367e-06,
"loss": 0.921814501285553,
"step": 230
},
{
"epoch": 0.48945147679324896,
"grad_norm": 3.645993232727051,
"learning_rate": 7.98115877064044e-06,
"loss": 0.8924152851104736,
"step": 232
},
{
"epoch": 0.4936708860759494,
"grad_norm": 2.041003704071045,
"learning_rate": 7.980293410158139e-06,
"loss": 1.2708659172058105,
"step": 234
},
{
"epoch": 0.4978902953586498,
"grad_norm": 4.042453289031982,
"learning_rate": 7.979408675243278e-06,
"loss": 1.3152391910552979,
"step": 236
},
{
"epoch": 0.5021097046413502,
"grad_norm": 4.269692420959473,
"learning_rate": 7.978504570683523e-06,
"loss": 0.980125367641449,
"step": 238
},
{
"epoch": 0.5063291139240507,
"grad_norm": 3.801175355911255,
"learning_rate": 7.977581101371354e-06,
"loss": 0.4545478820800781,
"step": 240
},
{
"epoch": 0.510548523206751,
"grad_norm": 4.7520060539245605,
"learning_rate": 7.97663827230404e-06,
"loss": 1.4193034172058105,
"step": 242
},
{
"epoch": 0.5147679324894515,
"grad_norm": 3.5392112731933594,
"learning_rate": 7.975676088583614e-06,
"loss": 0.8708986043930054,
"step": 244
},
{
"epoch": 0.5189873417721519,
"grad_norm": 2.672224283218384,
"learning_rate": 7.974694555416848e-06,
"loss": 1.4755961894989014,
"step": 246
},
{
"epoch": 0.5232067510548524,
"grad_norm": 1.7014986276626587,
"learning_rate": 7.973693678115218e-06,
"loss": 1.186201810836792,
"step": 248
},
{
"epoch": 0.5274261603375527,
"grad_norm": 1.939778208732605,
"learning_rate": 7.97267346209488e-06,
"loss": 1.3046202659606934,
"step": 250
},
{
"epoch": 0.5316455696202531,
"grad_norm": 9.32943344116211,
"learning_rate": 7.971633912876644e-06,
"loss": 1.1170387268066406,
"step": 252
},
{
"epoch": 0.5358649789029536,
"grad_norm": 1.9259498119354248,
"learning_rate": 7.97057503608593e-06,
"loss": 1.2260360717773438,
"step": 254
},
{
"epoch": 0.540084388185654,
"grad_norm": 2.0438666343688965,
"learning_rate": 7.969496837452762e-06,
"loss": 1.1931499242782593,
"step": 256
},
{
"epoch": 0.5443037974683544,
"grad_norm": 5.915661811828613,
"learning_rate": 7.968399322811707e-06,
"loss": 1.0251163244247437,
"step": 258
},
{
"epoch": 0.5485232067510548,
"grad_norm": 5.656018257141113,
"learning_rate": 7.967282498101866e-06,
"loss": 0.9710787534713745,
"step": 260
},
{
"epoch": 0.5527426160337553,
"grad_norm": 1.6091376543045044,
"learning_rate": 7.966146369366839e-06,
"loss": 1.2647578716278076,
"step": 262
},
{
"epoch": 0.5569620253164557,
"grad_norm": 2.3578081130981445,
"learning_rate": 7.96499094275468e-06,
"loss": 1.2789413928985596,
"step": 264
},
{
"epoch": 0.5611814345991561,
"grad_norm": 2.2791264057159424,
"learning_rate": 7.963816224517875e-06,
"loss": 1.2268846035003662,
"step": 266
},
{
"epoch": 0.5654008438818565,
"grad_norm": 4.264853000640869,
"learning_rate": 7.962622221013308e-06,
"loss": 1.4937443733215332,
"step": 268
},
{
"epoch": 0.569620253164557,
"grad_norm": 3.884678602218628,
"learning_rate": 7.961408938702217e-06,
"loss": 1.050868034362793,
"step": 270
},
{
"epoch": 0.5738396624472574,
"grad_norm": 3.3875067234039307,
"learning_rate": 7.96017638415017e-06,
"loss": 1.3021985292434692,
"step": 272
},
{
"epoch": 0.5780590717299579,
"grad_norm": 2.571835994720459,
"learning_rate": 7.958924564027025e-06,
"loss": 1.1042567491531372,
"step": 274
},
{
"epoch": 0.5822784810126582,
"grad_norm": 2.283672571182251,
"learning_rate": 7.957653485106894e-06,
"loss": 1.2543787956237793,
"step": 276
},
{
"epoch": 0.5864978902953587,
"grad_norm": 3.0379388332366943,
"learning_rate": 7.956363154268103e-06,
"loss": 1.393994688987732,
"step": 278
},
{
"epoch": 0.5907172995780591,
"grad_norm": 4.789394378662109,
"learning_rate": 7.95505357849316e-06,
"loss": 0.9629275798797607,
"step": 280
},
{
"epoch": 0.5949367088607594,
"grad_norm": 7.168961524963379,
"learning_rate": 7.953724764868716e-06,
"loss": 1.330991268157959,
"step": 282
},
{
"epoch": 0.5991561181434599,
"grad_norm": 2.166527271270752,
"learning_rate": 7.952376720585524e-06,
"loss": 1.54081130027771,
"step": 284
},
{
"epoch": 0.6033755274261603,
"grad_norm": 0.7811316847801208,
"learning_rate": 7.951009452938407e-06,
"loss": 1.1209747791290283,
"step": 286
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.7322338819503784,
"learning_rate": 7.949622969326205e-06,
"loss": 1.248884916305542,
"step": 288
},
{
"epoch": 0.6118143459915611,
"grad_norm": 31.41063690185547,
"learning_rate": 7.94821727725175e-06,
"loss": 0.8109699487686157,
"step": 290
},
{
"epoch": 0.6160337552742616,
"grad_norm": 2.779398202896118,
"learning_rate": 7.946792384321818e-06,
"loss": 0.6689173579216003,
"step": 292
},
{
"epoch": 0.620253164556962,
"grad_norm": 2.087568521499634,
"learning_rate": 7.945348298247087e-06,
"loss": 1.2270828485488892,
"step": 294
},
{
"epoch": 0.6244725738396625,
"grad_norm": 5.774115085601807,
"learning_rate": 7.943885026842097e-06,
"loss": 0.7052218317985535,
"step": 296
},
{
"epoch": 0.6286919831223629,
"grad_norm": 8.402689933776855,
"learning_rate": 7.94240257802521e-06,
"loss": 0.6502029895782471,
"step": 298
},
{
"epoch": 0.6329113924050633,
"grad_norm": 6.8001556396484375,
"learning_rate": 7.94090095981856e-06,
"loss": 1.7823140621185303,
"step": 300
},
{
"epoch": 0.6371308016877637,
"grad_norm": 1.8385719060897827,
"learning_rate": 7.939380180348018e-06,
"loss": 1.2579293251037598,
"step": 302
},
{
"epoch": 0.6413502109704642,
"grad_norm": 3.531500816345215,
"learning_rate": 7.937840247843148e-06,
"loss": 0.8740752339363098,
"step": 304
},
{
"epoch": 0.6455696202531646,
"grad_norm": 4.576239109039307,
"learning_rate": 7.93628117063715e-06,
"loss": 1.0002870559692383,
"step": 306
},
{
"epoch": 0.6497890295358649,
"grad_norm": 1.4867647886276245,
"learning_rate": 7.934702957166833e-06,
"loss": 1.2564589977264404,
"step": 308
},
{
"epoch": 0.6540084388185654,
"grad_norm": 2.41863751411438,
"learning_rate": 7.933105615972553e-06,
"loss": 1.1982673406600952,
"step": 310
},
{
"epoch": 0.6582278481012658,
"grad_norm": 8.212380409240723,
"learning_rate": 7.931489155698178e-06,
"loss": 0.9985597729682922,
"step": 312
},
{
"epoch": 0.6624472573839663,
"grad_norm": 1.5422508716583252,
"learning_rate": 7.929853585091034e-06,
"loss": 1.2045118808746338,
"step": 314
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.346829652786255,
"learning_rate": 7.928198913001865e-06,
"loss": 1.0920261144638062,
"step": 316
},
{
"epoch": 0.6708860759493671,
"grad_norm": 2.013681173324585,
"learning_rate": 7.926525148384776e-06,
"loss": 1.0814929008483887,
"step": 318
},
{
"epoch": 0.6751054852320675,
"grad_norm": 6.084042072296143,
"learning_rate": 7.924832300297197e-06,
"loss": 1.0774112939834595,
"step": 320
},
{
"epoch": 0.679324894514768,
"grad_norm": 2.2710366249084473,
"learning_rate": 7.923120377899818e-06,
"loss": 0.872334897518158,
"step": 322
},
{
"epoch": 0.6835443037974683,
"grad_norm": 4.1272969245910645,
"learning_rate": 7.921389390456549e-06,
"loss": 0.866448163986206,
"step": 324
},
{
"epoch": 0.6877637130801688,
"grad_norm": 1.4605779647827148,
"learning_rate": 7.919639347334477e-06,
"loss": 0.8561316132545471,
"step": 326
},
{
"epoch": 0.6919831223628692,
"grad_norm": 1.4929612874984741,
"learning_rate": 7.917870258003798e-06,
"loss": 0.8531728982925415,
"step": 328
},
{
"epoch": 0.6962025316455697,
"grad_norm": 3.4534809589385986,
"learning_rate": 7.916082132037782e-06,
"loss": 1.5728954076766968,
"step": 330
},
{
"epoch": 0.70042194092827,
"grad_norm": 6.895410537719727,
"learning_rate": 7.914274979112704e-06,
"loss": 1.0785008668899536,
"step": 332
},
{
"epoch": 0.7046413502109705,
"grad_norm": 2.275595188140869,
"learning_rate": 7.912448809007812e-06,
"loss": 1.3434700965881348,
"step": 334
},
{
"epoch": 0.7088607594936709,
"grad_norm": 2.5494587421417236,
"learning_rate": 7.910603631605259e-06,
"loss": 1.2570360898971558,
"step": 336
},
{
"epoch": 0.7130801687763713,
"grad_norm": 1.906052589416504,
"learning_rate": 7.908739456890056e-06,
"loss": 1.2807261943817139,
"step": 338
},
{
"epoch": 0.7172995780590717,
"grad_norm": 8.606728553771973,
"learning_rate": 7.906856294950012e-06,
"loss": 1.254488468170166,
"step": 340
},
{
"epoch": 0.7215189873417721,
"grad_norm": 1.998100757598877,
"learning_rate": 7.90495415597569e-06,
"loss": 1.249230146408081,
"step": 342
},
{
"epoch": 0.7257383966244726,
"grad_norm": 1.9510746002197266,
"learning_rate": 7.90303305026034e-06,
"loss": 1.07149076461792,
"step": 344
},
{
"epoch": 0.729957805907173,
"grad_norm": 4.211492538452148,
"learning_rate": 7.901092988199852e-06,
"loss": 0.8842002153396606,
"step": 346
},
{
"epoch": 0.7341772151898734,
"grad_norm": 2.758723497390747,
"learning_rate": 7.899133980292698e-06,
"loss": 1.2383522987365723,
"step": 348
},
{
"epoch": 0.7383966244725738,
"grad_norm": 10.058371543884277,
"learning_rate": 7.897156037139865e-06,
"loss": 1.1752148866653442,
"step": 350
},
{
"epoch": 0.7426160337552743,
"grad_norm": 13.896164894104004,
"learning_rate": 7.89515916944482e-06,
"loss": 1.0859854221343994,
"step": 352
},
{
"epoch": 0.7468354430379747,
"grad_norm": 4.250057697296143,
"learning_rate": 7.893143388013425e-06,
"loss": 0.6642742156982422,
"step": 354
},
{
"epoch": 0.7510548523206751,
"grad_norm": 9.338140487670898,
"learning_rate": 7.891108703753902e-06,
"loss": 1.047743320465088,
"step": 356
},
{
"epoch": 0.7552742616033755,
"grad_norm": 3.788804054260254,
"learning_rate": 7.88905512767676e-06,
"loss": 1.0082635879516602,
"step": 358
},
{
"epoch": 0.759493670886076,
"grad_norm": 2.8348827362060547,
"learning_rate": 7.886982670894736e-06,
"loss": 1.210444688796997,
"step": 360
},
{
"epoch": 0.7637130801687764,
"grad_norm": 4.0919718742370605,
"learning_rate": 7.884891344622746e-06,
"loss": 0.98717200756073,
"step": 362
},
{
"epoch": 0.7679324894514767,
"grad_norm": 2.804220676422119,
"learning_rate": 7.88278116017781e-06,
"loss": 1.5726983547210693,
"step": 364
},
{
"epoch": 0.7721518987341772,
"grad_norm": 11.296149253845215,
"learning_rate": 7.880652128978999e-06,
"loss": 0.7079776525497437,
"step": 366
},
{
"epoch": 0.7763713080168776,
"grad_norm": 2.3584940433502197,
"learning_rate": 7.878504262547373e-06,
"loss": 1.1683130264282227,
"step": 368
},
{
"epoch": 0.7805907172995781,
"grad_norm": 4.991513252258301,
"learning_rate": 7.876337572505914e-06,
"loss": 1.472283124923706,
"step": 370
},
{
"epoch": 0.7848101265822784,
"grad_norm": 7.023402214050293,
"learning_rate": 7.87415207057947e-06,
"loss": 0.9845293760299683,
"step": 372
},
{
"epoch": 0.7890295358649789,
"grad_norm": 5.190583229064941,
"learning_rate": 7.871947768594688e-06,
"loss": 1.0484483242034912,
"step": 374
},
{
"epoch": 0.7932489451476793,
"grad_norm": 2.0012083053588867,
"learning_rate": 7.869724678479944e-06,
"loss": 1.152682900428772,
"step": 376
},
{
"epoch": 0.7974683544303798,
"grad_norm": 0.8431169986724854,
"learning_rate": 7.86748281226529e-06,
"loss": 1.119347333908081,
"step": 378
},
{
"epoch": 0.8016877637130801,
"grad_norm": 1.5772409439086914,
"learning_rate": 7.865222182082384e-06,
"loss": 0.7247622013092041,
"step": 380
},
{
"epoch": 0.8059071729957806,
"grad_norm": 4.462345123291016,
"learning_rate": 7.862942800164416e-06,
"loss": 1.2693476676940918,
"step": 382
},
{
"epoch": 0.810126582278481,
"grad_norm": 18.4215145111084,
"learning_rate": 7.860644678846057e-06,
"loss": 1.0132197141647339,
"step": 384
},
{
"epoch": 0.8143459915611815,
"grad_norm": 2.6663124561309814,
"learning_rate": 7.858327830563384e-06,
"loss": 1.2443773746490479,
"step": 386
},
{
"epoch": 0.8185654008438819,
"grad_norm": 7.3652448654174805,
"learning_rate": 7.855992267853806e-06,
"loss": 0.9400072693824768,
"step": 388
},
{
"epoch": 0.8227848101265823,
"grad_norm": 33.57978820800781,
"learning_rate": 7.85363800335601e-06,
"loss": 1.216599941253662,
"step": 390
},
{
"epoch": 0.8270042194092827,
"grad_norm": 4.298940181732178,
"learning_rate": 7.851265049809886e-06,
"loss": 0.9645065069198608,
"step": 392
},
{
"epoch": 0.8312236286919831,
"grad_norm": 3.8016936779022217,
"learning_rate": 7.848873420056456e-06,
"loss": 0.9074147939682007,
"step": 394
},
{
"epoch": 0.8354430379746836,
"grad_norm": 2.417340040206909,
"learning_rate": 7.846463127037807e-06,
"loss": 1.2312746047973633,
"step": 396
},
{
"epoch": 0.8396624472573839,
"grad_norm": 0.5479583144187927,
"learning_rate": 7.844034183797021e-06,
"loss": 1.0866131782531738,
"step": 398
},
{
"epoch": 0.8438818565400844,
"grad_norm": 1.3444585800170898,
"learning_rate": 7.841586603478105e-06,
"loss": 0.9458938837051392,
"step": 400
},
{
"epoch": 0.8481012658227848,
"grad_norm": 5.396076679229736,
"learning_rate": 7.839120399325913e-06,
"loss": 0.7105859518051147,
"step": 402
},
{
"epoch": 0.8523206751054853,
"grad_norm": 3.8646159172058105,
"learning_rate": 7.836635584686089e-06,
"loss": 1.21824049949646,
"step": 404
},
{
"epoch": 0.8565400843881856,
"grad_norm": 1.9759020805358887,
"learning_rate": 7.834132173004981e-06,
"loss": 0.9442010521888733,
"step": 406
},
{
"epoch": 0.8607594936708861,
"grad_norm": 2.555657148361206,
"learning_rate": 7.831610177829574e-06,
"loss": 0.9720205664634705,
"step": 408
},
{
"epoch": 0.8649789029535865,
"grad_norm": 15.906916618347168,
"learning_rate": 7.829069612807413e-06,
"loss": 1.23225998878479,
"step": 410
},
{
"epoch": 0.869198312236287,
"grad_norm": 2.261281967163086,
"learning_rate": 7.826510491686538e-06,
"loss": 0.8622678518295288,
"step": 412
},
{
"epoch": 0.8734177215189873,
"grad_norm": 30.68505859375,
"learning_rate": 7.823932828315398e-06,
"loss": 1.210330605506897,
"step": 414
},
{
"epoch": 0.8776371308016878,
"grad_norm": 4.517366409301758,
"learning_rate": 7.82133663664279e-06,
"loss": 1.0655572414398193,
"step": 416
},
{
"epoch": 0.8818565400843882,
"grad_norm": 8.33056926727295,
"learning_rate": 7.81872193071776e-06,
"loss": 1.163268804550171,
"step": 418
},
{
"epoch": 0.8860759493670886,
"grad_norm": 1.1660995483398438,
"learning_rate": 7.81608872468956e-06,
"loss": 0.9473620653152466,
"step": 420
},
{
"epoch": 0.890295358649789,
"grad_norm": 1.9505749940872192,
"learning_rate": 7.813437032807541e-06,
"loss": 1.3407762050628662,
"step": 422
},
{
"epoch": 0.8945147679324894,
"grad_norm": 13.3614501953125,
"learning_rate": 7.810766869421092e-06,
"loss": 0.9824624061584473,
"step": 424
},
{
"epoch": 0.8987341772151899,
"grad_norm": 4.366639137268066,
"learning_rate": 7.808078248979564e-06,
"loss": 1.1881823539733887,
"step": 426
},
{
"epoch": 0.9029535864978903,
"grad_norm": 1.7050063610076904,
"learning_rate": 7.805371186032176e-06,
"loss": 1.0488433837890625,
"step": 428
},
{
"epoch": 0.9071729957805907,
"grad_norm": 7.406988143920898,
"learning_rate": 7.80264569522796e-06,
"loss": 1.0144481658935547,
"step": 430
},
{
"epoch": 0.9113924050632911,
"grad_norm": 3.4340415000915527,
"learning_rate": 7.799901791315658e-06,
"loss": 1.0802500247955322,
"step": 432
},
{
"epoch": 0.9156118143459916,
"grad_norm": 3.148068428039551,
"learning_rate": 7.797139489143655e-06,
"loss": 1.2489020824432373,
"step": 434
},
{
"epoch": 0.919831223628692,
"grad_norm": 4.874792098999023,
"learning_rate": 7.794358803659903e-06,
"loss": 0.8715201616287231,
"step": 436
},
{
"epoch": 0.9240506329113924,
"grad_norm": 1.5414555072784424,
"learning_rate": 7.791559749911826e-06,
"loss": 1.2029755115509033,
"step": 438
},
{
"epoch": 0.9282700421940928,
"grad_norm": 1.9815410375595093,
"learning_rate": 7.788742343046248e-06,
"loss": 0.9946187138557434,
"step": 440
},
{
"epoch": 0.9324894514767933,
"grad_norm": 10.352864265441895,
"learning_rate": 7.785906598309314e-06,
"loss": 0.9312165975570679,
"step": 442
},
{
"epoch": 0.9367088607594937,
"grad_norm": 8.163040161132812,
"learning_rate": 7.783052531046397e-06,
"loss": 1.0982768535614014,
"step": 444
},
{
"epoch": 0.9409282700421941,
"grad_norm": 6.510847091674805,
"learning_rate": 7.780180156702023e-06,
"loss": 1.3022956848144531,
"step": 446
},
{
"epoch": 0.9451476793248945,
"grad_norm": 1.658566951751709,
"learning_rate": 7.777289490819783e-06,
"loss": 1.0020906925201416,
"step": 448
},
{
"epoch": 0.9493670886075949,
"grad_norm": 3.4121336936950684,
"learning_rate": 7.774380549042255e-06,
"loss": 0.9293044209480286,
"step": 450
},
{
"epoch": 0.9535864978902954,
"grad_norm": 1.8673920631408691,
"learning_rate": 7.771453347110913e-06,
"loss": 1.2867658138275146,
"step": 452
},
{
"epoch": 0.9578059071729957,
"grad_norm": 1.841113805770874,
"learning_rate": 7.768507900866044e-06,
"loss": 0.982481062412262,
"step": 454
},
{
"epoch": 0.9620253164556962,
"grad_norm": 4.477797985076904,
"learning_rate": 7.765544226246663e-06,
"loss": 1.1560728549957275,
"step": 456
},
{
"epoch": 0.9662447257383966,
"grad_norm": 2.086230993270874,
"learning_rate": 7.762562339290425e-06,
"loss": 0.8860993981361389,
"step": 458
},
{
"epoch": 0.9704641350210971,
"grad_norm": 27.607803344726562,
"learning_rate": 7.759562256133541e-06,
"loss": 0.7165157794952393,
"step": 460
},
{
"epoch": 0.9746835443037974,
"grad_norm": 9.253504753112793,
"learning_rate": 7.75654399301069e-06,
"loss": 0.9001080989837646,
"step": 462
},
{
"epoch": 0.9789029535864979,
"grad_norm": 1.412550449371338,
"learning_rate": 7.753507566254927e-06,
"loss": 1.168654441833496,
"step": 464
},
{
"epoch": 0.9831223628691983,
"grad_norm": 4.56351900100708,
"learning_rate": 7.750452992297599e-06,
"loss": 0.7248488664627075,
"step": 466
},
{
"epoch": 0.9873417721518988,
"grad_norm": 2.234978199005127,
"learning_rate": 7.747380287668257e-06,
"loss": 1.3152525424957275,
"step": 468
},
{
"epoch": 0.9915611814345991,
"grad_norm": 7.138265609741211,
"learning_rate": 7.744289468994562e-06,
"loss": 0.8874726891517639,
"step": 470
},
{
"epoch": 0.9957805907172996,
"grad_norm": 3.042675256729126,
"learning_rate": 7.741180553002199e-06,
"loss": 1.2144908905029297,
"step": 472
},
{
"epoch": 1.0,
"grad_norm": 1.7988324165344238,
"learning_rate": 7.738053556514784e-06,
"loss": 1.2585757970809937,
"step": 474
},
{
"epoch": 1.0042194092827004,
"grad_norm": 6.92104434967041,
"learning_rate": 7.734908496453774e-06,
"loss": 1.060208797454834,
"step": 476
},
{
"epoch": 1.0084388185654007,
"grad_norm": 1.8207498788833618,
"learning_rate": 7.73174538983838e-06,
"loss": 1.1424498558044434,
"step": 478
},
{
"epoch": 1.0126582278481013,
"grad_norm": 2.2936971187591553,
"learning_rate": 7.72856425378546e-06,
"loss": 1.1568377017974854,
"step": 480
},
{
"epoch": 1.0168776371308017,
"grad_norm": 48.475311279296875,
"learning_rate": 7.725365105509444e-06,
"loss": 0.8294604420661926,
"step": 482
},
{
"epoch": 1.021097046413502,
"grad_norm": 4.676328659057617,
"learning_rate": 7.722147962322236e-06,
"loss": 1.0818572044372559,
"step": 484
},
{
"epoch": 1.0253164556962024,
"grad_norm": 5.833449840545654,
"learning_rate": 7.718912841633112e-06,
"loss": 0.5055439472198486,
"step": 486
},
{
"epoch": 1.029535864978903,
"grad_norm": 4.804643630981445,
"learning_rate": 7.715659760948632e-06,
"loss": 0.9713239073753357,
"step": 488
},
{
"epoch": 1.0337552742616034,
"grad_norm": 3.2628095149993896,
"learning_rate": 7.71238873787255e-06,
"loss": 0.8403753042221069,
"step": 490
},
{
"epoch": 1.0379746835443038,
"grad_norm": 2.2691550254821777,
"learning_rate": 7.709099790105707e-06,
"loss": 1.1320157051086426,
"step": 492
},
{
"epoch": 1.0421940928270041,
"grad_norm": 19.545082092285156,
"learning_rate": 7.705792935445948e-06,
"loss": 0.8306432962417603,
"step": 494
},
{
"epoch": 1.0464135021097047,
"grad_norm": 4.1693949699401855,
"learning_rate": 7.702468191788014e-06,
"loss": 0.9293802976608276,
"step": 496
},
{
"epoch": 1.0506329113924051,
"grad_norm": 2.5364606380462646,
"learning_rate": 7.699125577123455e-06,
"loss": 1.2761287689208984,
"step": 498
},
{
"epoch": 1.0548523206751055,
"grad_norm": 11.509333610534668,
"learning_rate": 7.695765109540526e-06,
"loss": 1.0367153882980347,
"step": 500
},
{
"epoch": 1.0590717299578059,
"grad_norm": 2.0936784744262695,
"learning_rate": 7.692386807224092e-06,
"loss": 1.1410118341445923,
"step": 502
},
{
"epoch": 1.0632911392405062,
"grad_norm": 1.661012053489685,
"learning_rate": 7.68899068845553e-06,
"loss": 1.2624824047088623,
"step": 504
},
{
"epoch": 1.0675105485232068,
"grad_norm": 3.211272716522217,
"learning_rate": 7.685576771612624e-06,
"loss": 0.727929413318634,
"step": 506
},
{
"epoch": 1.0717299578059072,
"grad_norm": 2.976832151412964,
"learning_rate": 7.682145075169482e-06,
"loss": 0.9856585264205933,
"step": 508
},
{
"epoch": 1.0759493670886076,
"grad_norm": 1.821731448173523,
"learning_rate": 7.678695617696413e-06,
"loss": 1.0898807048797607,
"step": 510
},
{
"epoch": 1.080168776371308,
"grad_norm": 3.3283071517944336,
"learning_rate": 7.675228417859842e-06,
"loss": 1.0197210311889648,
"step": 512
},
{
"epoch": 1.0843881856540085,
"grad_norm": 2.766814708709717,
"learning_rate": 7.67174349442221e-06,
"loss": 1.07771897315979,
"step": 514
},
{
"epoch": 1.0886075949367089,
"grad_norm": 2.035024881362915,
"learning_rate": 7.66824086624186e-06,
"loss": 1.0527117252349854,
"step": 516
},
{
"epoch": 1.0928270042194093,
"grad_norm": 2.086318254470825,
"learning_rate": 7.664720552272948e-06,
"loss": 0.9818480014801025,
"step": 518
},
{
"epoch": 1.0970464135021096,
"grad_norm": 2.671875,
"learning_rate": 7.661182571565332e-06,
"loss": 0.9276726245880127,
"step": 520
},
{
"epoch": 1.1012658227848102,
"grad_norm": 8.45070743560791,
"learning_rate": 7.657626943264474e-06,
"loss": 0.8790248036384583,
"step": 522
},
{
"epoch": 1.1054852320675106,
"grad_norm": 2.1058099269866943,
"learning_rate": 7.654053686611334e-06,
"loss": 1.0137805938720703,
"step": 524
},
{
"epoch": 1.109704641350211,
"grad_norm": 8.899941444396973,
"learning_rate": 7.650462820942264e-06,
"loss": 0.8656354546546936,
"step": 526
},
{
"epoch": 1.1139240506329113,
"grad_norm": 2.253026247024536,
"learning_rate": 7.64685436568891e-06,
"loss": 1.2973110675811768,
"step": 528
},
{
"epoch": 1.1181434599156117,
"grad_norm": 1.9157750606536865,
"learning_rate": 7.643228340378098e-06,
"loss": 1.208802342414856,
"step": 530
},
{
"epoch": 1.1223628691983123,
"grad_norm": 3.9860875606536865,
"learning_rate": 7.639584764631736e-06,
"loss": 0.6784745454788208,
"step": 532
},
{
"epoch": 1.1265822784810127,
"grad_norm": 4.096652984619141,
"learning_rate": 7.6359236581667e-06,
"loss": 0.969964861869812,
"step": 534
},
{
"epoch": 1.130801687763713,
"grad_norm": 2.2989354133605957,
"learning_rate": 7.632245040794737e-06,
"loss": 1.0640895366668701,
"step": 536
},
{
"epoch": 1.1350210970464134,
"grad_norm": 2.0458743572235107,
"learning_rate": 7.6285489324223534e-06,
"loss": 0.8687632083892822,
"step": 538
},
{
"epoch": 1.139240506329114,
"grad_norm": 3.1451354026794434,
"learning_rate": 7.6248353530507e-06,
"loss": 0.8472435474395752,
"step": 540
},
{
"epoch": 1.1434599156118144,
"grad_norm": 1.7947264909744263,
"learning_rate": 7.621104322775477e-06,
"loss": 0.7688232660293579,
"step": 542
},
{
"epoch": 1.1476793248945147,
"grad_norm": 1.6323504447937012,
"learning_rate": 7.617355861786813e-06,
"loss": 0.7883434891700745,
"step": 544
},
{
"epoch": 1.1518987341772151,
"grad_norm": 3.807711601257324,
"learning_rate": 7.613589990369167e-06,
"loss": 1.1249892711639404,
"step": 546
},
{
"epoch": 1.1561181434599157,
"grad_norm": 3.2176730632781982,
"learning_rate": 7.6098067289012086e-06,
"loss": 1.1086885929107666,
"step": 548
},
{
"epoch": 1.160337552742616,
"grad_norm": 8.372694969177246,
"learning_rate": 7.606006097855713e-06,
"loss": 0.8941718339920044,
"step": 550
},
{
"epoch": 1.1645569620253164,
"grad_norm": 3.027027130126953,
"learning_rate": 7.602188117799451e-06,
"loss": 1.2869350910186768,
"step": 552
},
{
"epoch": 1.1687763713080168,
"grad_norm": 3.2669458389282227,
"learning_rate": 7.598352809393074e-06,
"loss": 1.1333280801773071,
"step": 554
},
{
"epoch": 1.1729957805907172,
"grad_norm": 30.656925201416016,
"learning_rate": 7.594500193391006e-06,
"loss": 0.6378011703491211,
"step": 556
},
{
"epoch": 1.1772151898734178,
"grad_norm": 4.539504528045654,
"learning_rate": 7.590630290641327e-06,
"loss": 1.1045113801956177,
"step": 558
},
{
"epoch": 1.1814345991561181,
"grad_norm": 26.276714324951172,
"learning_rate": 7.586743122085666e-06,
"loss": 0.5744074583053589,
"step": 560
},
{
"epoch": 1.1856540084388185,
"grad_norm": 1.9147850275039673,
"learning_rate": 7.582838708759082e-06,
"loss": 0.7490895986557007,
"step": 562
},
{
"epoch": 1.189873417721519,
"grad_norm": 3.1463582515716553,
"learning_rate": 7.5789170717899516e-06,
"loss": 1.3113162517547607,
"step": 564
},
{
"epoch": 1.1940928270042195,
"grad_norm": 2.1219849586486816,
"learning_rate": 7.57497823239986e-06,
"loss": 1.0440800189971924,
"step": 566
},
{
"epoch": 1.1983122362869199,
"grad_norm": 4.773017406463623,
"learning_rate": 7.571022211903475e-06,
"loss": 0.8006106615066528,
"step": 568
},
{
"epoch": 1.2025316455696202,
"grad_norm": 1.4802496433258057,
"learning_rate": 7.567049031708445e-06,
"loss": 1.0503010749816895,
"step": 570
},
{
"epoch": 1.2067510548523206,
"grad_norm": 6.079049587249756,
"learning_rate": 7.563058713315273e-06,
"loss": 1.0806069374084473,
"step": 572
},
{
"epoch": 1.2109704641350212,
"grad_norm": 2.4842324256896973,
"learning_rate": 7.559051278317204e-06,
"loss": 1.1085004806518555,
"step": 574
},
{
"epoch": 1.2151898734177216,
"grad_norm": 1.6897258758544922,
"learning_rate": 7.5550267484001084e-06,
"loss": 0.9017348289489746,
"step": 576
},
{
"epoch": 1.219409282700422,
"grad_norm": 3.8387186527252197,
"learning_rate": 7.5509851453423665e-06,
"loss": 0.6278250813484192,
"step": 578
},
{
"epoch": 1.2236286919831223,
"grad_norm": 4.397797107696533,
"learning_rate": 7.546926491014742e-06,
"loss": 0.9347223043441772,
"step": 580
},
{
"epoch": 1.2278481012658227,
"grad_norm": 6.560842514038086,
"learning_rate": 7.5428508073802765e-06,
"loss": 0.8352513313293457,
"step": 582
},
{
"epoch": 1.2320675105485233,
"grad_norm": 2.411914825439453,
"learning_rate": 7.538758116494163e-06,
"loss": 0.8624718189239502,
"step": 584
},
{
"epoch": 1.2362869198312236,
"grad_norm": 13.554781913757324,
"learning_rate": 7.534648440503624e-06,
"loss": 0.9081147909164429,
"step": 586
},
{
"epoch": 1.240506329113924,
"grad_norm": 4.150012493133545,
"learning_rate": 7.530521801647799e-06,
"loss": 0.8333830833435059,
"step": 588
},
{
"epoch": 1.2447257383966246,
"grad_norm": 2.9597084522247314,
"learning_rate": 7.52637822225762e-06,
"loss": 1.1118628978729248,
"step": 590
},
{
"epoch": 1.248945147679325,
"grad_norm": 1.8890756368637085,
"learning_rate": 7.522217724755688e-06,
"loss": 0.58323734998703,
"step": 592
},
{
"epoch": 1.2531645569620253,
"grad_norm": 1.4861680269241333,
"learning_rate": 7.51804033165616e-06,
"loss": 0.8740505576133728,
"step": 594
},
{
"epoch": 1.2573839662447257,
"grad_norm": 7.2344560623168945,
"learning_rate": 7.513846065564618e-06,
"loss": 0.7560818195343018,
"step": 596
},
{
"epoch": 1.261603375527426,
"grad_norm": 5.018416404724121,
"learning_rate": 7.509634949177952e-06,
"loss": 0.664783239364624,
"step": 598
},
{
"epoch": 1.2658227848101267,
"grad_norm": 1.6520812511444092,
"learning_rate": 7.505407005284236e-06,
"loss": 0.9736641645431519,
"step": 600
},
{
"epoch": 1.270042194092827,
"grad_norm": 6.091807842254639,
"learning_rate": 7.5011622567626055e-06,
"loss": 1.3401249647140503,
"step": 602
},
{
"epoch": 1.2742616033755274,
"grad_norm": 6.131025791168213,
"learning_rate": 7.4969007265831284e-06,
"loss": 1.0688127279281616,
"step": 604
},
{
"epoch": 1.2784810126582278,
"grad_norm": 5.798713207244873,
"learning_rate": 7.4926224378066905e-06,
"loss": 0.7948801517486572,
"step": 606
},
{
"epoch": 1.2827004219409281,
"grad_norm": 10.305594444274902,
"learning_rate": 7.488327413584863e-06,
"loss": 0.8427482843399048,
"step": 608
},
{
"epoch": 1.2869198312236287,
"grad_norm": 8.171891212463379,
"learning_rate": 7.484015677159779e-06,
"loss": 0.9117364883422852,
"step": 610
},
{
"epoch": 1.2911392405063291,
"grad_norm": 5.7218804359436035,
"learning_rate": 7.479687251864008e-06,
"loss": 0.6430226564407349,
"step": 612
},
{
"epoch": 1.2953586497890295,
"grad_norm": 1.4333503246307373,
"learning_rate": 7.47534216112043e-06,
"loss": 0.6988530158996582,
"step": 614
},
{
"epoch": 1.29957805907173,
"grad_norm": 2.1469638347625732,
"learning_rate": 7.4709804284421096e-06,
"loss": 1.0747710466384888,
"step": 616
},
{
"epoch": 1.3037974683544304,
"grad_norm": 2.6359219551086426,
"learning_rate": 7.466602077432167e-06,
"loss": 1.0839927196502686,
"step": 618
},
{
"epoch": 1.3080168776371308,
"grad_norm": 2.7480428218841553,
"learning_rate": 7.4622071317836495e-06,
"loss": 0.6828069090843201,
"step": 620
},
{
"epoch": 1.3122362869198312,
"grad_norm": 3.1572508811950684,
"learning_rate": 7.45779561527941e-06,
"loss": 0.7725319862365723,
"step": 622
},
{
"epoch": 1.3164556962025316,
"grad_norm": 4.627418518066406,
"learning_rate": 7.453367551791965e-06,
"loss": 0.8618891835212708,
"step": 624
},
{
"epoch": 1.3206751054852321,
"grad_norm": 13.366847038269043,
"learning_rate": 7.448922965283379e-06,
"loss": 1.0350444316864014,
"step": 626
},
{
"epoch": 1.3248945147679325,
"grad_norm": 9.86931324005127,
"learning_rate": 7.44446187980513e-06,
"loss": 1.5988271236419678,
"step": 628
},
{
"epoch": 1.3291139240506329,
"grad_norm": 2.421264171600342,
"learning_rate": 7.439984319497975e-06,
"loss": 0.8888686895370483,
"step": 630
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2.454784870147705,
"learning_rate": 7.435490308591826e-06,
"loss": 0.9130518436431885,
"step": 632
},
{
"epoch": 1.3375527426160336,
"grad_norm": 1.1312861442565918,
"learning_rate": 7.4309798714056145e-06,
"loss": 0.7504403591156006,
"step": 634
},
{
"epoch": 1.3417721518987342,
"grad_norm": 6.462110996246338,
"learning_rate": 7.4264530323471605e-06,
"loss": 0.6684986352920532,
"step": 636
},
{
"epoch": 1.3459915611814346,
"grad_norm": 34.968597412109375,
"learning_rate": 7.421909815913044e-06,
"loss": 0.7958526611328125,
"step": 638
},
{
"epoch": 1.350210970464135,
"grad_norm": 2.127056121826172,
"learning_rate": 7.4173502466884655e-06,
"loss": 1.2176686525344849,
"step": 640
},
{
"epoch": 1.3544303797468356,
"grad_norm": 2.8063347339630127,
"learning_rate": 7.412774349347123e-06,
"loss": 0.781902015209198,
"step": 642
},
{
"epoch": 1.358649789029536,
"grad_norm": 2.1645452976226807,
"learning_rate": 7.408182148651068e-06,
"loss": 1.2542736530303955,
"step": 644
},
{
"epoch": 1.3628691983122363,
"grad_norm": 2.789949417114258,
"learning_rate": 7.4035736694505765e-06,
"loss": 1.1398190259933472,
"step": 646
},
{
"epoch": 1.3670886075949367,
"grad_norm": 3.9663474559783936,
"learning_rate": 7.398948936684016e-06,
"loss": 0.8999311923980713,
"step": 648
},
{
"epoch": 1.371308016877637,
"grad_norm": 3.7580132484436035,
"learning_rate": 7.394307975377705e-06,
"loss": 1.3543846607208252,
"step": 650
},
{
"epoch": 1.3755274261603376,
"grad_norm": 2.3560047149658203,
"learning_rate": 7.389650810645788e-06,
"loss": 1.067474365234375,
"step": 652
},
{
"epoch": 1.379746835443038,
"grad_norm": 8.556138038635254,
"learning_rate": 7.384977467690088e-06,
"loss": 0.7700819373130798,
"step": 654
},
{
"epoch": 1.3839662447257384,
"grad_norm": 2.5057106018066406,
"learning_rate": 7.380287971799974e-06,
"loss": 1.1854264736175537,
"step": 656
},
{
"epoch": 1.3881856540084387,
"grad_norm": 3.844391107559204,
"learning_rate": 7.37558234835223e-06,
"loss": 0.6306626796722412,
"step": 658
},
{
"epoch": 1.3924050632911391,
"grad_norm": 5.850677013397217,
"learning_rate": 7.370860622810906e-06,
"loss": 1.1226918697357178,
"step": 660
},
{
"epoch": 1.3966244725738397,
"grad_norm": 3.9669106006622314,
"learning_rate": 7.3661228207271954e-06,
"loss": 0.7194290161132812,
"step": 662
},
{
"epoch": 1.40084388185654,
"grad_norm": 1.6219054460525513,
"learning_rate": 7.3613689677392795e-06,
"loss": 1.117922306060791,
"step": 664
},
{
"epoch": 1.4050632911392404,
"grad_norm": 2.222062826156616,
"learning_rate": 7.356599089572203e-06,
"loss": 0.7130357027053833,
"step": 666
},
{
"epoch": 1.409282700421941,
"grad_norm": 2.1885979175567627,
"learning_rate": 7.35181321203773e-06,
"loss": 1.0384951829910278,
"step": 668
},
{
"epoch": 1.4135021097046414,
"grad_norm": 1.6668306589126587,
"learning_rate": 7.3470113610342025e-06,
"loss": 1.1504696607589722,
"step": 670
},
{
"epoch": 1.4177215189873418,
"grad_norm": 3.1233675479888916,
"learning_rate": 7.342193562546399e-06,
"loss": 1.2578648328781128,
"step": 672
},
{
"epoch": 1.4219409282700421,
"grad_norm": 1.6057863235473633,
"learning_rate": 7.337359842645397e-06,
"loss": 0.6594195365905762,
"step": 674
},
{
"epoch": 1.4261603375527425,
"grad_norm": 4.513533592224121,
"learning_rate": 7.332510227488436e-06,
"loss": 0.6804168224334717,
"step": 676
},
{
"epoch": 1.4303797468354431,
"grad_norm": 3.0312631130218506,
"learning_rate": 7.327644743318766e-06,
"loss": 0.9876019954681396,
"step": 678
},
{
"epoch": 1.4345991561181435,
"grad_norm": 5.478196144104004,
"learning_rate": 7.322763416465513e-06,
"loss": 1.087882399559021,
"step": 680
},
{
"epoch": 1.4388185654008439,
"grad_norm": 4.24491024017334,
"learning_rate": 7.317866273343534e-06,
"loss": 0.8171271085739136,
"step": 682
},
{
"epoch": 1.4430379746835442,
"grad_norm": 1.7392427921295166,
"learning_rate": 7.312953340453274e-06,
"loss": 1.1222814321517944,
"step": 684
},
{
"epoch": 1.4472573839662446,
"grad_norm": 6.056474208831787,
"learning_rate": 7.308024644380625e-06,
"loss": 0.7576450109481812,
"step": 686
},
{
"epoch": 1.4514767932489452,
"grad_norm": 3.2466626167297363,
"learning_rate": 7.303080211796774e-06,
"loss": 0.8898618221282959,
"step": 688
},
{
"epoch": 1.4556962025316456,
"grad_norm": 8.08736515045166,
"learning_rate": 7.298120069458071e-06,
"loss": 1.0252546072006226,
"step": 690
},
{
"epoch": 1.459915611814346,
"grad_norm": 6.501577377319336,
"learning_rate": 7.293144244205875e-06,
"loss": 0.6603308916091919,
"step": 692
},
{
"epoch": 1.4641350210970465,
"grad_norm": 2.7896158695220947,
"learning_rate": 7.288152762966415e-06,
"loss": 1.225158452987671,
"step": 694
},
{
"epoch": 1.4683544303797469,
"grad_norm": 4.285532474517822,
"learning_rate": 7.283145652750635e-06,
"loss": 0.5561915636062622,
"step": 696
},
{
"epoch": 1.4725738396624473,
"grad_norm": 1.7746226787567139,
"learning_rate": 7.27812294065406e-06,
"loss": 1.1419634819030762,
"step": 698
},
{
"epoch": 1.4767932489451476,
"grad_norm": 1.805708408355713,
"learning_rate": 7.2730846538566375e-06,
"loss": 1.178218126296997,
"step": 700
},
{
"epoch": 1.481012658227848,
"grad_norm": 5.747068405151367,
"learning_rate": 7.2680308196226e-06,
"loss": 0.989362359046936,
"step": 702
},
{
"epoch": 1.4852320675105486,
"grad_norm": 7.557291030883789,
"learning_rate": 7.262961465300312e-06,
"loss": 0.7833366394042969,
"step": 704
},
{
"epoch": 1.489451476793249,
"grad_norm": 1.8191746473312378,
"learning_rate": 7.257876618322125e-06,
"loss": 1.1235054731369019,
"step": 706
},
{
"epoch": 1.4936708860759493,
"grad_norm": 2.4673242568969727,
"learning_rate": 7.252776306204226e-06,
"loss": 1.1172146797180176,
"step": 708
},
{
"epoch": 1.49789029535865,
"grad_norm": 1.062907099723816,
"learning_rate": 7.247660556546489e-06,
"loss": 0.984178900718689,
"step": 710
},
{
"epoch": 1.50210970464135,
"grad_norm": 2.470454692840576,
"learning_rate": 7.242529397032332e-06,
"loss": 0.467578649520874,
"step": 712
},
{
"epoch": 1.5063291139240507,
"grad_norm": 4.536571979522705,
"learning_rate": 7.237382855428555e-06,
"loss": 1.0994584560394287,
"step": 714
},
{
"epoch": 1.510548523206751,
"grad_norm": 1.8687479496002197,
"learning_rate": 7.232220959585203e-06,
"loss": 1.0004863739013672,
"step": 716
},
{
"epoch": 1.5147679324894514,
"grad_norm": 1.6920982599258423,
"learning_rate": 7.227043737435406e-06,
"loss": 1.0185657739639282,
"step": 718
},
{
"epoch": 1.518987341772152,
"grad_norm": 3.6787755489349365,
"learning_rate": 7.221851216995229e-06,
"loss": 1.0142695903778076,
"step": 720
},
{
"epoch": 1.5232067510548524,
"grad_norm": 3.2205076217651367,
"learning_rate": 7.216643426363528e-06,
"loss": 1.115187644958496,
"step": 722
},
{
"epoch": 1.5274261603375527,
"grad_norm": 2.2605745792388916,
"learning_rate": 7.211420393721787e-06,
"loss": 1.204482078552246,
"step": 724
},
{
"epoch": 1.5316455696202531,
"grad_norm": 4.8364667892456055,
"learning_rate": 7.206182147333974e-06,
"loss": 0.6358588933944702,
"step": 726
},
{
"epoch": 1.5358649789029535,
"grad_norm": 6.24644136428833,
"learning_rate": 7.200928715546382e-06,
"loss": 0.3853702247142792,
"step": 728
},
{
"epoch": 1.540084388185654,
"grad_norm": 2.5594065189361572,
"learning_rate": 7.1956601267874806e-06,
"loss": 0.8702763319015503,
"step": 730
},
{
"epoch": 1.5443037974683544,
"grad_norm": 1.8882993459701538,
"learning_rate": 7.1903764095677595e-06,
"loss": 0.958168625831604,
"step": 732
},
{
"epoch": 1.5485232067510548,
"grad_norm": 2.2798001766204834,
"learning_rate": 7.185077592479573e-06,
"loss": 1.0963058471679688,
"step": 734
},
{
"epoch": 1.5527426160337554,
"grad_norm": 3.2573153972625732,
"learning_rate": 7.17976370419699e-06,
"loss": 0.851632833480835,
"step": 736
},
{
"epoch": 1.5569620253164556,
"grad_norm": 3.9381816387176514,
"learning_rate": 7.174434773475635e-06,
"loss": 0.9015741944313049,
"step": 738
},
{
"epoch": 1.5611814345991561,
"grad_norm": 2.2864365577697754,
"learning_rate": 7.169090829152531e-06,
"loss": 1.0464608669281006,
"step": 740
},
{
"epoch": 1.5654008438818565,
"grad_norm": 1.112131118774414,
"learning_rate": 7.163731900145947e-06,
"loss": 0.6916845440864563,
"step": 742
},
{
"epoch": 1.5696202531645569,
"grad_norm": 4.296708583831787,
"learning_rate": 7.158358015455243e-06,
"loss": 0.5111595392227173,
"step": 744
},
{
"epoch": 1.5738396624472575,
"grad_norm": 3.6669418811798096,
"learning_rate": 7.152969204160704e-06,
"loss": 0.6755394339561462,
"step": 746
},
{
"epoch": 1.5780590717299579,
"grad_norm": 14.373259544372559,
"learning_rate": 7.147565495423394e-06,
"loss": 0.6762098073959351,
"step": 748
},
{
"epoch": 1.5822784810126582,
"grad_norm": 1.3357782363891602,
"learning_rate": 7.142146918484996e-06,
"loss": 1.0881752967834473,
"step": 750
},
{
"epoch": 1.5864978902953588,
"grad_norm": 2.055518627166748,
"learning_rate": 7.13671350266764e-06,
"loss": 0.6121246814727783,
"step": 752
},
{
"epoch": 1.590717299578059,
"grad_norm": 4.617133617401123,
"learning_rate": 7.131265277373768e-06,
"loss": 0.7100765705108643,
"step": 754
},
{
"epoch": 1.5949367088607596,
"grad_norm": 2.564394950866699,
"learning_rate": 7.125802272085954e-06,
"loss": 0.5700541734695435,
"step": 756
},
{
"epoch": 1.59915611814346,
"grad_norm": 3.551905393600464,
"learning_rate": 7.120324516366754e-06,
"loss": 0.8716294765472412,
"step": 758
},
{
"epoch": 1.6033755274261603,
"grad_norm": 2.745734930038452,
"learning_rate": 7.114832039858547e-06,
"loss": 1.1156964302062988,
"step": 760
},
{
"epoch": 1.6075949367088609,
"grad_norm": 2.40625262260437,
"learning_rate": 7.109324872283371e-06,
"loss": 0.6260151863098145,
"step": 762
},
{
"epoch": 1.611814345991561,
"grad_norm": 1.5272423028945923,
"learning_rate": 7.10380304344276e-06,
"loss": 1.1335524320602417,
"step": 764
},
{
"epoch": 1.6160337552742616,
"grad_norm": 4.155389785766602,
"learning_rate": 7.098266583217592e-06,
"loss": 1.0015907287597656,
"step": 766
},
{
"epoch": 1.620253164556962,
"grad_norm": 0.7669569849967957,
"learning_rate": 7.0927155215679175e-06,
"loss": 0.8719363212585449,
"step": 768
},
{
"epoch": 1.6244725738396624,
"grad_norm": 2.079810380935669,
"learning_rate": 7.087149888532803e-06,
"loss": 1.0807125568389893,
"step": 770
},
{
"epoch": 1.628691983122363,
"grad_norm": 1.8407090902328491,
"learning_rate": 7.081569714230168e-06,
"loss": 1.1782212257385254,
"step": 772
},
{
"epoch": 1.6329113924050633,
"grad_norm": 2.1639020442962646,
"learning_rate": 7.075975028856614e-06,
"loss": 1.0721291303634644,
"step": 774
},
{
"epoch": 1.6371308016877637,
"grad_norm": 4.351590633392334,
"learning_rate": 7.070365862687276e-06,
"loss": 1.0304412841796875,
"step": 776
},
{
"epoch": 1.6413502109704643,
"grad_norm": 3.4083518981933594,
"learning_rate": 7.064742246075647e-06,
"loss": 0.8413809537887573,
"step": 778
},
{
"epoch": 1.6455696202531644,
"grad_norm": 4.749937534332275,
"learning_rate": 7.059104209453417e-06,
"loss": 0.3687572479248047,
"step": 780
},
{
"epoch": 1.649789029535865,
"grad_norm": 9.06319808959961,
"learning_rate": 7.0534517833303085e-06,
"loss": 1.0481884479522705,
"step": 782
},
{
"epoch": 1.6540084388185654,
"grad_norm": 4.062221527099609,
"learning_rate": 7.047784998293913e-06,
"loss": 0.8559701442718506,
"step": 784
},
{
"epoch": 1.6582278481012658,
"grad_norm": 10.564360618591309,
"learning_rate": 7.0421038850095235e-06,
"loss": 1.074246883392334,
"step": 786
},
{
"epoch": 1.6624472573839664,
"grad_norm": 12.672545433044434,
"learning_rate": 7.036408474219966e-06,
"loss": 0.5824824571609497,
"step": 788
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.3040852546691895,
"learning_rate": 7.03069879674544e-06,
"loss": 1.0848541259765625,
"step": 790
},
{
"epoch": 1.6708860759493671,
"grad_norm": 4.974740982055664,
"learning_rate": 7.024974883483347e-06,
"loss": 0.5032600164413452,
"step": 792
},
{
"epoch": 1.6751054852320675,
"grad_norm": 0.5375344753265381,
"learning_rate": 7.019236765408122e-06,
"loss": 0.7874377965927124,
"step": 794
},
{
"epoch": 1.6793248945147679,
"grad_norm": 2.140566349029541,
"learning_rate": 7.013484473571073e-06,
"loss": 1.0540302991867065,
"step": 796
},
{
"epoch": 1.6835443037974684,
"grad_norm": 6.119294166564941,
"learning_rate": 7.007718039100201e-06,
"loss": 0.8562701940536499,
"step": 798
},
{
"epoch": 1.6877637130801688,
"grad_norm": 11.767963409423828,
"learning_rate": 7.001937493200045e-06,
"loss": 1.2052388191223145,
"step": 800
},
{
"epoch": 1.6919831223628692,
"grad_norm": 17.296886444091797,
"learning_rate": 6.996142867151502e-06,
"loss": 0.6549183130264282,
"step": 802
},
{
"epoch": 1.6962025316455698,
"grad_norm": 2.24831485748291,
"learning_rate": 6.990334192311668e-06,
"loss": 1.2283351421356201,
"step": 804
},
{
"epoch": 1.70042194092827,
"grad_norm": 2.4578514099121094,
"learning_rate": 6.9845115001136545e-06,
"loss": 1.1071836948394775,
"step": 806
},
{
"epoch": 1.7046413502109705,
"grad_norm": 10.491120338439941,
"learning_rate": 6.978674822066434e-06,
"loss": 0.7744203805923462,
"step": 808
},
{
"epoch": 1.7088607594936709,
"grad_norm": 9.302081108093262,
"learning_rate": 6.97282418975466e-06,
"loss": 0.8782643675804138,
"step": 810
},
{
"epoch": 1.7130801687763713,
"grad_norm": 1.8191728591918945,
"learning_rate": 6.966959634838495e-06,
"loss": 1.128312349319458,
"step": 812
},
{
"epoch": 1.7172995780590719,
"grad_norm": 5.150999069213867,
"learning_rate": 6.961081189053449e-06,
"loss": 1.454809546470642,
"step": 814
},
{
"epoch": 1.721518987341772,
"grad_norm": 1.7257297039031982,
"learning_rate": 6.955188884210195e-06,
"loss": 1.0828335285186768,
"step": 816
},
{
"epoch": 1.7257383966244726,
"grad_norm": 4.542337894439697,
"learning_rate": 6.9492827521944066e-06,
"loss": 0.8022172451019287,
"step": 818
},
{
"epoch": 1.729957805907173,
"grad_norm": 8.734732627868652,
"learning_rate": 6.943362824966579e-06,
"loss": 0.46849238872528076,
"step": 820
},
{
"epoch": 1.7341772151898733,
"grad_norm": 7.7328200340271,
"learning_rate": 6.937429134561862e-06,
"loss": 0.5579560995101929,
"step": 822
},
{
"epoch": 1.738396624472574,
"grad_norm": 2.0381147861480713,
"learning_rate": 6.9314817130898826e-06,
"loss": 0.7268804311752319,
"step": 824
},
{
"epoch": 1.7426160337552743,
"grad_norm": 1.5466476678848267,
"learning_rate": 6.925520592734571e-06,
"loss": 0.9139357805252075,
"step": 826
},
{
"epoch": 1.7468354430379747,
"grad_norm": 9.312906265258789,
"learning_rate": 6.919545805753988e-06,
"loss": 0.9899505376815796,
"step": 828
},
{
"epoch": 1.7510548523206753,
"grad_norm": 1.7241586446762085,
"learning_rate": 6.913557384480151e-06,
"loss": 1.066752314567566,
"step": 830
},
{
"epoch": 1.7552742616033754,
"grad_norm": 1.62288236618042,
"learning_rate": 6.907555361318859e-06,
"loss": 1.0838364362716675,
"step": 832
},
{
"epoch": 1.759493670886076,
"grad_norm": 1.7631701231002808,
"learning_rate": 6.901539768749513e-06,
"loss": 0.8664329051971436,
"step": 834
},
{
"epoch": 1.7637130801687764,
"grad_norm": 1.4140545129776,
"learning_rate": 6.895510639324947e-06,
"loss": 1.0501675605773926,
"step": 836
},
{
"epoch": 1.7679324894514767,
"grad_norm": 2.3473498821258545,
"learning_rate": 6.889468005671248e-06,
"loss": 0.9035965204238892,
"step": 838
},
{
"epoch": 1.7721518987341773,
"grad_norm": 4.273458003997803,
"learning_rate": 6.883411900487578e-06,
"loss": 0.7962709665298462,
"step": 840
},
{
"epoch": 1.7763713080168775,
"grad_norm": 7.660892963409424,
"learning_rate": 6.877342356545999e-06,
"loss": 0.9311078190803528,
"step": 842
},
{
"epoch": 1.780590717299578,
"grad_norm": 1.3525784015655518,
"learning_rate": 6.871259406691299e-06,
"loss": 0.46452265977859497,
"step": 844
},
{
"epoch": 1.7848101265822784,
"grad_norm": 1.2650240659713745,
"learning_rate": 6.865163083840808e-06,
"loss": 0.653459370136261,
"step": 846
},
{
"epoch": 1.7890295358649788,
"grad_norm": 16.892318725585938,
"learning_rate": 6.859053420984222e-06,
"loss": 0.587769091129303,
"step": 848
},
{
"epoch": 1.7932489451476794,
"grad_norm": 3.177400588989258,
"learning_rate": 6.852930451183426e-06,
"loss": 0.8080633878707886,
"step": 850
},
{
"epoch": 1.7974683544303798,
"grad_norm": 2.106072187423706,
"learning_rate": 6.846794207572317e-06,
"loss": 1.09242582321167,
"step": 852
},
{
"epoch": 1.8016877637130801,
"grad_norm": 3.0559608936309814,
"learning_rate": 6.840644723356619e-06,
"loss": 1.4061119556427002,
"step": 854
},
{
"epoch": 1.8059071729957807,
"grad_norm": 0.9994240403175354,
"learning_rate": 6.834482031813709e-06,
"loss": 0.7950407862663269,
"step": 856
},
{
"epoch": 1.810126582278481,
"grad_norm": 17.731142044067383,
"learning_rate": 6.82830616629243e-06,
"loss": 1.0046894550323486,
"step": 858
},
{
"epoch": 1.8143459915611815,
"grad_norm": 1.822799801826477,
"learning_rate": 6.822117160212916e-06,
"loss": 0.6357040405273438,
"step": 860
},
{
"epoch": 1.8185654008438819,
"grad_norm": 3.4448676109313965,
"learning_rate": 6.815915047066415e-06,
"loss": 1.0787222385406494,
"step": 862
},
{
"epoch": 1.8227848101265822,
"grad_norm": 1.7101033926010132,
"learning_rate": 6.809699860415097e-06,
"loss": 1.0257686376571655,
"step": 864
},
{
"epoch": 1.8270042194092828,
"grad_norm": 6.336645603179932,
"learning_rate": 6.80347163389188e-06,
"loss": 0.9438542127609253,
"step": 866
},
{
"epoch": 1.831223628691983,
"grad_norm": 4.888845920562744,
"learning_rate": 6.797230401200247e-06,
"loss": 0.9173398017883301,
"step": 868
},
{
"epoch": 1.8354430379746836,
"grad_norm": 1.9469853639602661,
"learning_rate": 6.790976196114059e-06,
"loss": 1.036512851715088,
"step": 870
},
{
"epoch": 1.839662447257384,
"grad_norm": 5.8768815994262695,
"learning_rate": 6.784709052477382e-06,
"loss": 0.8006809949874878,
"step": 872
},
{
"epoch": 1.8438818565400843,
"grad_norm": 2.4700989723205566,
"learning_rate": 6.7784290042042924e-06,
"loss": 0.9556717276573181,
"step": 874
},
{
"epoch": 1.8481012658227849,
"grad_norm": 3.1491811275482178,
"learning_rate": 6.772136085278703e-06,
"loss": 1.1224122047424316,
"step": 876
},
{
"epoch": 1.8523206751054853,
"grad_norm": 23.419981002807617,
"learning_rate": 6.765830329754171e-06,
"loss": 0.7619462013244629,
"step": 878
},
{
"epoch": 1.8565400843881856,
"grad_norm": 4.361552715301514,
"learning_rate": 6.7595117717537186e-06,
"loss": 0.6938849687576294,
"step": 880
},
{
"epoch": 1.8607594936708862,
"grad_norm": 3.267629623413086,
"learning_rate": 6.753180445469651e-06,
"loss": 0.8586090803146362,
"step": 882
},
{
"epoch": 1.8649789029535864,
"grad_norm": 1.915306806564331,
"learning_rate": 6.746836385163365e-06,
"loss": 0.7172484397888184,
"step": 884
},
{
"epoch": 1.869198312236287,
"grad_norm": 10.15339183807373,
"learning_rate": 6.740479625165166e-06,
"loss": 0.7663919925689697,
"step": 886
},
{
"epoch": 1.8734177215189873,
"grad_norm": 1.4916582107543945,
"learning_rate": 6.734110199874082e-06,
"loss": 1.0811569690704346,
"step": 888
},
{
"epoch": 1.8776371308016877,
"grad_norm": 3.9946820735931396,
"learning_rate": 6.727728143757681e-06,
"loss": 0.4816530644893646,
"step": 890
},
{
"epoch": 1.8818565400843883,
"grad_norm": 1.4981932640075684,
"learning_rate": 6.7213334913518795e-06,
"loss": 0.6716771721839905,
"step": 892
},
{
"epoch": 1.8860759493670884,
"grad_norm": 1.4426230192184448,
"learning_rate": 6.714926277260759e-06,
"loss": 1.055748462677002,
"step": 894
},
{
"epoch": 1.890295358649789,
"grad_norm": 2.779737949371338,
"learning_rate": 6.708506536156375e-06,
"loss": 1.2872055768966675,
"step": 896
},
{
"epoch": 1.8945147679324894,
"grad_norm": 18.332468032836914,
"learning_rate": 6.702074302778574e-06,
"loss": 0.7888720631599426,
"step": 898
},
{
"epoch": 1.8987341772151898,
"grad_norm": 4.269297122955322,
"learning_rate": 6.695629611934803e-06,
"loss": 0.9360828995704651,
"step": 900
},
{
"epoch": 1.9029535864978904,
"grad_norm": 3.4333150386810303,
"learning_rate": 6.689172498499919e-06,
"loss": 1.1581498384475708,
"step": 902
},
{
"epoch": 1.9071729957805907,
"grad_norm": 3.325373888015747,
"learning_rate": 6.6827029974160085e-06,
"loss": 1.0004583597183228,
"step": 904
},
{
"epoch": 1.9113924050632911,
"grad_norm": 1.337743878364563,
"learning_rate": 6.676221143692186e-06,
"loss": 1.2600127458572388,
"step": 906
},
{
"epoch": 1.9156118143459917,
"grad_norm": 9.333498001098633,
"learning_rate": 6.669726972404415e-06,
"loss": 0.5244170427322388,
"step": 908
},
{
"epoch": 1.9198312236286919,
"grad_norm": 1.4464713335037231,
"learning_rate": 6.663220518695314e-06,
"loss": 1.0309032201766968,
"step": 910
},
{
"epoch": 1.9240506329113924,
"grad_norm": 3.8824985027313232,
"learning_rate": 6.656701817773966e-06,
"loss": 0.7978178262710571,
"step": 912
},
{
"epoch": 1.9282700421940928,
"grad_norm": 2.117260217666626,
"learning_rate": 6.650170904915727e-06,
"loss": 1.1143381595611572,
"step": 914
},
{
"epoch": 1.9324894514767932,
"grad_norm": 2.3016726970672607,
"learning_rate": 6.643627815462041e-06,
"loss": 0.7327609062194824,
"step": 916
},
{
"epoch": 1.9367088607594938,
"grad_norm": 1.3250812292099,
"learning_rate": 6.637072584820241e-06,
"loss": 1.0381274223327637,
"step": 918
},
{
"epoch": 1.9409282700421941,
"grad_norm": 3.2023909091949463,
"learning_rate": 6.630505248463364e-06,
"loss": 0.5368826985359192,
"step": 920
},
{
"epoch": 1.9451476793248945,
"grad_norm": 1.5674402713775635,
"learning_rate": 6.623925841929953e-06,
"loss": 1.0610504150390625,
"step": 922
},
{
"epoch": 1.9493670886075949,
"grad_norm": 1.7156344652175903,
"learning_rate": 6.617334400823867e-06,
"loss": 1.154762625694275,
"step": 924
},
{
"epoch": 1.9535864978902953,
"grad_norm": 1.6762484312057495,
"learning_rate": 6.610730960814092e-06,
"loss": 0.8508365154266357,
"step": 926
},
{
"epoch": 1.9578059071729959,
"grad_norm": 1.3070154190063477,
"learning_rate": 6.604115557634545e-06,
"loss": 0.7161068916320801,
"step": 928
},
{
"epoch": 1.9620253164556962,
"grad_norm": 2.4822962284088135,
"learning_rate": 6.597488227083879e-06,
"loss": 1.1143286228179932,
"step": 930
},
{
"epoch": 1.9662447257383966,
"grad_norm": 2.1459968090057373,
"learning_rate": 6.590849005025289e-06,
"loss": 0.8785426020622253,
"step": 932
},
{
"epoch": 1.9704641350210972,
"grad_norm": 18.12381935119629,
"learning_rate": 6.584197927386326e-06,
"loss": 1.200589656829834,
"step": 934
},
{
"epoch": 1.9746835443037973,
"grad_norm": 1.572724461555481,
"learning_rate": 6.577535030158689e-06,
"loss": 1.1270561218261719,
"step": 936
},
{
"epoch": 1.978902953586498,
"grad_norm": 0.8099290132522583,
"learning_rate": 6.570860349398041e-06,
"loss": 0.6693128347396851,
"step": 938
},
{
"epoch": 1.9831223628691983,
"grad_norm": 1.4404888153076172,
"learning_rate": 6.5641739212238136e-06,
"loss": 1.1134912967681885,
"step": 940
},
{
"epoch": 1.9873417721518987,
"grad_norm": 11.569640159606934,
"learning_rate": 6.557475781819004e-06,
"loss": 0.9092779159545898,
"step": 942
},
{
"epoch": 1.9915611814345993,
"grad_norm": 2.00720477104187,
"learning_rate": 6.550765967429984e-06,
"loss": 0.7343477010726929,
"step": 944
},
{
"epoch": 1.9957805907172996,
"grad_norm": 8.226292610168457,
"learning_rate": 6.544044514366306e-06,
"loss": 1.0801680088043213,
"step": 946
},
{
"epoch": 2.0,
"grad_norm": 2.7631890773773193,
"learning_rate": 6.537311459000502e-06,
"loss": 0.5224167108535767,
"step": 948
},
{
"epoch": 2.0042194092827006,
"grad_norm": 4.7062296867370605,
"learning_rate": 6.53056683776789e-06,
"loss": 0.8504010438919067,
"step": 950
},
{
"epoch": 2.0084388185654007,
"grad_norm": 8.53116512298584,
"learning_rate": 6.5238106871663755e-06,
"loss": 0.6483380794525146,
"step": 952
},
{
"epoch": 2.0126582278481013,
"grad_norm": 3.5020530223846436,
"learning_rate": 6.517043043756252e-06,
"loss": 0.8229789733886719,
"step": 954
},
{
"epoch": 2.0168776371308015,
"grad_norm": 2.5090668201446533,
"learning_rate": 6.5102639441600086e-06,
"loss": 0.868636965751648,
"step": 956
},
{
"epoch": 2.021097046413502,
"grad_norm": 6.097753047943115,
"learning_rate": 6.503473425062126e-06,
"loss": 0.6441227197647095,
"step": 958
},
{
"epoch": 2.0253164556962027,
"grad_norm": 3.8150646686553955,
"learning_rate": 6.4966715232088835e-06,
"loss": 0.7223113179206848,
"step": 960
},
{
"epoch": 2.029535864978903,
"grad_norm": 2.547377109527588,
"learning_rate": 6.489858275408152e-06,
"loss": 1.046697735786438,
"step": 962
},
{
"epoch": 2.0337552742616034,
"grad_norm": 2.0660111904144287,
"learning_rate": 6.483033718529204e-06,
"loss": 0.7585334777832031,
"step": 964
},
{
"epoch": 2.037974683544304,
"grad_norm": 1.3951258659362793,
"learning_rate": 6.476197889502512e-06,
"loss": 0.571182370185852,
"step": 966
},
{
"epoch": 2.042194092827004,
"grad_norm": 5.060699939727783,
"learning_rate": 6.46935082531954e-06,
"loss": 0.6977952718734741,
"step": 968
},
{
"epoch": 2.0464135021097047,
"grad_norm": 9.534734725952148,
"learning_rate": 6.4624925630325555e-06,
"loss": 0.924410343170166,
"step": 970
},
{
"epoch": 2.050632911392405,
"grad_norm": 10.224217414855957,
"learning_rate": 6.455623139754423e-06,
"loss": 0.7734869122505188,
"step": 972
},
{
"epoch": 2.0548523206751055,
"grad_norm": 2.367072343826294,
"learning_rate": 6.4487425926584005e-06,
"loss": 0.762604832649231,
"step": 974
},
{
"epoch": 2.059071729957806,
"grad_norm": 3.0896172523498535,
"learning_rate": 6.441850958977945e-06,
"loss": 0.6143279075622559,
"step": 976
},
{
"epoch": 2.0632911392405062,
"grad_norm": 1.7992668151855469,
"learning_rate": 6.434948276006505e-06,
"loss": 0.6615221500396729,
"step": 978
},
{
"epoch": 2.067510548523207,
"grad_norm": 3.8281936645507812,
"learning_rate": 6.4280345810973225e-06,
"loss": 0.6476603150367737,
"step": 980
},
{
"epoch": 2.071729957805907,
"grad_norm": 1.7640984058380127,
"learning_rate": 6.42110991166323e-06,
"loss": 0.8162950277328491,
"step": 982
},
{
"epoch": 2.0759493670886076,
"grad_norm": 4.995830059051514,
"learning_rate": 6.414174305176448e-06,
"loss": 0.9169092774391174,
"step": 984
},
{
"epoch": 2.080168776371308,
"grad_norm": 1.7362895011901855,
"learning_rate": 6.407227799168378e-06,
"loss": 0.9022603034973145,
"step": 986
},
{
"epoch": 2.0843881856540083,
"grad_norm": 2.571808338165283,
"learning_rate": 6.400270431229409e-06,
"loss": 0.9147624969482422,
"step": 988
},
{
"epoch": 2.088607594936709,
"grad_norm": 9.426867485046387,
"learning_rate": 6.393302239008705e-06,
"loss": 0.46702778339385986,
"step": 990
},
{
"epoch": 2.0928270042194095,
"grad_norm": 1.8141038417816162,
"learning_rate": 6.386323260214006e-06,
"loss": 0.49038439989089966,
"step": 992
},
{
"epoch": 2.0970464135021096,
"grad_norm": 2.054802894592285,
"learning_rate": 6.37933353261142e-06,
"loss": 1.0175153017044067,
"step": 994
},
{
"epoch": 2.1012658227848102,
"grad_norm": 5.117776870727539,
"learning_rate": 6.372333094025224e-06,
"loss": 0.8956054449081421,
"step": 996
},
{
"epoch": 2.1054852320675104,
"grad_norm": 0.727590024471283,
"learning_rate": 6.365321982337655e-06,
"loss": 0.5565606951713562,
"step": 998
},
{
"epoch": 2.109704641350211,
"grad_norm": 2.0183682441711426,
"learning_rate": 6.3583002354887065e-06,
"loss": 1.0998228788375854,
"step": 1000
},
{
"epoch": 2.1139240506329116,
"grad_norm": 3.211463689804077,
"learning_rate": 6.351267891475925e-06,
"loss": 0.8330961465835571,
"step": 1002
},
{
"epoch": 2.1181434599156117,
"grad_norm": 4.385818004608154,
"learning_rate": 6.344224988354201e-06,
"loss": 0.8911874294281006,
"step": 1004
},
{
"epoch": 2.1223628691983123,
"grad_norm": 1.8507286310195923,
"learning_rate": 6.3371715642355665e-06,
"loss": 0.5850310325622559,
"step": 1006
},
{
"epoch": 2.1265822784810124,
"grad_norm": 1.5357205867767334,
"learning_rate": 6.3301076572889804e-06,
"loss": 0.6495864391326904,
"step": 1008
},
{
"epoch": 2.130801687763713,
"grad_norm": 1.9483667612075806,
"learning_rate": 6.32303330574014e-06,
"loss": 0.6409696340560913,
"step": 1010
},
{
"epoch": 2.1350210970464136,
"grad_norm": 5.264192581176758,
"learning_rate": 6.3159485478712504e-06,
"loss": 0.8244346976280212,
"step": 1012
},
{
"epoch": 2.1392405063291138,
"grad_norm": 6.72544527053833,
"learning_rate": 6.308853422020838e-06,
"loss": 1.0458412170410156,
"step": 1014
},
{
"epoch": 2.1434599156118144,
"grad_norm": 4.4975738525390625,
"learning_rate": 6.301747966583533e-06,
"loss": 0.5240525007247925,
"step": 1016
},
{
"epoch": 2.147679324894515,
"grad_norm": 2.4814205169677734,
"learning_rate": 6.294632220009858e-06,
"loss": 0.7953197360038757,
"step": 1018
},
{
"epoch": 2.151898734177215,
"grad_norm": 1.7783337831497192,
"learning_rate": 6.2875062208060345e-06,
"loss": 0.6177500486373901,
"step": 1020
},
{
"epoch": 2.1561181434599157,
"grad_norm": 4.14943790435791,
"learning_rate": 6.280370007533755e-06,
"loss": 0.7844660878181458,
"step": 1022
},
{
"epoch": 2.160337552742616,
"grad_norm": 17.742002487182617,
"learning_rate": 6.2732236188099925e-06,
"loss": 0.7165024280548096,
"step": 1024
},
{
"epoch": 2.1645569620253164,
"grad_norm": 1.9838588237762451,
"learning_rate": 6.266067093306778e-06,
"loss": 0.9177765846252441,
"step": 1026
},
{
"epoch": 2.168776371308017,
"grad_norm": 1.8689168691635132,
"learning_rate": 6.258900469751002e-06,
"loss": 0.9903367757797241,
"step": 1028
},
{
"epoch": 2.172995780590717,
"grad_norm": 1.8121206760406494,
"learning_rate": 6.251723786924195e-06,
"loss": 0.9095609188079834,
"step": 1030
},
{
"epoch": 2.1772151898734178,
"grad_norm": 1.719159483909607,
"learning_rate": 6.244537083662325e-06,
"loss": 0.9629115462303162,
"step": 1032
},
{
"epoch": 2.181434599156118,
"grad_norm": 3.3003413677215576,
"learning_rate": 6.237340398855583e-06,
"loss": 0.9314064979553223,
"step": 1034
},
{
"epoch": 2.1856540084388185,
"grad_norm": 1.902093768119812,
"learning_rate": 6.230133771448174e-06,
"loss": 0.8848311305046082,
"step": 1036
},
{
"epoch": 2.189873417721519,
"grad_norm": 4.84321403503418,
"learning_rate": 6.222917240438112e-06,
"loss": 0.9192149639129639,
"step": 1038
},
{
"epoch": 2.1940928270042193,
"grad_norm": 1.7536414861679077,
"learning_rate": 6.215690844876994e-06,
"loss": 1.1547870635986328,
"step": 1040
},
{
"epoch": 2.19831223628692,
"grad_norm": 6.852333068847656,
"learning_rate": 6.208454623869805e-06,
"loss": 0.32395103573799133,
"step": 1042
},
{
"epoch": 2.2025316455696204,
"grad_norm": 1.9538402557373047,
"learning_rate": 6.2012086165747e-06,
"loss": 0.9581727981567383,
"step": 1044
},
{
"epoch": 2.2067510548523206,
"grad_norm": 4.133998394012451,
"learning_rate": 6.193952862202785e-06,
"loss": 0.6086496710777283,
"step": 1046
},
{
"epoch": 2.210970464135021,
"grad_norm": 1.9011021852493286,
"learning_rate": 6.18668740001792e-06,
"loss": 0.7543759346008301,
"step": 1048
},
{
"epoch": 2.2151898734177213,
"grad_norm": 1.46916663646698,
"learning_rate": 6.17941226933649e-06,
"loss": 0.9485968947410583,
"step": 1050
},
{
"epoch": 2.219409282700422,
"grad_norm": 5.856541156768799,
"learning_rate": 6.172127509527205e-06,
"loss": 0.8059616088867188,
"step": 1052
},
{
"epoch": 2.2236286919831225,
"grad_norm": 4.198894023895264,
"learning_rate": 6.164833160010882e-06,
"loss": 0.7487938404083252,
"step": 1054
},
{
"epoch": 2.2278481012658227,
"grad_norm": 23.342222213745117,
"learning_rate": 6.157529260260229e-06,
"loss": 0.7880909442901611,
"step": 1056
},
{
"epoch": 2.2320675105485233,
"grad_norm": 23.40158462524414,
"learning_rate": 6.150215849799637e-06,
"loss": 0.5327481031417847,
"step": 1058
},
{
"epoch": 2.2362869198312234,
"grad_norm": 1.634332537651062,
"learning_rate": 6.142892968204963e-06,
"loss": 0.883295476436615,
"step": 1060
},
{
"epoch": 2.240506329113924,
"grad_norm": 0.7251645922660828,
"learning_rate": 6.135560655103316e-06,
"loss": 0.5540227890014648,
"step": 1062
},
{
"epoch": 2.2447257383966246,
"grad_norm": 1.5355224609375,
"learning_rate": 6.12821895017284e-06,
"loss": 0.50773686170578,
"step": 1064
},
{
"epoch": 2.2489451476793247,
"grad_norm": 2.305499792098999,
"learning_rate": 6.120867893142506e-06,
"loss": 0.8910026550292969,
"step": 1066
},
{
"epoch": 2.2531645569620253,
"grad_norm": 3.746581792831421,
"learning_rate": 6.1135075237918905e-06,
"loss": 1.0884243249893188,
"step": 1068
},
{
"epoch": 2.257383966244726,
"grad_norm": 3.282155752182007,
"learning_rate": 6.106137881950965e-06,
"loss": 1.0420414209365845,
"step": 1070
},
{
"epoch": 2.261603375527426,
"grad_norm": 2.951901435852051,
"learning_rate": 6.098759007499875e-06,
"loss": 0.9006770849227905,
"step": 1072
},
{
"epoch": 2.2658227848101267,
"grad_norm": 2.8723626136779785,
"learning_rate": 6.091370940368729e-06,
"loss": 1.1099491119384766,
"step": 1074
},
{
"epoch": 2.270042194092827,
"grad_norm": 1.841613531112671,
"learning_rate": 6.083973720537386e-06,
"loss": 0.9306420087814331,
"step": 1076
},
{
"epoch": 2.2742616033755274,
"grad_norm": 0.8245161771774292,
"learning_rate": 6.0765673880352224e-06,
"loss": 0.6501108407974243,
"step": 1078
},
{
"epoch": 2.278481012658228,
"grad_norm": 16.89291763305664,
"learning_rate": 6.069151982940936e-06,
"loss": 0.7018378376960754,
"step": 1080
},
{
"epoch": 2.282700421940928,
"grad_norm": 15.395925521850586,
"learning_rate": 6.06172754538232e-06,
"loss": 0.3668671250343323,
"step": 1082
},
{
"epoch": 2.2869198312236287,
"grad_norm": 7.03673791885376,
"learning_rate": 6.054294115536044e-06,
"loss": 0.6594992280006409,
"step": 1084
},
{
"epoch": 2.291139240506329,
"grad_norm": 1.275587797164917,
"learning_rate": 6.046851733627436e-06,
"loss": 0.48280084133148193,
"step": 1086
},
{
"epoch": 2.2953586497890295,
"grad_norm": 3.333641290664673,
"learning_rate": 6.039400439930271e-06,
"loss": 0.6253411769866943,
"step": 1088
},
{
"epoch": 2.29957805907173,
"grad_norm": 1.850312352180481,
"learning_rate": 6.031940274766546e-06,
"loss": 0.49555736780166626,
"step": 1090
},
{
"epoch": 2.3037974683544302,
"grad_norm": 3.2576518058776855,
"learning_rate": 6.024471278506269e-06,
"loss": 0.7540421485900879,
"step": 1092
},
{
"epoch": 2.308016877637131,
"grad_norm": 2.6489086151123047,
"learning_rate": 6.016993491567234e-06,
"loss": 0.6014547944068909,
"step": 1094
},
{
"epoch": 2.3122362869198314,
"grad_norm": 5.111599445343018,
"learning_rate": 6.0095069544148075e-06,
"loss": 0.3525955379009247,
"step": 1096
},
{
"epoch": 2.3164556962025316,
"grad_norm": 13.784951210021973,
"learning_rate": 6.002011707561704e-06,
"loss": 0.8247784376144409,
"step": 1098
},
{
"epoch": 2.320675105485232,
"grad_norm": 4.951453685760498,
"learning_rate": 5.9945077915677695e-06,
"loss": 0.8657753467559814,
"step": 1100
},
{
"epoch": 2.3248945147679323,
"grad_norm": 1.6704450845718384,
"learning_rate": 5.9869952470397655e-06,
"loss": 0.841392993927002,
"step": 1102
},
{
"epoch": 2.329113924050633,
"grad_norm": 1.9911108016967773,
"learning_rate": 5.979474114631144e-06,
"loss": 1.0287697315216064,
"step": 1104
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.8969277143478394,
"learning_rate": 5.971944435041831e-06,
"loss": 0.730893611907959,
"step": 1106
},
{
"epoch": 2.3375527426160336,
"grad_norm": 4.81918478012085,
"learning_rate": 5.9644062490180004e-06,
"loss": 0.5627094507217407,
"step": 1108
},
{
"epoch": 2.3417721518987342,
"grad_norm": 2.462564468383789,
"learning_rate": 5.956859597351862e-06,
"loss": 0.8845915198326111,
"step": 1110
},
{
"epoch": 2.3459915611814344,
"grad_norm": 2.167839527130127,
"learning_rate": 5.94930452088144e-06,
"loss": 0.9817606210708618,
"step": 1112
},
{
"epoch": 2.350210970464135,
"grad_norm": 8.067427635192871,
"learning_rate": 5.941741060490339e-06,
"loss": 1.1635032892227173,
"step": 1114
},
{
"epoch": 2.3544303797468356,
"grad_norm": 0.5956460237503052,
"learning_rate": 5.93416925710754e-06,
"loss": 0.4855182468891144,
"step": 1116
},
{
"epoch": 2.3586497890295357,
"grad_norm": 15.0598783493042,
"learning_rate": 5.9265891517071695e-06,
"loss": 0.9245091676712036,
"step": 1118
},
{
"epoch": 2.3628691983122363,
"grad_norm": 2.6616246700286865,
"learning_rate": 5.9190007853082795e-06,
"loss": 0.6047594547271729,
"step": 1120
},
{
"epoch": 2.367088607594937,
"grad_norm": 7.563075542449951,
"learning_rate": 5.911404198974625e-06,
"loss": 0.9117496013641357,
"step": 1122
},
{
"epoch": 2.371308016877637,
"grad_norm": 5.370510101318359,
"learning_rate": 5.903799433814442e-06,
"loss": 0.5350353717803955,
"step": 1124
},
{
"epoch": 2.3755274261603376,
"grad_norm": 1.819912075996399,
"learning_rate": 5.8961865309802285e-06,
"loss": 0.667518138885498,
"step": 1126
},
{
"epoch": 2.379746835443038,
"grad_norm": 2.640817165374756,
"learning_rate": 5.888565531668514e-06,
"loss": 0.8784997463226318,
"step": 1128
},
{
"epoch": 2.3839662447257384,
"grad_norm": 66.16091918945312,
"learning_rate": 5.880936477119645e-06,
"loss": 0.4616549611091614,
"step": 1130
},
{
"epoch": 2.388185654008439,
"grad_norm": 5.994080066680908,
"learning_rate": 5.873299408617559e-06,
"loss": 0.3559979200363159,
"step": 1132
},
{
"epoch": 2.392405063291139,
"grad_norm": 23.00125503540039,
"learning_rate": 5.865654367489556e-06,
"loss": 0.40349674224853516,
"step": 1134
},
{
"epoch": 2.3966244725738397,
"grad_norm": 5.636394500732422,
"learning_rate": 5.858001395106082e-06,
"loss": 0.5823970437049866,
"step": 1136
},
{
"epoch": 2.40084388185654,
"grad_norm": 2.3996365070343018,
"learning_rate": 5.850340532880504e-06,
"loss": 0.921074628829956,
"step": 1138
},
{
"epoch": 2.4050632911392404,
"grad_norm": 10.836583137512207,
"learning_rate": 5.842671822268878e-06,
"loss": 0.7500771880149841,
"step": 1140
},
{
"epoch": 2.409282700421941,
"grad_norm": 7.443431854248047,
"learning_rate": 5.83499530476974e-06,
"loss": 0.3230987787246704,
"step": 1142
},
{
"epoch": 2.413502109704641,
"grad_norm": 2.597289800643921,
"learning_rate": 5.827311021923863e-06,
"loss": 0.732123851776123,
"step": 1144
},
{
"epoch": 2.4177215189873418,
"grad_norm": 1.982795238494873,
"learning_rate": 5.819619015314047e-06,
"loss": 0.9608519077301025,
"step": 1146
},
{
"epoch": 2.4219409282700424,
"grad_norm": 3.819395065307617,
"learning_rate": 5.8119193265648865e-06,
"loss": 0.6804056167602539,
"step": 1148
},
{
"epoch": 2.4261603375527425,
"grad_norm": 6.851272106170654,
"learning_rate": 5.80421199734255e-06,
"loss": 1.004921555519104,
"step": 1150
},
{
"epoch": 2.430379746835443,
"grad_norm": 3.0809147357940674,
"learning_rate": 5.7964970693545466e-06,
"loss": 0.6196656823158264,
"step": 1152
},
{
"epoch": 2.4345991561181437,
"grad_norm": 1.8345930576324463,
"learning_rate": 5.788774584349508e-06,
"loss": 1.043914556503296,
"step": 1154
},
{
"epoch": 2.438818565400844,
"grad_norm": 3.7520220279693604,
"learning_rate": 5.781044584116963e-06,
"loss": 0.30900609493255615,
"step": 1156
},
{
"epoch": 2.4430379746835444,
"grad_norm": 1.6004582643508911,
"learning_rate": 5.773307110487106e-06,
"loss": 0.7037574052810669,
"step": 1158
},
{
"epoch": 2.4472573839662446,
"grad_norm": 1.8472445011138916,
"learning_rate": 5.765562205330568e-06,
"loss": 0.9773483872413635,
"step": 1160
},
{
"epoch": 2.451476793248945,
"grad_norm": 2.698925018310547,
"learning_rate": 5.757809910558205e-06,
"loss": 0.6617934703826904,
"step": 1162
},
{
"epoch": 2.4556962025316453,
"grad_norm": 1.6956886053085327,
"learning_rate": 5.750050268120851e-06,
"loss": 0.851616382598877,
"step": 1164
},
{
"epoch": 2.459915611814346,
"grad_norm": 1.288453221321106,
"learning_rate": 5.742283320009111e-06,
"loss": 0.8924407958984375,
"step": 1166
},
{
"epoch": 2.4641350210970465,
"grad_norm": 1.4182209968566895,
"learning_rate": 5.734509108253117e-06,
"loss": 0.48247936367988586,
"step": 1168
},
{
"epoch": 2.4683544303797467,
"grad_norm": 2.1459646224975586,
"learning_rate": 5.726727674922309e-06,
"loss": 0.8906441926956177,
"step": 1170
},
{
"epoch": 2.4725738396624473,
"grad_norm": 1.393717885017395,
"learning_rate": 5.718939062125207e-06,
"loss": 0.876624584197998,
"step": 1172
},
{
"epoch": 2.476793248945148,
"grad_norm": 1.8553041219711304,
"learning_rate": 5.711143312009183e-06,
"loss": 0.9824315309524536,
"step": 1174
},
{
"epoch": 2.481012658227848,
"grad_norm": 2.4953160285949707,
"learning_rate": 5.703340466760228e-06,
"loss": 0.7499101161956787,
"step": 1176
},
{
"epoch": 2.4852320675105486,
"grad_norm": 5.494137763977051,
"learning_rate": 5.695530568602733e-06,
"loss": 0.42195141315460205,
"step": 1178
},
{
"epoch": 2.489451476793249,
"grad_norm": 4.595831394195557,
"learning_rate": 5.687713659799253e-06,
"loss": 0.7049263715744019,
"step": 1180
},
{
"epoch": 2.4936708860759493,
"grad_norm": 5.080184459686279,
"learning_rate": 5.679889782650275e-06,
"loss": 0.880506157875061,
"step": 1182
},
{
"epoch": 2.49789029535865,
"grad_norm": 27.051029205322266,
"learning_rate": 5.672058979494004e-06,
"loss": 0.5125079154968262,
"step": 1184
},
{
"epoch": 2.50210970464135,
"grad_norm": 1.5325865745544434,
"learning_rate": 5.6642212927061185e-06,
"loss": 0.385905385017395,
"step": 1186
},
{
"epoch": 2.5063291139240507,
"grad_norm": 7.488584995269775,
"learning_rate": 5.656376764699549e-06,
"loss": 0.5802481770515442,
"step": 1188
},
{
"epoch": 2.510548523206751,
"grad_norm": 2.4869072437286377,
"learning_rate": 5.648525437924244e-06,
"loss": 0.810112476348877,
"step": 1190
},
{
"epoch": 2.5147679324894514,
"grad_norm": 5.196420192718506,
"learning_rate": 5.640667354866948e-06,
"loss": 0.40649741888046265,
"step": 1192
},
{
"epoch": 2.518987341772152,
"grad_norm": 10.881569862365723,
"learning_rate": 5.632802558050964e-06,
"loss": 1.1927690505981445,
"step": 1194
},
{
"epoch": 2.523206751054852,
"grad_norm": 2.63506817817688,
"learning_rate": 5.6249310900359236e-06,
"loss": 0.969944179058075,
"step": 1196
},
{
"epoch": 2.5274261603375527,
"grad_norm": 11.932506561279297,
"learning_rate": 5.617052993417562e-06,
"loss": 0.9280753135681152,
"step": 1198
},
{
"epoch": 2.5316455696202533,
"grad_norm": 45.778499603271484,
"learning_rate": 5.609168310827482e-06,
"loss": 0.7399793267250061,
"step": 1200
},
{
"epoch": 2.5358649789029535,
"grad_norm": 1.8853310346603394,
"learning_rate": 5.6012770849329275e-06,
"loss": 0.7420691251754761,
"step": 1202
},
{
"epoch": 2.540084388185654,
"grad_norm": 3.9101645946502686,
"learning_rate": 5.593379358436551e-06,
"loss": 0.7088044285774231,
"step": 1204
},
{
"epoch": 2.5443037974683547,
"grad_norm": 3.0373728275299072,
"learning_rate": 5.585475174076184e-06,
"loss": 0.8735544681549072,
"step": 1206
},
{
"epoch": 2.548523206751055,
"grad_norm": 8.735350608825684,
"learning_rate": 5.577564574624599e-06,
"loss": 0.6918007135391235,
"step": 1208
},
{
"epoch": 2.5527426160337554,
"grad_norm": 3.198167085647583,
"learning_rate": 5.569647602889289e-06,
"loss": 1.1403307914733887,
"step": 1210
},
{
"epoch": 2.5569620253164556,
"grad_norm": 2.7564687728881836,
"learning_rate": 5.561724301712225e-06,
"loss": 0.9847512245178223,
"step": 1212
},
{
"epoch": 2.561181434599156,
"grad_norm": 5.109920501708984,
"learning_rate": 5.553794713969632e-06,
"loss": 0.30179572105407715,
"step": 1214
},
{
"epoch": 2.5654008438818563,
"grad_norm": 4.672483921051025,
"learning_rate": 5.545858882571755e-06,
"loss": 0.7192697525024414,
"step": 1216
},
{
"epoch": 2.569620253164557,
"grad_norm": 2.7057371139526367,
"learning_rate": 5.5379168504626256e-06,
"loss": 0.9119170308113098,
"step": 1218
},
{
"epoch": 2.5738396624472575,
"grad_norm": 5.782102584838867,
"learning_rate": 5.5299686606198255e-06,
"loss": 0.59529709815979,
"step": 1220
},
{
"epoch": 2.5780590717299576,
"grad_norm": 5.356619834899902,
"learning_rate": 5.522014356054264e-06,
"loss": 0.888773500919342,
"step": 1222
},
{
"epoch": 2.5822784810126582,
"grad_norm": 34.81894302368164,
"learning_rate": 5.51405397980994e-06,
"loss": 0.671154260635376,
"step": 1224
},
{
"epoch": 2.586497890295359,
"grad_norm": 4.4581193923950195,
"learning_rate": 5.506087574963703e-06,
"loss": 0.5387101173400879,
"step": 1226
},
{
"epoch": 2.590717299578059,
"grad_norm": 4.779836177825928,
"learning_rate": 5.49811518462503e-06,
"loss": 0.9294767379760742,
"step": 1228
},
{
"epoch": 2.5949367088607596,
"grad_norm": 4.019083499908447,
"learning_rate": 5.4901368519357886e-06,
"loss": 0.9565463066101074,
"step": 1230
},
{
"epoch": 2.59915611814346,
"grad_norm": 2.806299924850464,
"learning_rate": 5.482152620070001e-06,
"loss": 0.8302749991416931,
"step": 1232
},
{
"epoch": 2.6033755274261603,
"grad_norm": 46.3029899597168,
"learning_rate": 5.474162532233609e-06,
"loss": 0.28912973403930664,
"step": 1234
},
{
"epoch": 2.607594936708861,
"grad_norm": 4.818080425262451,
"learning_rate": 5.4661666316642534e-06,
"loss": 1.0101039409637451,
"step": 1236
},
{
"epoch": 2.611814345991561,
"grad_norm": 4.704904556274414,
"learning_rate": 5.458164961631019e-06,
"loss": 1.141682505607605,
"step": 1238
},
{
"epoch": 2.6160337552742616,
"grad_norm": 2.6413064002990723,
"learning_rate": 5.450157565434217e-06,
"loss": 0.7691728472709656,
"step": 1240
},
{
"epoch": 2.620253164556962,
"grad_norm": 2.3116259574890137,
"learning_rate": 5.442144486405146e-06,
"loss": 0.8952039480209351,
"step": 1242
},
{
"epoch": 2.6244725738396624,
"grad_norm": 3.752659797668457,
"learning_rate": 5.434125767905855e-06,
"loss": 0.41019898653030396,
"step": 1244
},
{
"epoch": 2.628691983122363,
"grad_norm": 2.37690806388855,
"learning_rate": 5.426101453328911e-06,
"loss": 0.704147219657898,
"step": 1246
},
{
"epoch": 2.632911392405063,
"grad_norm": 3.650939702987671,
"learning_rate": 5.418071586097162e-06,
"loss": 1.3898766040802002,
"step": 1248
},
{
"epoch": 2.6371308016877637,
"grad_norm": 1.6102105379104614,
"learning_rate": 5.410036209663506e-06,
"loss": 0.961624026298523,
"step": 1250
},
{
"epoch": 2.6413502109704643,
"grad_norm": 3.446720600128174,
"learning_rate": 5.401995367510652e-06,
"loss": 0.924649715423584,
"step": 1252
},
{
"epoch": 2.6455696202531644,
"grad_norm": 4.68242073059082,
"learning_rate": 5.393949103150889e-06,
"loss": 0.4435887932777405,
"step": 1254
},
{
"epoch": 2.649789029535865,
"grad_norm": 51.22077178955078,
"learning_rate": 5.385897460125841e-06,
"loss": 0.5546849370002747,
"step": 1256
},
{
"epoch": 2.6540084388185656,
"grad_norm": 2.3170604705810547,
"learning_rate": 5.377840482006247e-06,
"loss": 0.7113304138183594,
"step": 1258
},
{
"epoch": 2.6582278481012658,
"grad_norm": 4.746129512786865,
"learning_rate": 5.369778212391713e-06,
"loss": 0.8765827417373657,
"step": 1260
},
{
"epoch": 2.6624472573839664,
"grad_norm": 4.732134819030762,
"learning_rate": 5.361710694910476e-06,
"loss": 0.8504003882408142,
"step": 1262
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.4788029193878174,
"learning_rate": 5.3536379732191735e-06,
"loss": 0.5229237079620361,
"step": 1264
},
{
"epoch": 2.670886075949367,
"grad_norm": 9.586766242980957,
"learning_rate": 5.3455600910026075e-06,
"loss": 0.776203989982605,
"step": 1266
},
{
"epoch": 2.6751054852320673,
"grad_norm": 1.5792274475097656,
"learning_rate": 5.337477091973503e-06,
"loss": 0.7061780691146851,
"step": 1268
},
{
"epoch": 2.679324894514768,
"grad_norm": 1.9227761030197144,
"learning_rate": 5.3293890198722765e-06,
"loss": 0.40927794575691223,
"step": 1270
},
{
"epoch": 2.6835443037974684,
"grad_norm": 2.802013635635376,
"learning_rate": 5.321295918466793e-06,
"loss": 0.9143922924995422,
"step": 1272
},
{
"epoch": 2.6877637130801686,
"grad_norm": 14.795599937438965,
"learning_rate": 5.3131978315521355e-06,
"loss": 0.6321116089820862,
"step": 1274
},
{
"epoch": 2.691983122362869,
"grad_norm": 3.627547264099121,
"learning_rate": 5.305094802950368e-06,
"loss": 0.7536362409591675,
"step": 1276
},
{
"epoch": 2.6962025316455698,
"grad_norm": 0.6867983937263489,
"learning_rate": 5.296986876510293e-06,
"loss": 0.27872833609580994,
"step": 1278
},
{
"epoch": 2.70042194092827,
"grad_norm": 3.1109073162078857,
"learning_rate": 5.288874096107218e-06,
"loss": 0.8334829807281494,
"step": 1280
},
{
"epoch": 2.7046413502109705,
"grad_norm": 1.4203203916549683,
"learning_rate": 5.2807565056427155e-06,
"loss": 0.9659562110900879,
"step": 1282
},
{
"epoch": 2.708860759493671,
"grad_norm": 2.113590955734253,
"learning_rate": 5.2726341490443915e-06,
"loss": 0.3422914743423462,
"step": 1284
},
{
"epoch": 2.7130801687763713,
"grad_norm": 4.1795759201049805,
"learning_rate": 5.264507070265639e-06,
"loss": 0.44313400983810425,
"step": 1286
},
{
"epoch": 2.717299578059072,
"grad_norm": 0.8358619809150696,
"learning_rate": 5.256375313285407e-06,
"loss": 0.50257408618927,
"step": 1288
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.6378387808799744,
"learning_rate": 5.248238922107958e-06,
"loss": 0.5335341095924377,
"step": 1290
},
{
"epoch": 2.7257383966244726,
"grad_norm": 2.0149025917053223,
"learning_rate": 5.240097940762638e-06,
"loss": 0.9738786220550537,
"step": 1292
},
{
"epoch": 2.7299578059071727,
"grad_norm": 3.022477149963379,
"learning_rate": 5.231952413303623e-06,
"loss": 0.41252389550209045,
"step": 1294
},
{
"epoch": 2.7341772151898733,
"grad_norm": 2.5130767822265625,
"learning_rate": 5.2238023838097e-06,
"loss": 0.9761707186698914,
"step": 1296
},
{
"epoch": 2.738396624472574,
"grad_norm": 2.43636155128479,
"learning_rate": 5.21564789638401e-06,
"loss": 0.9268041253089905,
"step": 1298
},
{
"epoch": 2.742616033755274,
"grad_norm": 1.058410406112671,
"learning_rate": 5.207488995153821e-06,
"loss": 0.6909565925598145,
"step": 1300
},
{
"epoch": 2.7468354430379747,
"grad_norm": 1.987685203552246,
"learning_rate": 5.1993257242702874e-06,
"loss": 1.0122733116149902,
"step": 1302
},
{
"epoch": 2.7510548523206753,
"grad_norm": 1.9147788286209106,
"learning_rate": 5.191158127908207e-06,
"loss": 0.5920695066452026,
"step": 1304
},
{
"epoch": 2.7552742616033754,
"grad_norm": 2.714449405670166,
"learning_rate": 5.182986250265786e-06,
"loss": 1.0310044288635254,
"step": 1306
},
{
"epoch": 2.759493670886076,
"grad_norm": 1.7678923606872559,
"learning_rate": 5.174810135564397e-06,
"loss": 0.9253189563751221,
"step": 1308
},
{
"epoch": 2.7637130801687766,
"grad_norm": 5.892001152038574,
"learning_rate": 5.1666298280483436e-06,
"loss": 0.80256587266922,
"step": 1310
},
{
"epoch": 2.7679324894514767,
"grad_norm": 12.360274314880371,
"learning_rate": 5.158445371984614e-06,
"loss": 0.9463623762130737,
"step": 1312
},
{
"epoch": 2.7721518987341773,
"grad_norm": 2.3304073810577393,
"learning_rate": 5.150256811662653e-06,
"loss": 0.9907184839248657,
"step": 1314
},
{
"epoch": 2.7763713080168775,
"grad_norm": 3.796537160873413,
"learning_rate": 5.142064191394107e-06,
"loss": 0.609095573425293,
"step": 1316
},
{
"epoch": 2.780590717299578,
"grad_norm": 2.1420092582702637,
"learning_rate": 5.133867555512599e-06,
"loss": 0.5119812488555908,
"step": 1318
},
{
"epoch": 2.7848101265822782,
"grad_norm": 3.1301980018615723,
"learning_rate": 5.125666948373477e-06,
"loss": 0.9296759366989136,
"step": 1320
},
{
"epoch": 2.789029535864979,
"grad_norm": 2.0127930641174316,
"learning_rate": 5.1174624143535845e-06,
"loss": 0.4965199828147888,
"step": 1322
},
{
"epoch": 2.7932489451476794,
"grad_norm": 6.2706217765808105,
"learning_rate": 5.10925399785101e-06,
"loss": 1.0675724744796753,
"step": 1324
},
{
"epoch": 2.7974683544303796,
"grad_norm": 9.385963439941406,
"learning_rate": 5.101041743284855e-06,
"loss": 0.8606825470924377,
"step": 1326
},
{
"epoch": 2.80168776371308,
"grad_norm": 1.8741198778152466,
"learning_rate": 5.0928256950949874e-06,
"loss": 0.6247942447662354,
"step": 1328
},
{
"epoch": 2.8059071729957807,
"grad_norm": 1.5202703475952148,
"learning_rate": 5.084605897741808e-06,
"loss": 0.9863821268081665,
"step": 1330
},
{
"epoch": 2.810126582278481,
"grad_norm": 2.4014227390289307,
"learning_rate": 5.076382395706001e-06,
"loss": 0.7821711301803589,
"step": 1332
},
{
"epoch": 2.8143459915611815,
"grad_norm": 2.5113935470581055,
"learning_rate": 5.0681552334883015e-06,
"loss": 0.48877081274986267,
"step": 1334
},
{
"epoch": 2.818565400843882,
"grad_norm": 1.116832971572876,
"learning_rate": 5.059924455609252e-06,
"loss": 0.639763593673706,
"step": 1336
},
{
"epoch": 2.8227848101265822,
"grad_norm": 2.161829710006714,
"learning_rate": 5.051690106608958e-06,
"loss": 0.6487863063812256,
"step": 1338
},
{
"epoch": 2.827004219409283,
"grad_norm": 3.6037724018096924,
"learning_rate": 5.04345223104685e-06,
"loss": 0.8599737882614136,
"step": 1340
},
{
"epoch": 2.831223628691983,
"grad_norm": 25.895301818847656,
"learning_rate": 5.035210873501446e-06,
"loss": 0.8409707546234131,
"step": 1342
},
{
"epoch": 2.8354430379746836,
"grad_norm": 1.7677528858184814,
"learning_rate": 5.026966078570102e-06,
"loss": 1.0647809505462646,
"step": 1344
},
{
"epoch": 2.8396624472573837,
"grad_norm": 5.221706390380859,
"learning_rate": 5.0187178908687765e-06,
"loss": 0.6761691570281982,
"step": 1346
},
{
"epoch": 2.8438818565400843,
"grad_norm": 2.069338798522949,
"learning_rate": 5.010466355031788e-06,
"loss": 0.5935064554214478,
"step": 1348
},
{
"epoch": 2.848101265822785,
"grad_norm": 1.759072184562683,
"learning_rate": 5.002211515711574e-06,
"loss": 0.9735701680183411,
"step": 1350
},
{
"epoch": 2.852320675105485,
"grad_norm": 6.272833824157715,
"learning_rate": 4.993953417578447e-06,
"loss": 0.6328434944152832,
"step": 1352
},
{
"epoch": 2.8565400843881856,
"grad_norm": 8.284124374389648,
"learning_rate": 4.985692105320356e-06,
"loss": 0.6582671403884888,
"step": 1354
},
{
"epoch": 2.8607594936708862,
"grad_norm": 4.635993003845215,
"learning_rate": 4.977427623642641e-06,
"loss": 0.56138014793396,
"step": 1356
},
{
"epoch": 2.8649789029535864,
"grad_norm": 1.4133601188659668,
"learning_rate": 4.9691600172677945e-06,
"loss": 0.9400450587272644,
"step": 1358
},
{
"epoch": 2.869198312236287,
"grad_norm": 3.7701480388641357,
"learning_rate": 4.960889330935215e-06,
"loss": 0.8297948837280273,
"step": 1360
},
{
"epoch": 2.8734177215189876,
"grad_norm": 3.301804780960083,
"learning_rate": 4.952615609400973e-06,
"loss": 0.5724865794181824,
"step": 1362
},
{
"epoch": 2.8776371308016877,
"grad_norm": 3.880316972732544,
"learning_rate": 4.94433889743756e-06,
"loss": 0.9015120267868042,
"step": 1364
},
{
"epoch": 2.8818565400843883,
"grad_norm": 2.4567813873291016,
"learning_rate": 4.93605923983365e-06,
"loss": 1.0346885919570923,
"step": 1366
},
{
"epoch": 2.8860759493670884,
"grad_norm": 6.220330238342285,
"learning_rate": 4.92777668139386e-06,
"loss": 0.9936701059341431,
"step": 1368
},
{
"epoch": 2.890295358649789,
"grad_norm": 2.123796224594116,
"learning_rate": 4.919491266938501e-06,
"loss": 0.9021327495574951,
"step": 1370
},
{
"epoch": 2.894514767932489,
"grad_norm": 5.658734321594238,
"learning_rate": 4.911203041303342e-06,
"loss": 0.4772055745124817,
"step": 1372
},
{
"epoch": 2.8987341772151898,
"grad_norm": 3.852552890777588,
"learning_rate": 4.902912049339362e-06,
"loss": 0.7514923214912415,
"step": 1374
},
{
"epoch": 2.9029535864978904,
"grad_norm": 1.6569684743881226,
"learning_rate": 4.894618335912511e-06,
"loss": 0.9316278696060181,
"step": 1376
},
{
"epoch": 2.9071729957805905,
"grad_norm": 3.75467586517334,
"learning_rate": 4.886321945903466e-06,
"loss": 0.7876487374305725,
"step": 1378
},
{
"epoch": 2.911392405063291,
"grad_norm": 5.47602653503418,
"learning_rate": 4.8780229242073895e-06,
"loss": 1.141374111175537,
"step": 1380
},
{
"epoch": 2.9156118143459917,
"grad_norm": 4.051374435424805,
"learning_rate": 4.86972131573368e-06,
"loss": 0.640509307384491,
"step": 1382
},
{
"epoch": 2.919831223628692,
"grad_norm": 7.645662307739258,
"learning_rate": 4.86141716540574e-06,
"loss": 0.6477132439613342,
"step": 1384
},
{
"epoch": 2.9240506329113924,
"grad_norm": 2.780121088027954,
"learning_rate": 4.853110518160723e-06,
"loss": 0.5821589827537537,
"step": 1386
},
{
"epoch": 2.928270042194093,
"grad_norm": 4.329258441925049,
"learning_rate": 4.844801418949299e-06,
"loss": 0.9183673858642578,
"step": 1388
},
{
"epoch": 2.932489451476793,
"grad_norm": 1.9503802061080933,
"learning_rate": 4.836489912735402e-06,
"loss": 0.8357143402099609,
"step": 1390
},
{
"epoch": 2.9367088607594938,
"grad_norm": 1.0354031324386597,
"learning_rate": 4.8281760444959926e-06,
"loss": 0.45355841517448425,
"step": 1392
},
{
"epoch": 2.9409282700421944,
"grad_norm": 5.098978519439697,
"learning_rate": 4.8198598592208126e-06,
"loss": 0.6029504537582397,
"step": 1394
},
{
"epoch": 2.9451476793248945,
"grad_norm": 1.7021719217300415,
"learning_rate": 4.811541401912146e-06,
"loss": 0.8993232250213623,
"step": 1396
},
{
"epoch": 2.9493670886075947,
"grad_norm": 0.9127289652824402,
"learning_rate": 4.803220717584566e-06,
"loss": 0.7546182870864868,
"step": 1398
},
{
"epoch": 2.9535864978902953,
"grad_norm": 4.401092529296875,
"learning_rate": 4.7948978512647016e-06,
"loss": 0.813082218170166,
"step": 1400
},
{
"epoch": 2.957805907172996,
"grad_norm": 4.511460304260254,
"learning_rate": 4.786572847990987e-06,
"loss": 0.5571738481521606,
"step": 1402
},
{
"epoch": 2.962025316455696,
"grad_norm": 1.7611995935440063,
"learning_rate": 4.778245752813421e-06,
"loss": 0.9406437277793884,
"step": 1404
},
{
"epoch": 2.9662447257383966,
"grad_norm": 5.091633319854736,
"learning_rate": 4.769916610793324e-06,
"loss": 0.7957962155342102,
"step": 1406
},
{
"epoch": 2.970464135021097,
"grad_norm": 4.824461460113525,
"learning_rate": 4.76158546700309e-06,
"loss": 0.4907096028327942,
"step": 1408
},
{
"epoch": 2.9746835443037973,
"grad_norm": 3.2384073734283447,
"learning_rate": 4.75325236652595e-06,
"loss": 0.742721676826477,
"step": 1410
},
{
"epoch": 2.978902953586498,
"grad_norm": 3.1998229026794434,
"learning_rate": 4.744917354455715e-06,
"loss": 0.9864751100540161,
"step": 1412
},
{
"epoch": 2.9831223628691985,
"grad_norm": 8.642891883850098,
"learning_rate": 4.73658047589655e-06,
"loss": 0.8089879751205444,
"step": 1414
},
{
"epoch": 2.9873417721518987,
"grad_norm": 1.6954267024993896,
"learning_rate": 4.7282417759627134e-06,
"loss": 0.8185816407203674,
"step": 1416
},
{
"epoch": 2.9915611814345993,
"grad_norm": 1.4983083009719849,
"learning_rate": 4.719901299778325e-06,
"loss": 0.8309789896011353,
"step": 1418
},
{
"epoch": 2.9957805907173,
"grad_norm": 5.758739948272705,
"learning_rate": 4.71155909247711e-06,
"loss": 0.9021912813186646,
"step": 1420
},
{
"epoch": 3.0,
"grad_norm": 2.6232898235321045,
"learning_rate": 4.703215199202169e-06,
"loss": 0.2926831841468811,
"step": 1422
},
{
"epoch": 3.0042194092827006,
"grad_norm": 2.3824779987335205,
"learning_rate": 4.6948696651057225e-06,
"loss": 0.5607067346572876,
"step": 1424
},
{
"epoch": 3.0084388185654007,
"grad_norm": 10.018150329589844,
"learning_rate": 4.6865225353488675e-06,
"loss": 0.5354501008987427,
"step": 1426
},
{
"epoch": 3.0126582278481013,
"grad_norm": 16.426225662231445,
"learning_rate": 4.678173855101341e-06,
"loss": 0.5269479155540466,
"step": 1428
},
{
"epoch": 3.0168776371308015,
"grad_norm": 15.212722778320312,
"learning_rate": 4.669823669541266e-06,
"loss": 0.39293336868286133,
"step": 1430
},
{
"epoch": 3.021097046413502,
"grad_norm": 2.7751386165618896,
"learning_rate": 4.661472023854916e-06,
"loss": 0.8252520561218262,
"step": 1432
},
{
"epoch": 3.0253164556962027,
"grad_norm": 2.98408842086792,
"learning_rate": 4.653118963236458e-06,
"loss": 0.7142210006713867,
"step": 1434
},
{
"epoch": 3.029535864978903,
"grad_norm": 1.7343214750289917,
"learning_rate": 4.644764532887726e-06,
"loss": 0.8274791240692139,
"step": 1436
},
{
"epoch": 3.0337552742616034,
"grad_norm": 2.1739771366119385,
"learning_rate": 4.636408778017957e-06,
"loss": 0.3643840551376343,
"step": 1438
},
{
"epoch": 3.037974683544304,
"grad_norm": 1.6796656847000122,
"learning_rate": 4.6280517438435616e-06,
"loss": 0.7152677178382874,
"step": 1440
},
{
"epoch": 3.042194092827004,
"grad_norm": 1.2829041481018066,
"learning_rate": 4.61969347558787e-06,
"loss": 0.5194791555404663,
"step": 1442
},
{
"epoch": 3.0464135021097047,
"grad_norm": 1.6921758651733398,
"learning_rate": 4.6113340184808925e-06,
"loss": 0.6532431840896606,
"step": 1444
},
{
"epoch": 3.050632911392405,
"grad_norm": 1.7007097005844116,
"learning_rate": 4.602973417759071e-06,
"loss": 0.7926474809646606,
"step": 1446
},
{
"epoch": 3.0548523206751055,
"grad_norm": 2854.81396484375,
"learning_rate": 4.594611718665038e-06,
"loss": 0.5383695960044861,
"step": 1448
},
{
"epoch": 3.059071729957806,
"grad_norm": 4.033429145812988,
"learning_rate": 4.586248966447367e-06,
"loss": 0.6349921822547913,
"step": 1450
},
{
"epoch": 3.0632911392405062,
"grad_norm": 1.7910646200180054,
"learning_rate": 4.577885206360334e-06,
"loss": 0.7665805220603943,
"step": 1452
},
{
"epoch": 3.067510548523207,
"grad_norm": 2.6463685035705566,
"learning_rate": 4.5695204836636655e-06,
"loss": 0.617534875869751,
"step": 1454
},
{
"epoch": 3.071729957805907,
"grad_norm": 3.8018009662628174,
"learning_rate": 4.561154843622299e-06,
"loss": 0.4436488151550293,
"step": 1456
},
{
"epoch": 3.0759493670886076,
"grad_norm": 1.6345051527023315,
"learning_rate": 4.552788331506134e-06,
"loss": 0.45668232440948486,
"step": 1458
},
{
"epoch": 3.080168776371308,
"grad_norm": 10.521839141845703,
"learning_rate": 4.544420992589792e-06,
"loss": 0.5640779733657837,
"step": 1460
},
{
"epoch": 3.0843881856540083,
"grad_norm": 2.4284989833831787,
"learning_rate": 4.53605287215237e-06,
"loss": 0.7709170579910278,
"step": 1462
},
{
"epoch": 3.088607594936709,
"grad_norm": 1.7529208660125732,
"learning_rate": 4.527684015477188e-06,
"loss": 0.7688764333724976,
"step": 1464
},
{
"epoch": 3.0928270042194095,
"grad_norm": 29.2982177734375,
"learning_rate": 4.519314467851555e-06,
"loss": 0.7450973987579346,
"step": 1466
},
{
"epoch": 3.0970464135021096,
"grad_norm": 6.658877372741699,
"learning_rate": 4.510944274566518e-06,
"loss": 0.5714298486709595,
"step": 1468
},
{
"epoch": 3.1012658227848102,
"grad_norm": 10.68608283996582,
"learning_rate": 4.502573480916617e-06,
"loss": 0.17385733127593994,
"step": 1470
},
{
"epoch": 3.1054852320675104,
"grad_norm": 5.784574508666992,
"learning_rate": 4.494202132199643e-06,
"loss": 0.9861471652984619,
"step": 1472
},
{
"epoch": 3.109704641350211,
"grad_norm": 0.49223047494888306,
"learning_rate": 4.485830273716386e-06,
"loss": 0.3651547431945801,
"step": 1474
},
{
"epoch": 3.1139240506329116,
"grad_norm": 3.3575212955474854,
"learning_rate": 4.4774579507704e-06,
"loss": 0.8966869115829468,
"step": 1476
},
{
"epoch": 3.1181434599156117,
"grad_norm": 3.334437370300293,
"learning_rate": 4.46908520866775e-06,
"loss": 0.9988681077957153,
"step": 1478
},
{
"epoch": 3.1223628691983123,
"grad_norm": 1.8539319038391113,
"learning_rate": 4.460712092716768e-06,
"loss": 0.7239236831665039,
"step": 1480
},
{
"epoch": 3.1265822784810124,
"grad_norm": 3.4966561794281006,
"learning_rate": 4.452338648227813e-06,
"loss": 0.8891302347183228,
"step": 1482
},
{
"epoch": 3.130801687763713,
"grad_norm": 2.0679361820220947,
"learning_rate": 4.443964920513017e-06,
"loss": 0.8403599262237549,
"step": 1484
},
{
"epoch": 3.1350210970464136,
"grad_norm": 6.140291690826416,
"learning_rate": 4.435590954886047e-06,
"loss": 0.5731205940246582,
"step": 1486
},
{
"epoch": 3.1392405063291138,
"grad_norm": 2.278001070022583,
"learning_rate": 4.427216796661857e-06,
"loss": 0.5262531638145447,
"step": 1488
},
{
"epoch": 3.1434599156118144,
"grad_norm": 3.5377111434936523,
"learning_rate": 4.418842491156445e-06,
"loss": 0.8218955993652344,
"step": 1490
},
{
"epoch": 3.147679324894515,
"grad_norm": 1.6017743349075317,
"learning_rate": 4.410468083686605e-06,
"loss": 0.5413227081298828,
"step": 1492
},
{
"epoch": 3.151898734177215,
"grad_norm": 3.7507541179656982,
"learning_rate": 4.402093619569679e-06,
"loss": 0.7044564485549927,
"step": 1494
},
{
"epoch": 3.1561181434599157,
"grad_norm": 3.2674598693847656,
"learning_rate": 4.393719144123321e-06,
"loss": 0.6519253253936768,
"step": 1496
},
{
"epoch": 3.160337552742616,
"grad_norm": 1.4527074098587036,
"learning_rate": 4.385344702665246e-06,
"loss": 0.3425239622592926,
"step": 1498
},
{
"epoch": 3.1645569620253164,
"grad_norm": 1.9561620950698853,
"learning_rate": 4.376970340512979e-06,
"loss": 0.4482334852218628,
"step": 1500
},
{
"epoch": 3.168776371308017,
"grad_norm": 2.0695550441741943,
"learning_rate": 4.368596102983623e-06,
"loss": 0.7770338654518127,
"step": 1502
},
{
"epoch": 3.172995780590717,
"grad_norm": 2.4615397453308105,
"learning_rate": 4.360222035393603e-06,
"loss": 0.6019558906555176,
"step": 1504
},
{
"epoch": 3.1772151898734178,
"grad_norm": 2.1790032386779785,
"learning_rate": 4.351848183058427e-06,
"loss": 0.8018068075180054,
"step": 1506
},
{
"epoch": 3.181434599156118,
"grad_norm": 3.2153782844543457,
"learning_rate": 4.343474591292432e-06,
"loss": 0.8963441848754883,
"step": 1508
},
{
"epoch": 3.1856540084388185,
"grad_norm": 4.124518394470215,
"learning_rate": 4.335101305408552e-06,
"loss": 0.7522740960121155,
"step": 1510
},
{
"epoch": 3.189873417721519,
"grad_norm": 0.789757251739502,
"learning_rate": 4.3267283707180635e-06,
"loss": 0.2408222258090973,
"step": 1512
},
{
"epoch": 3.1940928270042193,
"grad_norm": 2.3271267414093018,
"learning_rate": 4.31835583253034e-06,
"loss": 0.7659440636634827,
"step": 1514
},
{
"epoch": 3.19831223628692,
"grad_norm": 1.0062589645385742,
"learning_rate": 4.309983736152612e-06,
"loss": 0.5749263763427734,
"step": 1516
},
{
"epoch": 3.2025316455696204,
"grad_norm": 3.043958902359009,
"learning_rate": 4.301612126889719e-06,
"loss": 0.7307943105697632,
"step": 1518
},
{
"epoch": 3.2067510548523206,
"grad_norm": 4.167404651641846,
"learning_rate": 4.293241050043863e-06,
"loss": 0.6726250648498535,
"step": 1520
},
{
"epoch": 3.210970464135021,
"grad_norm": 7.460043430328369,
"learning_rate": 4.284870550914368e-06,
"loss": 0.27290791273117065,
"step": 1522
},
{
"epoch": 3.2151898734177213,
"grad_norm": 7.231512546539307,
"learning_rate": 4.276500674797427e-06,
"loss": 0.6644264459609985,
"step": 1524
},
{
"epoch": 3.219409282700422,
"grad_norm": 2.330780029296875,
"learning_rate": 4.268131466985867e-06,
"loss": 0.5520614385604858,
"step": 1526
},
{
"epoch": 3.2236286919831225,
"grad_norm": 0.9592808485031128,
"learning_rate": 4.259762972768895e-06,
"loss": 0.2992947995662689,
"step": 1528
},
{
"epoch": 3.2278481012658227,
"grad_norm": 4.56012487411499,
"learning_rate": 4.2513952374318556e-06,
"loss": 0.6852157115936279,
"step": 1530
},
{
"epoch": 3.2320675105485233,
"grad_norm": 1.3004289865493774,
"learning_rate": 4.24302830625599e-06,
"loss": 0.18629956245422363,
"step": 1532
},
{
"epoch": 3.2362869198312234,
"grad_norm": 2.980001449584961,
"learning_rate": 4.2346622245181864e-06,
"loss": 0.6743506193161011,
"step": 1534
},
{
"epoch": 3.240506329113924,
"grad_norm": 2.0032145977020264,
"learning_rate": 4.226297037490735e-06,
"loss": 0.779093861579895,
"step": 1536
},
{
"epoch": 3.2447257383966246,
"grad_norm": 1.4844651222229004,
"learning_rate": 4.217932790441087e-06,
"loss": 0.7138203382492065,
"step": 1538
},
{
"epoch": 3.2489451476793247,
"grad_norm": 9.559041023254395,
"learning_rate": 4.209569528631604e-06,
"loss": 0.726833701133728,
"step": 1540
},
{
"epoch": 3.2531645569620253,
"grad_norm": 5.846635341644287,
"learning_rate": 4.201207297319318e-06,
"loss": 0.577594518661499,
"step": 1542
},
{
"epoch": 3.257383966244726,
"grad_norm": 14.799168586730957,
"learning_rate": 4.192846141755686e-06,
"loss": 0.6153043508529663,
"step": 1544
},
{
"epoch": 3.261603375527426,
"grad_norm": 4.5415568351745605,
"learning_rate": 4.184486107186338e-06,
"loss": 0.3514612317085266,
"step": 1546
},
{
"epoch": 3.2658227848101267,
"grad_norm": 1.909765362739563,
"learning_rate": 4.176127238850845e-06,
"loss": 0.6445936560630798,
"step": 1548
},
{
"epoch": 3.270042194092827,
"grad_norm": 4.638195037841797,
"learning_rate": 4.1677695819824615e-06,
"loss": 0.32674679160118103,
"step": 1550
},
{
"epoch": 3.2742616033755274,
"grad_norm": 1.878891944885254,
"learning_rate": 4.159413181807891e-06,
"loss": 0.2638033628463745,
"step": 1552
},
{
"epoch": 3.278481012658228,
"grad_norm": 6.237101078033447,
"learning_rate": 4.151058083547031e-06,
"loss": 0.46362948417663574,
"step": 1554
},
{
"epoch": 3.282700421940928,
"grad_norm": 2.6107330322265625,
"learning_rate": 4.142704332412738e-06,
"loss": 0.767645001411438,
"step": 1556
},
{
"epoch": 3.2869198312236287,
"grad_norm": 0.5579416155815125,
"learning_rate": 4.1343519736105785e-06,
"loss": 0.6301885843276978,
"step": 1558
},
{
"epoch": 3.291139240506329,
"grad_norm": 2.669329881668091,
"learning_rate": 4.126001052338581e-06,
"loss": 0.4373775124549866,
"step": 1560
},
{
"epoch": 3.2953586497890295,
"grad_norm": 3.803680181503296,
"learning_rate": 4.1176516137870004e-06,
"loss": 0.5417683720588684,
"step": 1562
},
{
"epoch": 3.29957805907173,
"grad_norm": 7.650591850280762,
"learning_rate": 4.109303703138063e-06,
"loss": 0.7619826793670654,
"step": 1564
},
{
"epoch": 3.3037974683544302,
"grad_norm": 12.526813507080078,
"learning_rate": 4.1009573655657295e-06,
"loss": 0.696597695350647,
"step": 1566
},
{
"epoch": 3.308016877637131,
"grad_norm": 1.6425329446792603,
"learning_rate": 4.092612646235447e-06,
"loss": 0.4307796359062195,
"step": 1568
},
{
"epoch": 3.3122362869198314,
"grad_norm": 3.352663516998291,
"learning_rate": 4.084269590303907e-06,
"loss": 0.3921862244606018,
"step": 1570
},
{
"epoch": 3.3164556962025316,
"grad_norm": 2.774632215499878,
"learning_rate": 4.075928242918798e-06,
"loss": 0.4460093677043915,
"step": 1572
},
{
"epoch": 3.320675105485232,
"grad_norm": 2.935922384262085,
"learning_rate": 4.067588649218564e-06,
"loss": 0.935857892036438,
"step": 1574
},
{
"epoch": 3.3248945147679323,
"grad_norm": 2.058513641357422,
"learning_rate": 4.059250854332159e-06,
"loss": 0.6347423791885376,
"step": 1576
},
{
"epoch": 3.329113924050633,
"grad_norm": 1.685788869857788,
"learning_rate": 4.050914903378802e-06,
"loss": 0.7031244039535522,
"step": 1578
},
{
"epoch": 3.3333333333333335,
"grad_norm": 2.3649990558624268,
"learning_rate": 4.0425808414677345e-06,
"loss": 0.43982017040252686,
"step": 1580
},
{
"epoch": 3.3375527426160336,
"grad_norm": 1.6167001724243164,
"learning_rate": 4.034248713697977e-06,
"loss": 0.40530964732170105,
"step": 1582
},
{
"epoch": 3.3417721518987342,
"grad_norm": 10.420100212097168,
"learning_rate": 4.025918565158079e-06,
"loss": 0.6115049123764038,
"step": 1584
},
{
"epoch": 3.3459915611814344,
"grad_norm": 7.325076580047607,
"learning_rate": 4.0175904409258844e-06,
"loss": 0.5467356443405151,
"step": 1586
},
{
"epoch": 3.350210970464135,
"grad_norm": 2.629723310470581,
"learning_rate": 4.009264386068281e-06,
"loss": 0.3660237789154053,
"step": 1588
},
{
"epoch": 3.3544303797468356,
"grad_norm": 1.3594039678573608,
"learning_rate": 4.000940445640959e-06,
"loss": 0.8356277942657471,
"step": 1590
},
{
"epoch": 3.3586497890295357,
"grad_norm": 2.2674577236175537,
"learning_rate": 3.992618664688165e-06,
"loss": 0.7481639981269836,
"step": 1592
},
{
"epoch": 3.3628691983122363,
"grad_norm": 0.9013781547546387,
"learning_rate": 3.98429908824246e-06,
"loss": 0.3871142268180847,
"step": 1594
},
{
"epoch": 3.367088607594937,
"grad_norm": 3.642791271209717,
"learning_rate": 3.975981761324477e-06,
"loss": 0.4039541482925415,
"step": 1596
},
{
"epoch": 3.371308016877637,
"grad_norm": 1.674315333366394,
"learning_rate": 3.967666728942675e-06,
"loss": 0.3363262712955475,
"step": 1598
},
{
"epoch": 3.3755274261603376,
"grad_norm": 4.324201583862305,
"learning_rate": 3.959354036093097e-06,
"loss": 0.45887890458106995,
"step": 1600
},
{
"epoch": 3.379746835443038,
"grad_norm": 1.464799404144287,
"learning_rate": 3.951043727759125e-06,
"loss": 0.47278836369514465,
"step": 1602
},
{
"epoch": 3.3839662447257384,
"grad_norm": 0.5969643592834473,
"learning_rate": 3.942735848911236e-06,
"loss": 0.4599458575248718,
"step": 1604
},
{
"epoch": 3.388185654008439,
"grad_norm": 34.39630126953125,
"learning_rate": 3.9344304445067644e-06,
"loss": 0.8346083164215088,
"step": 1606
},
{
"epoch": 3.392405063291139,
"grad_norm": 2.1359801292419434,
"learning_rate": 3.9261275594896495e-06,
"loss": 0.532837450504303,
"step": 1608
},
{
"epoch": 3.3966244725738397,
"grad_norm": 1.2699977159500122,
"learning_rate": 3.9178272387902e-06,
"loss": 0.2630946636199951,
"step": 1610
},
{
"epoch": 3.40084388185654,
"grad_norm": 2.12693452835083,
"learning_rate": 3.909529527324849e-06,
"loss": 0.7574643492698669,
"step": 1612
},
{
"epoch": 3.4050632911392404,
"grad_norm": 2.4170689582824707,
"learning_rate": 3.9012344699959045e-06,
"loss": 0.2519644498825073,
"step": 1614
},
{
"epoch": 3.409282700421941,
"grad_norm": 3.567059278488159,
"learning_rate": 3.892942111691319e-06,
"loss": 0.6072185039520264,
"step": 1616
},
{
"epoch": 3.413502109704641,
"grad_norm": 2.984300136566162,
"learning_rate": 3.884652497284436e-06,
"loss": 0.9044985771179199,
"step": 1618
},
{
"epoch": 3.4177215189873418,
"grad_norm": 1.356608271598816,
"learning_rate": 3.8763656716337496e-06,
"loss": 0.8276529908180237,
"step": 1620
},
{
"epoch": 3.4219409282700424,
"grad_norm": 2.127027750015259,
"learning_rate": 3.868081679582664e-06,
"loss": 0.45381200313568115,
"step": 1622
},
{
"epoch": 3.4261603375527425,
"grad_norm": 9.49200439453125,
"learning_rate": 3.8598005659592505e-06,
"loss": 0.35857370495796204,
"step": 1624
},
{
"epoch": 3.430379746835443,
"grad_norm": 7.919655799865723,
"learning_rate": 3.851522375576004e-06,
"loss": 0.2886282801628113,
"step": 1626
},
{
"epoch": 3.4345991561181437,
"grad_norm": 2.5264101028442383,
"learning_rate": 3.843247153229598e-06,
"loss": 0.7439049482345581,
"step": 1628
},
{
"epoch": 3.438818565400844,
"grad_norm": 3.5521364212036133,
"learning_rate": 3.834974943700646e-06,
"loss": 0.1677057147026062,
"step": 1630
},
{
"epoch": 3.4430379746835444,
"grad_norm": 3.5299649238586426,
"learning_rate": 3.82670579175346e-06,
"loss": 0.867900013923645,
"step": 1632
},
{
"epoch": 3.4472573839662446,
"grad_norm": 1.0358866453170776,
"learning_rate": 3.818439742135804e-06,
"loss": 0.4616679549217224,
"step": 1634
},
{
"epoch": 3.451476793248945,
"grad_norm": 17.166608810424805,
"learning_rate": 3.8101768395786555e-06,
"loss": 0.8641064167022705,
"step": 1636
},
{
"epoch": 3.4556962025316453,
"grad_norm": 3.9892635345458984,
"learning_rate": 3.80191712879596e-06,
"loss": 0.7791248559951782,
"step": 1638
},
{
"epoch": 3.459915611814346,
"grad_norm": 3.8166918754577637,
"learning_rate": 3.7936606544843936e-06,
"loss": 0.8491038084030151,
"step": 1640
},
{
"epoch": 3.4641350210970465,
"grad_norm": 12.44215202331543,
"learning_rate": 3.7854074613231156e-06,
"loss": 0.8689329624176025,
"step": 1642
},
{
"epoch": 3.4683544303797467,
"grad_norm": 9.996358871459961,
"learning_rate": 3.777157593973531e-06,
"loss": 0.1393229067325592,
"step": 1644
},
{
"epoch": 3.4725738396624473,
"grad_norm": 2.609384298324585,
"learning_rate": 3.768911097079048e-06,
"loss": 0.5422978401184082,
"step": 1646
},
{
"epoch": 3.476793248945148,
"grad_norm": 1.4818031787872314,
"learning_rate": 3.7606680152648363e-06,
"loss": 0.6728254556655884,
"step": 1648
},
{
"epoch": 3.481012658227848,
"grad_norm": 2.1601715087890625,
"learning_rate": 3.752428393137582e-06,
"loss": 0.35271987318992615,
"step": 1650
},
{
"epoch": 3.4852320675105486,
"grad_norm": 2.9078116416931152,
"learning_rate": 3.744192275285254e-06,
"loss": 0.6402429938316345,
"step": 1652
},
{
"epoch": 3.489451476793249,
"grad_norm": 1.4320260286331177,
"learning_rate": 3.735959706276855e-06,
"loss": 0.4159366488456726,
"step": 1654
},
{
"epoch": 3.4936708860759493,
"grad_norm": 2.64323091506958,
"learning_rate": 3.727730730662185e-06,
"loss": 0.45933040976524353,
"step": 1656
},
{
"epoch": 3.49789029535865,
"grad_norm": 5.375485897064209,
"learning_rate": 3.719505392971597e-06,
"loss": 0.7267172336578369,
"step": 1658
},
{
"epoch": 3.50210970464135,
"grad_norm": 1.8793143033981323,
"learning_rate": 3.7112837377157595e-06,
"loss": 0.750633955001831,
"step": 1660
},
{
"epoch": 3.5063291139240507,
"grad_norm": 3.7220072746276855,
"learning_rate": 3.7030658093854116e-06,
"loss": 0.7886282205581665,
"step": 1662
},
{
"epoch": 3.510548523206751,
"grad_norm": 4.647188186645508,
"learning_rate": 3.6948516524511284e-06,
"loss": 0.4952489733695984,
"step": 1664
},
{
"epoch": 3.5147679324894514,
"grad_norm": 4.3419575691223145,
"learning_rate": 3.686641311363072e-06,
"loss": 0.7061523199081421,
"step": 1666
},
{
"epoch": 3.518987341772152,
"grad_norm": 2.792799949645996,
"learning_rate": 3.678434830550758e-06,
"loss": 0.4294711947441101,
"step": 1668
},
{
"epoch": 3.523206751054852,
"grad_norm": 3.9279825687408447,
"learning_rate": 3.670232254422812e-06,
"loss": 0.6987364888191223,
"step": 1670
},
{
"epoch": 3.5274261603375527,
"grad_norm": 4.345192909240723,
"learning_rate": 3.6620336273667292e-06,
"loss": 0.2978661060333252,
"step": 1672
},
{
"epoch": 3.5316455696202533,
"grad_norm": 2.069209575653076,
"learning_rate": 3.6538389937486356e-06,
"loss": 0.4812040627002716,
"step": 1674
},
{
"epoch": 3.5358649789029535,
"grad_norm": 14.877073287963867,
"learning_rate": 3.6456483979130477e-06,
"loss": 0.5612766146659851,
"step": 1676
},
{
"epoch": 3.540084388185654,
"grad_norm": 6.497949600219727,
"learning_rate": 3.6374618841826285e-06,
"loss": 0.6456748843193054,
"step": 1678
},
{
"epoch": 3.5443037974683547,
"grad_norm": 6.732306003570557,
"learning_rate": 3.629279496857955e-06,
"loss": 0.713530421257019,
"step": 1680
},
{
"epoch": 3.548523206751055,
"grad_norm": 2.6278674602508545,
"learning_rate": 3.621101280217272e-06,
"loss": 0.6881183385848999,
"step": 1682
},
{
"epoch": 3.5527426160337554,
"grad_norm": 5.155986785888672,
"learning_rate": 3.612927278516257e-06,
"loss": 0.5856807827949524,
"step": 1684
},
{
"epoch": 3.5569620253164556,
"grad_norm": 6.799246788024902,
"learning_rate": 3.6047575359877768e-06,
"loss": 0.36446380615234375,
"step": 1686
},
{
"epoch": 3.561181434599156,
"grad_norm": 1.061477541923523,
"learning_rate": 3.596592096841651e-06,
"loss": 0.4035094976425171,
"step": 1688
},
{
"epoch": 3.5654008438818563,
"grad_norm": 4.340595245361328,
"learning_rate": 3.5884310052644127e-06,
"loss": 0.7940167188644409,
"step": 1690
},
{
"epoch": 3.569620253164557,
"grad_norm": 8.34933853149414,
"learning_rate": 3.580274305419067e-06,
"loss": 0.25536781549453735,
"step": 1692
},
{
"epoch": 3.5738396624472575,
"grad_norm": 0.517219603061676,
"learning_rate": 3.572122041444853e-06,
"loss": 0.3392212688922882,
"step": 1694
},
{
"epoch": 3.5780590717299576,
"grad_norm": 7.081967830657959,
"learning_rate": 3.5639742574570084e-06,
"loss": 0.24323059618473053,
"step": 1696
},
{
"epoch": 3.5822784810126582,
"grad_norm": 1.9360688924789429,
"learning_rate": 3.5558309975465256e-06,
"loss": 0.600135326385498,
"step": 1698
},
{
"epoch": 3.586497890295359,
"grad_norm": 2.5145275592803955,
"learning_rate": 3.5476923057799165e-06,
"loss": 0.4567859172821045,
"step": 1700
},
{
"epoch": 3.590717299578059,
"grad_norm": 3.178347110748291,
"learning_rate": 3.53955822619897e-06,
"loss": 0.4825342893600464,
"step": 1702
},
{
"epoch": 3.5949367088607596,
"grad_norm": 2.0541470050811768,
"learning_rate": 3.531428802820521e-06,
"loss": 1.0025891065597534,
"step": 1704
},
{
"epoch": 3.59915611814346,
"grad_norm": 1.977526307106018,
"learning_rate": 3.5233040796362038e-06,
"loss": 0.5798022747039795,
"step": 1706
},
{
"epoch": 3.6033755274261603,
"grad_norm": 4.426157474517822,
"learning_rate": 3.515184100612222e-06,
"loss": 0.5708905458450317,
"step": 1708
},
{
"epoch": 3.607594936708861,
"grad_norm": 0.8931450843811035,
"learning_rate": 3.5070689096891045e-06,
"loss": 0.3289738893508911,
"step": 1710
},
{
"epoch": 3.611814345991561,
"grad_norm": 2.223947048187256,
"learning_rate": 3.4989585507814684e-06,
"loss": 0.6438009142875671,
"step": 1712
},
{
"epoch": 3.6160337552742616,
"grad_norm": 4.223023414611816,
"learning_rate": 3.4908530677777846e-06,
"loss": 0.8552393913269043,
"step": 1714
},
{
"epoch": 3.620253164556962,
"grad_norm": 1.7854139804840088,
"learning_rate": 3.482752504540138e-06,
"loss": 0.4675080180168152,
"step": 1716
},
{
"epoch": 3.6244725738396624,
"grad_norm": 2.395404577255249,
"learning_rate": 3.474656904903991e-06,
"loss": 0.35858801007270813,
"step": 1718
},
{
"epoch": 3.628691983122363,
"grad_norm": 4.765064239501953,
"learning_rate": 3.466566312677946e-06,
"loss": 0.3300427198410034,
"step": 1720
},
{
"epoch": 3.632911392405063,
"grad_norm": 2.5676372051239014,
"learning_rate": 3.458480771643507e-06,
"loss": 0.7667765617370605,
"step": 1722
},
{
"epoch": 3.6371308016877637,
"grad_norm": 5.16605281829834,
"learning_rate": 3.4504003255548454e-06,
"loss": 0.3946114182472229,
"step": 1724
},
{
"epoch": 3.6413502109704643,
"grad_norm": 15.37302303314209,
"learning_rate": 3.44232501813856e-06,
"loss": 0.31146499514579773,
"step": 1726
},
{
"epoch": 3.6455696202531644,
"grad_norm": 11.36103343963623,
"learning_rate": 3.4342548930934447e-06,
"loss": 0.7634888887405396,
"step": 1728
},
{
"epoch": 3.649789029535865,
"grad_norm": 8.875736236572266,
"learning_rate": 3.426189994090249e-06,
"loss": 0.20420894026756287,
"step": 1730
},
{
"epoch": 3.6540084388185656,
"grad_norm": 6.140727996826172,
"learning_rate": 3.418130364771438e-06,
"loss": 0.7999590635299683,
"step": 1732
},
{
"epoch": 3.6582278481012658,
"grad_norm": 3.5605967044830322,
"learning_rate": 3.4100760487509677e-06,
"loss": 0.22376415133476257,
"step": 1734
},
{
"epoch": 3.6624472573839664,
"grad_norm": 2.0715627670288086,
"learning_rate": 3.4020270896140338e-06,
"loss": 0.30320820212364197,
"step": 1736
},
{
"epoch": 3.6666666666666665,
"grad_norm": 1.8760136365890503,
"learning_rate": 3.3939835309168494e-06,
"loss": 0.5345732569694519,
"step": 1738
},
{
"epoch": 3.670886075949367,
"grad_norm": 5.121237277984619,
"learning_rate": 3.385945416186402e-06,
"loss": 0.25805044174194336,
"step": 1740
},
{
"epoch": 3.6751054852320673,
"grad_norm": 1.5474026203155518,
"learning_rate": 3.377912788920218e-06,
"loss": 0.811784029006958,
"step": 1742
},
{
"epoch": 3.679324894514768,
"grad_norm": 1.5448044538497925,
"learning_rate": 3.3698856925861306e-06,
"loss": 0.4863538146018982,
"step": 1744
},
{
"epoch": 3.6835443037974684,
"grad_norm": 4.263956069946289,
"learning_rate": 3.361864170622043e-06,
"loss": 0.38036102056503296,
"step": 1746
},
{
"epoch": 3.6877637130801686,
"grad_norm": 2.2748067378997803,
"learning_rate": 3.3538482664356938e-06,
"loss": 0.8080613613128662,
"step": 1748
},
{
"epoch": 3.691983122362869,
"grad_norm": 2.969224214553833,
"learning_rate": 3.345838023404419e-06,
"loss": 0.7013299465179443,
"step": 1750
},
{
"epoch": 3.6962025316455698,
"grad_norm": 2.08278751373291,
"learning_rate": 3.3378334848749193e-06,
"loss": 0.6944292187690735,
"step": 1752
},
{
"epoch": 3.70042194092827,
"grad_norm": 2.476149797439575,
"learning_rate": 3.329834694163032e-06,
"loss": 0.8725452423095703,
"step": 1754
},
{
"epoch": 3.7046413502109705,
"grad_norm": 18.001956939697266,
"learning_rate": 3.321841694553482e-06,
"loss": 0.6215965747833252,
"step": 1756
},
{
"epoch": 3.708860759493671,
"grad_norm": 5.473003387451172,
"learning_rate": 3.3138545292996636e-06,
"loss": 0.7003090977668762,
"step": 1758
},
{
"epoch": 3.7130801687763713,
"grad_norm": 24.688859939575195,
"learning_rate": 3.305873241623395e-06,
"loss": 0.6492451429367065,
"step": 1760
},
{
"epoch": 3.717299578059072,
"grad_norm": 5.999505996704102,
"learning_rate": 3.2978978747146886e-06,
"loss": 0.27890729904174805,
"step": 1762
},
{
"epoch": 3.721518987341772,
"grad_norm": 6.559441566467285,
"learning_rate": 3.28992847173152e-06,
"loss": 0.382098525762558,
"step": 1764
},
{
"epoch": 3.7257383966244726,
"grad_norm": 2.456238269805908,
"learning_rate": 3.2819650757995882e-06,
"loss": 0.7096537947654724,
"step": 1766
},
{
"epoch": 3.7299578059071727,
"grad_norm": 8.340387344360352,
"learning_rate": 3.2740077300120874e-06,
"loss": 0.5058803558349609,
"step": 1768
},
{
"epoch": 3.7341772151898733,
"grad_norm": 2.959620952606201,
"learning_rate": 3.2660564774294698e-06,
"loss": 0.5690555572509766,
"step": 1770
},
{
"epoch": 3.738396624472574,
"grad_norm": 1.9937338829040527,
"learning_rate": 3.2581113610792186e-06,
"loss": 0.6931591033935547,
"step": 1772
},
{
"epoch": 3.742616033755274,
"grad_norm": 1.217617392539978,
"learning_rate": 3.2501724239556093e-06,
"loss": 0.20921635627746582,
"step": 1774
},
{
"epoch": 3.7468354430379747,
"grad_norm": 0.2667827904224396,
"learning_rate": 3.2422397090194763e-06,
"loss": 0.3903126120567322,
"step": 1776
},
{
"epoch": 3.7510548523206753,
"grad_norm": 3.232510566711426,
"learning_rate": 3.2343132591979893e-06,
"loss": 0.6602214574813843,
"step": 1778
},
{
"epoch": 3.7552742616033754,
"grad_norm": 1.6198503971099854,
"learning_rate": 3.2263931173844077e-06,
"loss": 0.7261852025985718,
"step": 1780
},
{
"epoch": 3.759493670886076,
"grad_norm": 2.057166814804077,
"learning_rate": 3.2184793264378635e-06,
"loss": 0.6649327278137207,
"step": 1782
},
{
"epoch": 3.7637130801687766,
"grad_norm": 2.829087495803833,
"learning_rate": 3.210571929183115e-06,
"loss": 0.6382551789283752,
"step": 1784
},
{
"epoch": 3.7679324894514767,
"grad_norm": 2.4798736572265625,
"learning_rate": 3.2026709684103248e-06,
"loss": 0.6738499402999878,
"step": 1786
},
{
"epoch": 3.7721518987341773,
"grad_norm": 10.70611572265625,
"learning_rate": 3.194776486874825e-06,
"loss": 0.19844934344291687,
"step": 1788
},
{
"epoch": 3.7763713080168775,
"grad_norm": 4.095230579376221,
"learning_rate": 3.186888527296885e-06,
"loss": 0.5124695301055908,
"step": 1790
},
{
"epoch": 3.780590717299578,
"grad_norm": 2.3026554584503174,
"learning_rate": 3.1790071323614794e-06,
"loss": 0.6329219937324524,
"step": 1792
},
{
"epoch": 3.7848101265822782,
"grad_norm": 5.607376575469971,
"learning_rate": 3.1711323447180637e-06,
"loss": 0.5636836290359497,
"step": 1794
},
{
"epoch": 3.789029535864979,
"grad_norm": 2.444586992263794,
"learning_rate": 3.163264206980336e-06,
"loss": 0.6737933158874512,
"step": 1796
},
{
"epoch": 3.7932489451476794,
"grad_norm": 4.4093451499938965,
"learning_rate": 3.155402761726006e-06,
"loss": 0.8205442428588867,
"step": 1798
},
{
"epoch": 3.7974683544303796,
"grad_norm": 2.5362284183502197,
"learning_rate": 3.1475480514965733e-06,
"loss": 0.7304701209068298,
"step": 1800
},
{
"epoch": 3.80168776371308,
"grad_norm": 1.82133150100708,
"learning_rate": 3.139700118797088e-06,
"loss": 0.7703126072883606,
"step": 1802
},
{
"epoch": 3.8059071729957807,
"grad_norm": 1.8650217056274414,
"learning_rate": 3.131859006095926e-06,
"loss": 0.45118463039398193,
"step": 1804
},
{
"epoch": 3.810126582278481,
"grad_norm": 17.568998336791992,
"learning_rate": 3.124024755824554e-06,
"loss": 0.2017352283000946,
"step": 1806
},
{
"epoch": 3.8143459915611815,
"grad_norm": 3.5482592582702637,
"learning_rate": 3.1161974103773066e-06,
"loss": 0.728500485420227,
"step": 1808
},
{
"epoch": 3.818565400843882,
"grad_norm": 2.8701515197753906,
"learning_rate": 3.108377012111154e-06,
"loss": 0.7613662481307983,
"step": 1810
},
{
"epoch": 3.8227848101265822,
"grad_norm": 3.2422940731048584,
"learning_rate": 3.10056360334547e-06,
"loss": 0.37432968616485596,
"step": 1812
},
{
"epoch": 3.827004219409283,
"grad_norm": 1.7439910173416138,
"learning_rate": 3.0927572263618062e-06,
"loss": 0.7083200216293335,
"step": 1814
},
{
"epoch": 3.831223628691983,
"grad_norm": 3.794440746307373,
"learning_rate": 3.084957923403662e-06,
"loss": 0.7253645658493042,
"step": 1816
},
{
"epoch": 3.8354430379746836,
"grad_norm": 8.467775344848633,
"learning_rate": 3.0771657366762586e-06,
"loss": 0.6260569095611572,
"step": 1818
},
{
"epoch": 3.8396624472573837,
"grad_norm": 6.704847812652588,
"learning_rate": 3.069380708346305e-06,
"loss": 0.5025795698165894,
"step": 1820
},
{
"epoch": 3.8438818565400843,
"grad_norm": 1.6902318000793457,
"learning_rate": 3.061602880541776e-06,
"loss": 0.6335855722427368,
"step": 1822
},
{
"epoch": 3.848101265822785,
"grad_norm": 3.424485206604004,
"learning_rate": 3.0538322953516807e-06,
"loss": 0.5025821328163147,
"step": 1824
},
{
"epoch": 3.852320675105485,
"grad_norm": 3.550658941268921,
"learning_rate": 3.046068994825832e-06,
"loss": 0.7374518513679504,
"step": 1826
},
{
"epoch": 3.8565400843881856,
"grad_norm": 4.101608753204346,
"learning_rate": 3.0383130209746287e-06,
"loss": 0.7142576575279236,
"step": 1828
},
{
"epoch": 3.8607594936708862,
"grad_norm": 1.8561471700668335,
"learning_rate": 3.0305644157688175e-06,
"loss": 0.6271055936813354,
"step": 1830
},
{
"epoch": 3.8649789029535864,
"grad_norm": 19.705900192260742,
"learning_rate": 3.022823221139272e-06,
"loss": 0.3404349088668823,
"step": 1832
},
{
"epoch": 3.869198312236287,
"grad_norm": 9.467658042907715,
"learning_rate": 3.0150894789767627e-06,
"loss": 0.5793641805648804,
"step": 1834
},
{
"epoch": 3.8734177215189876,
"grad_norm": 6.555062294006348,
"learning_rate": 3.007363231131733e-06,
"loss": 0.5979642868041992,
"step": 1836
},
{
"epoch": 3.8776371308016877,
"grad_norm": 12.590143203735352,
"learning_rate": 2.9996445194140723e-06,
"loss": 0.49834197759628296,
"step": 1838
},
{
"epoch": 3.8818565400843883,
"grad_norm": 11.55475902557373,
"learning_rate": 2.9919333855928875e-06,
"loss": 0.7811706066131592,
"step": 1840
},
{
"epoch": 3.8860759493670884,
"grad_norm": 1.6321529150009155,
"learning_rate": 2.9842298713962795e-06,
"loss": 0.4640495777130127,
"step": 1842
},
{
"epoch": 3.890295358649789,
"grad_norm": 1.108053207397461,
"learning_rate": 2.9765340185111134e-06,
"loss": 0.5240273475646973,
"step": 1844
},
{
"epoch": 3.894514767932489,
"grad_norm": 1.4660061597824097,
"learning_rate": 2.968845868582799e-06,
"loss": 0.6336109042167664,
"step": 1846
},
{
"epoch": 3.8987341772151898,
"grad_norm": 7.276936054229736,
"learning_rate": 2.961165463215062e-06,
"loss": 0.48461082577705383,
"step": 1848
},
{
"epoch": 3.9029535864978904,
"grad_norm": 1.9613572359085083,
"learning_rate": 2.9534928439697186e-06,
"loss": 0.6677671670913696,
"step": 1850
},
{
"epoch": 3.9071729957805905,
"grad_norm": 1.520216464996338,
"learning_rate": 2.9458280523664493e-06,
"loss": 0.8395076990127563,
"step": 1852
},
{
"epoch": 3.911392405063291,
"grad_norm": 3.0033154487609863,
"learning_rate": 2.938171129882579e-06,
"loss": 0.6944848299026489,
"step": 1854
},
{
"epoch": 3.9156118143459917,
"grad_norm": 1.6401822566986084,
"learning_rate": 2.930522117952847e-06,
"loss": 0.7018183469772339,
"step": 1856
},
{
"epoch": 3.919831223628692,
"grad_norm": 2.8167307376861572,
"learning_rate": 2.922881057969188e-06,
"loss": 0.7709340453147888,
"step": 1858
},
{
"epoch": 3.9240506329113924,
"grad_norm": 2.7081515789031982,
"learning_rate": 2.9152479912805028e-06,
"loss": 0.7548224925994873,
"step": 1860
},
{
"epoch": 3.928270042194093,
"grad_norm": 3.791499137878418,
"learning_rate": 2.907622959192439e-06,
"loss": 0.5371965169906616,
"step": 1862
},
{
"epoch": 3.932489451476793,
"grad_norm": 2.503772497177124,
"learning_rate": 2.9000060029671644e-06,
"loss": 0.5366585850715637,
"step": 1864
},
{
"epoch": 3.9367088607594938,
"grad_norm": 6.065065383911133,
"learning_rate": 2.8923971638231466e-06,
"loss": 0.9665102958679199,
"step": 1866
},
{
"epoch": 3.9409282700421944,
"grad_norm": 2.7202789783477783,
"learning_rate": 2.884796482934927e-06,
"loss": 0.7356393337249756,
"step": 1868
},
{
"epoch": 3.9451476793248945,
"grad_norm": 3.0500247478485107,
"learning_rate": 2.877204001432899e-06,
"loss": 0.5012904405593872,
"step": 1870
},
{
"epoch": 3.9493670886075947,
"grad_norm": 0.8024043440818787,
"learning_rate": 2.869619760403089e-06,
"loss": 0.3538365662097931,
"step": 1872
},
{
"epoch": 3.9535864978902953,
"grad_norm": 191.6532745361328,
"learning_rate": 2.8620438008869264e-06,
"loss": 0.434034138917923,
"step": 1874
},
{
"epoch": 3.957805907172996,
"grad_norm": 2.8688251972198486,
"learning_rate": 2.8544761638810277e-06,
"loss": 0.6301808953285217,
"step": 1876
},
{
"epoch": 3.962025316455696,
"grad_norm": 4.15130090713501,
"learning_rate": 2.8469168903369733e-06,
"loss": 0.596470832824707,
"step": 1878
},
{
"epoch": 3.9662447257383966,
"grad_norm": 2.7118093967437744,
"learning_rate": 2.8393660211610864e-06,
"loss": 0.4589231610298157,
"step": 1880
},
{
"epoch": 3.970464135021097,
"grad_norm": 2.9497010707855225,
"learning_rate": 2.8318235972142075e-06,
"loss": 0.7778608798980713,
"step": 1882
},
{
"epoch": 3.9746835443037973,
"grad_norm": 10.08464241027832,
"learning_rate": 2.824289659311481e-06,
"loss": 0.3298872113227844,
"step": 1884
},
{
"epoch": 3.978902953586498,
"grad_norm": 2.5433638095855713,
"learning_rate": 2.8167642482221274e-06,
"loss": 0.6300213932991028,
"step": 1886
},
{
"epoch": 3.9831223628691985,
"grad_norm": 11.90830135345459,
"learning_rate": 2.8092474046692227e-06,
"loss": 0.4418677091598511,
"step": 1888
},
{
"epoch": 3.9873417721518987,
"grad_norm": 4.765434741973877,
"learning_rate": 2.801739169329486e-06,
"loss": 0.6927688121795654,
"step": 1890
},
{
"epoch": 3.9915611814345993,
"grad_norm": 6.100020408630371,
"learning_rate": 2.7942395828330477e-06,
"loss": 0.5399014949798584,
"step": 1892
},
{
"epoch": 3.9957805907173,
"grad_norm": 1.7746268510818481,
"learning_rate": 2.7867486857632417e-06,
"loss": 0.7801375389099121,
"step": 1894
},
{
"epoch": 4.0,
"grad_norm": 1.9672950506210327,
"learning_rate": 2.7792665186563753e-06,
"loss": 0.6976273059844971,
"step": 1896
},
{
"epoch": 4.0042194092827,
"grad_norm": 2.0892131328582764,
"learning_rate": 2.771793122001518e-06,
"loss": 0.5950413942337036,
"step": 1898
},
{
"epoch": 4.008438818565401,
"grad_norm": 3.418523073196411,
"learning_rate": 2.764328536240274e-06,
"loss": 0.48346221446990967,
"step": 1900
},
{
"epoch": 4.012658227848101,
"grad_norm": 2.4079160690307617,
"learning_rate": 2.7568728017665734e-06,
"loss": 0.5231744647026062,
"step": 1902
},
{
"epoch": 4.0168776371308015,
"grad_norm": 10.42201042175293,
"learning_rate": 2.749425958926447e-06,
"loss": 0.36587753891944885,
"step": 1904
},
{
"epoch": 4.0210970464135025,
"grad_norm": 0.36827781796455383,
"learning_rate": 2.7419880480178055e-06,
"loss": 0.18869513273239136,
"step": 1906
},
{
"epoch": 4.025316455696203,
"grad_norm": 2.4577043056488037,
"learning_rate": 2.734559109290229e-06,
"loss": 0.5424115061759949,
"step": 1908
},
{
"epoch": 4.029535864978903,
"grad_norm": 7.176480293273926,
"learning_rate": 2.7271391829447447e-06,
"loss": 0.09614966064691544,
"step": 1910
},
{
"epoch": 4.033755274261603,
"grad_norm": 2.078049898147583,
"learning_rate": 2.71972830913361e-06,
"loss": 0.5041449069976807,
"step": 1912
},
{
"epoch": 4.037974683544304,
"grad_norm": 3.0364325046539307,
"learning_rate": 2.712326527960096e-06,
"loss": 0.6174269914627075,
"step": 1914
},
{
"epoch": 4.042194092827004,
"grad_norm": 0.6836444139480591,
"learning_rate": 2.704933879478268e-06,
"loss": 0.3205277919769287,
"step": 1916
},
{
"epoch": 4.046413502109704,
"grad_norm": 6.195359230041504,
"learning_rate": 2.697550403692773e-06,
"loss": 0.14734962582588196,
"step": 1918
},
{
"epoch": 4.050632911392405,
"grad_norm": 2.888777732849121,
"learning_rate": 2.69017614055862e-06,
"loss": 0.5565149784088135,
"step": 1920
},
{
"epoch": 4.0548523206751055,
"grad_norm": 12.064739227294922,
"learning_rate": 2.682811129980962e-06,
"loss": 0.47878050804138184,
"step": 1922
},
{
"epoch": 4.059071729957806,
"grad_norm": 1.9031803607940674,
"learning_rate": 2.6754554118148857e-06,
"loss": 0.3945463299751282,
"step": 1924
},
{
"epoch": 4.063291139240507,
"grad_norm": 6.993194103240967,
"learning_rate": 2.668109025865191e-06,
"loss": 0.2721104919910431,
"step": 1926
},
{
"epoch": 4.067510548523207,
"grad_norm": 7.187300205230713,
"learning_rate": 2.660772011886178e-06,
"loss": 0.572750449180603,
"step": 1928
},
{
"epoch": 4.071729957805907,
"grad_norm": 9.433985710144043,
"learning_rate": 2.6534444095814334e-06,
"loss": 0.14224952459335327,
"step": 1930
},
{
"epoch": 4.075949367088608,
"grad_norm": 6.624326705932617,
"learning_rate": 2.646126258603612e-06,
"loss": 0.429046630859375,
"step": 1932
},
{
"epoch": 4.080168776371308,
"grad_norm": 5.319462776184082,
"learning_rate": 2.6388175985542193e-06,
"loss": 0.4175564646720886,
"step": 1934
},
{
"epoch": 4.084388185654008,
"grad_norm": 7.918082237243652,
"learning_rate": 2.631518468983407e-06,
"loss": 0.5208654403686523,
"step": 1936
},
{
"epoch": 4.0886075949367084,
"grad_norm": 2.524588108062744,
"learning_rate": 2.6242289093897533e-06,
"loss": 0.30576610565185547,
"step": 1938
},
{
"epoch": 4.0928270042194095,
"grad_norm": 17.760915756225586,
"learning_rate": 2.6169489592200457e-06,
"loss": 0.3638699948787689,
"step": 1940
},
{
"epoch": 4.09704641350211,
"grad_norm": 3.6685545444488525,
"learning_rate": 2.6096786578690738e-06,
"loss": 0.2502339482307434,
"step": 1942
},
{
"epoch": 4.10126582278481,
"grad_norm": 1.735503077507019,
"learning_rate": 2.6024180446794133e-06,
"loss": 0.2844234108924866,
"step": 1944
},
{
"epoch": 4.105485232067511,
"grad_norm": 2.3414032459259033,
"learning_rate": 2.5951671589412127e-06,
"loss": 0.5370857119560242,
"step": 1946
},
{
"epoch": 4.109704641350211,
"grad_norm": 9.196849822998047,
"learning_rate": 2.587926039891983e-06,
"loss": 0.45078617334365845,
"step": 1948
},
{
"epoch": 4.113924050632911,
"grad_norm": 0.8639876842498779,
"learning_rate": 2.580694726716379e-06,
"loss": 0.3761923313140869,
"step": 1950
},
{
"epoch": 4.118143459915612,
"grad_norm": 1.6307646036148071,
"learning_rate": 2.573473258545997e-06,
"loss": 0.44236212968826294,
"step": 1952
},
{
"epoch": 4.122362869198312,
"grad_norm": 4.56338357925415,
"learning_rate": 2.566261674459156e-06,
"loss": 0.707075834274292,
"step": 1954
},
{
"epoch": 4.1265822784810124,
"grad_norm": 3.0363290309906006,
"learning_rate": 2.5590600134806873e-06,
"loss": 0.12159548699855804,
"step": 1956
},
{
"epoch": 4.1308016877637135,
"grad_norm": 2.8413619995117188,
"learning_rate": 2.551868314581726e-06,
"loss": 0.6649860739707947,
"step": 1958
},
{
"epoch": 4.135021097046414,
"grad_norm": 4.109986782073975,
"learning_rate": 2.544686616679497e-06,
"loss": 0.6205018758773804,
"step": 1960
},
{
"epoch": 4.139240506329114,
"grad_norm": 6.5747504234313965,
"learning_rate": 2.537514958637107e-06,
"loss": 0.37222298979759216,
"step": 1962
},
{
"epoch": 4.143459915611814,
"grad_norm": 2.581211566925049,
"learning_rate": 2.5303533792633306e-06,
"loss": 0.4583626687526703,
"step": 1964
},
{
"epoch": 4.147679324894515,
"grad_norm": 0.3391718864440918,
"learning_rate": 2.5232019173124043e-06,
"loss": 0.24545279145240784,
"step": 1966
},
{
"epoch": 4.151898734177215,
"grad_norm": 3.354196071624756,
"learning_rate": 2.5160606114838158e-06,
"loss": 0.6107680797576904,
"step": 1968
},
{
"epoch": 4.156118143459915,
"grad_norm": 1.9463728666305542,
"learning_rate": 2.5089295004220927e-06,
"loss": 0.41494786739349365,
"step": 1970
},
{
"epoch": 4.160337552742616,
"grad_norm": 3.8241024017333984,
"learning_rate": 2.5018086227165937e-06,
"loss": 0.5631481409072876,
"step": 1972
},
{
"epoch": 4.1645569620253164,
"grad_norm": 3.7971303462982178,
"learning_rate": 2.494698016901302e-06,
"loss": 0.13252116739749908,
"step": 1974
},
{
"epoch": 4.168776371308017,
"grad_norm": 5.456217288970947,
"learning_rate": 2.487597721454616e-06,
"loss": 0.4099525213241577,
"step": 1976
},
{
"epoch": 4.172995780590718,
"grad_norm": 18.906333923339844,
"learning_rate": 2.4805077747991403e-06,
"loss": 0.33811259269714355,
"step": 1978
},
{
"epoch": 4.177215189873418,
"grad_norm": 11.150616645812988,
"learning_rate": 2.473428215301474e-06,
"loss": 0.2853623032569885,
"step": 1980
},
{
"epoch": 4.181434599156118,
"grad_norm": 23.042560577392578,
"learning_rate": 2.466359081272012e-06,
"loss": 0.3581426441669464,
"step": 1982
},
{
"epoch": 4.185654008438819,
"grad_norm": 4.002007007598877,
"learning_rate": 2.459300410964731e-06,
"loss": 0.3014911413192749,
"step": 1984
},
{
"epoch": 4.189873417721519,
"grad_norm": 6.566624164581299,
"learning_rate": 2.452252242576984e-06,
"loss": 0.11508725583553314,
"step": 1986
},
{
"epoch": 4.194092827004219,
"grad_norm": 18.410457611083984,
"learning_rate": 2.445214614249294e-06,
"loss": 0.3810286521911621,
"step": 1988
},
{
"epoch": 4.198312236286919,
"grad_norm": 6.431080341339111,
"learning_rate": 2.4381875640651466e-06,
"loss": 0.20014682412147522,
"step": 1990
},
{
"epoch": 4.2025316455696204,
"grad_norm": 3.2412610054016113,
"learning_rate": 2.431171130050788e-06,
"loss": 0.6001700162887573,
"step": 1992
},
{
"epoch": 4.206751054852321,
"grad_norm": 3.1228854656219482,
"learning_rate": 2.4241653501750117e-06,
"loss": 0.29799264669418335,
"step": 1994
},
{
"epoch": 4.210970464135021,
"grad_norm": 2.178508996963501,
"learning_rate": 2.4171702623489588e-06,
"loss": 0.5007591247558594,
"step": 1996
},
{
"epoch": 4.215189873417722,
"grad_norm": 7.447211265563965,
"learning_rate": 2.410185904425912e-06,
"loss": 0.7163572907447815,
"step": 1998
},
{
"epoch": 4.219409282700422,
"grad_norm": 2.8777246475219727,
"learning_rate": 2.403212314201088e-06,
"loss": 0.5820721387863159,
"step": 2000
},
{
"epoch": 4.223628691983122,
"grad_norm": 4.454619884490967,
"learning_rate": 2.3962495294114403e-06,
"loss": 0.41988158226013184,
"step": 2002
},
{
"epoch": 4.227848101265823,
"grad_norm": 4.4292426109313965,
"learning_rate": 2.3892975877354452e-06,
"loss": 0.14902547001838684,
"step": 2004
},
{
"epoch": 4.232067510548523,
"grad_norm": 2.666948080062866,
"learning_rate": 2.3823565267929036e-06,
"loss": 0.6181389093399048,
"step": 2006
},
{
"epoch": 4.236286919831223,
"grad_norm": 3.547452688217163,
"learning_rate": 2.375426384144735e-06,
"loss": 0.33217155933380127,
"step": 2008
},
{
"epoch": 4.2405063291139244,
"grad_norm": 2.134594440460205,
"learning_rate": 2.368507197292777e-06,
"loss": 0.4793064594268799,
"step": 2010
},
{
"epoch": 4.244725738396625,
"grad_norm": 25.654151916503906,
"learning_rate": 2.361599003679582e-06,
"loss": 0.13546811044216156,
"step": 2012
},
{
"epoch": 4.248945147679325,
"grad_norm": 4.229970455169678,
"learning_rate": 2.3547018406882104e-06,
"loss": 0.3434482216835022,
"step": 2014
},
{
"epoch": 4.253164556962025,
"grad_norm": 4.361436367034912,
"learning_rate": 2.347815745642035e-06,
"loss": 0.6057535409927368,
"step": 2016
},
{
"epoch": 4.257383966244726,
"grad_norm": 17.874441146850586,
"learning_rate": 2.340940755804532e-06,
"loss": 0.5280637741088867,
"step": 2018
},
{
"epoch": 4.261603375527426,
"grad_norm": 8.038070678710938,
"learning_rate": 2.334076908379086e-06,
"loss": 0.07331550121307373,
"step": 2020
},
{
"epoch": 4.265822784810126,
"grad_norm": 0.6873889565467834,
"learning_rate": 2.327224240508784e-06,
"loss": 0.15723557770252228,
"step": 2022
},
{
"epoch": 4.270042194092827,
"grad_norm": 4.693041801452637,
"learning_rate": 2.3203827892762136e-06,
"loss": 0.45733606815338135,
"step": 2024
},
{
"epoch": 4.274261603375527,
"grad_norm": 21.652511596679688,
"learning_rate": 2.313552591703267e-06,
"loss": 0.20987409353256226,
"step": 2026
},
{
"epoch": 4.2784810126582276,
"grad_norm": 1.620386004447937,
"learning_rate": 2.3067336847509405e-06,
"loss": 0.18322864174842834,
"step": 2028
},
{
"epoch": 4.282700421940929,
"grad_norm": 0.5709916949272156,
"learning_rate": 2.2999261053191264e-06,
"loss": 0.264180064201355,
"step": 2030
},
{
"epoch": 4.286919831223629,
"grad_norm": 6.232452392578125,
"learning_rate": 2.2931298902464242e-06,
"loss": 0.581986129283905,
"step": 2032
},
{
"epoch": 4.291139240506329,
"grad_norm": 2.427851438522339,
"learning_rate": 2.286345076309935e-06,
"loss": 0.08267831802368164,
"step": 2034
},
{
"epoch": 4.29535864978903,
"grad_norm": 7.5021843910217285,
"learning_rate": 2.279571700225061e-06,
"loss": 0.3914198875427246,
"step": 2036
},
{
"epoch": 4.29957805907173,
"grad_norm": 17.116886138916016,
"learning_rate": 2.272809798645313e-06,
"loss": 0.4527243375778198,
"step": 2038
},
{
"epoch": 4.30379746835443,
"grad_norm": 9.568516731262207,
"learning_rate": 2.2660594081621068e-06,
"loss": 0.5110298991203308,
"step": 2040
},
{
"epoch": 4.308016877637131,
"grad_norm": 2.8458101749420166,
"learning_rate": 2.259320565304568e-06,
"loss": 0.3989183306694031,
"step": 2042
},
{
"epoch": 4.312236286919831,
"grad_norm": 3.3316569328308105,
"learning_rate": 2.2525933065393316e-06,
"loss": 0.4240986406803131,
"step": 2044
},
{
"epoch": 4.3164556962025316,
"grad_norm": 3.5117201805114746,
"learning_rate": 2.2458776682703478e-06,
"loss": 0.5510097146034241,
"step": 2046
},
{
"epoch": 4.320675105485232,
"grad_norm": 2.211899757385254,
"learning_rate": 2.2391736868386826e-06,
"loss": 0.47137928009033203,
"step": 2048
},
{
"epoch": 4.324894514767933,
"grad_norm": 2.8007261753082275,
"learning_rate": 2.2324813985223236e-06,
"loss": 0.13788414001464844,
"step": 2050
},
{
"epoch": 4.329113924050633,
"grad_norm": 5.883923530578613,
"learning_rate": 2.2258008395359814e-06,
"loss": 0.21625080704689026,
"step": 2052
},
{
"epoch": 4.333333333333333,
"grad_norm": 2.6445043087005615,
"learning_rate": 2.2191320460308913e-06,
"loss": 0.43525630235671997,
"step": 2054
},
{
"epoch": 4.337552742616034,
"grad_norm": 4.206122875213623,
"learning_rate": 2.2124750540946258e-06,
"loss": 0.22658753395080566,
"step": 2056
},
{
"epoch": 4.341772151898734,
"grad_norm": 7.528255462646484,
"learning_rate": 2.2058298997508916e-06,
"loss": 0.19083625078201294,
"step": 2058
},
{
"epoch": 4.345991561181434,
"grad_norm": 2.3334925174713135,
"learning_rate": 2.1991966189593375e-06,
"loss": 0.5279438495635986,
"step": 2060
},
{
"epoch": 4.350210970464135,
"grad_norm": 3.07808780670166,
"learning_rate": 2.1925752476153598e-06,
"loss": 0.5324735641479492,
"step": 2062
},
{
"epoch": 4.3544303797468356,
"grad_norm": 7.293347358703613,
"learning_rate": 2.1859658215499094e-06,
"loss": 0.4442484378814697,
"step": 2064
},
{
"epoch": 4.358649789029536,
"grad_norm": 3.767479419708252,
"learning_rate": 2.1793683765292943e-06,
"loss": 0.6478234529495239,
"step": 2066
},
{
"epoch": 4.362869198312236,
"grad_norm": 1.7366708517074585,
"learning_rate": 2.172782948254989e-06,
"loss": 0.22714099287986755,
"step": 2068
},
{
"epoch": 4.367088607594937,
"grad_norm": 2.4501614570617676,
"learning_rate": 2.1662095723634387e-06,
"loss": 0.7067612409591675,
"step": 2070
},
{
"epoch": 4.371308016877637,
"grad_norm": 2.0209014415740967,
"learning_rate": 2.159648284425872e-06,
"loss": 0.6720374226570129,
"step": 2072
},
{
"epoch": 4.375527426160337,
"grad_norm": 2.6613192558288574,
"learning_rate": 2.1530991199481e-06,
"loss": 0.46383750438690186,
"step": 2074
},
{
"epoch": 4.379746835443038,
"grad_norm": 10.552399635314941,
"learning_rate": 2.1465621143703354e-06,
"loss": 0.4360678195953369,
"step": 2076
},
{
"epoch": 4.383966244725738,
"grad_norm": 2.3267464637756348,
"learning_rate": 2.1400373030669878e-06,
"loss": 0.32150259613990784,
"step": 2078
},
{
"epoch": 4.3881856540084385,
"grad_norm": 11.424999237060547,
"learning_rate": 2.1335247213464816e-06,
"loss": 0.6122124195098877,
"step": 2080
},
{
"epoch": 4.3924050632911396,
"grad_norm": 1.8929657936096191,
"learning_rate": 2.1270244044510596e-06,
"loss": 0.29143026471138,
"step": 2082
},
{
"epoch": 4.39662447257384,
"grad_norm": 5.961505889892578,
"learning_rate": 2.120536387556597e-06,
"loss": 0.44119709730148315,
"step": 2084
},
{
"epoch": 4.40084388185654,
"grad_norm": 4.30864953994751,
"learning_rate": 2.114060705772409e-06,
"loss": 0.7014176845550537,
"step": 2086
},
{
"epoch": 4.405063291139241,
"grad_norm": 2.612563371658325,
"learning_rate": 2.107597394141057e-06,
"loss": 0.5459550023078918,
"step": 2088
},
{
"epoch": 4.409282700421941,
"grad_norm": 2.4660723209381104,
"learning_rate": 2.1011464876381663e-06,
"loss": 0.46325892210006714,
"step": 2090
},
{
"epoch": 4.413502109704641,
"grad_norm": 4.131664276123047,
"learning_rate": 2.0947080211722317e-06,
"loss": 0.4953617453575134,
"step": 2092
},
{
"epoch": 4.417721518987342,
"grad_norm": 1.9574029445648193,
"learning_rate": 2.0882820295844285e-06,
"loss": 0.5186775922775269,
"step": 2094
},
{
"epoch": 4.421940928270042,
"grad_norm": 3.840588092803955,
"learning_rate": 2.081868547648429e-06,
"loss": 0.31746193766593933,
"step": 2096
},
{
"epoch": 4.4261603375527425,
"grad_norm": 2.727635383605957,
"learning_rate": 2.0754676100702045e-06,
"loss": 0.7108813524246216,
"step": 2098
},
{
"epoch": 4.430379746835443,
"grad_norm": 4.424046039581299,
"learning_rate": 2.0690792514878495e-06,
"loss": 0.48461851477622986,
"step": 2100
},
{
"epoch": 4.434599156118144,
"grad_norm": 2.04559326171875,
"learning_rate": 2.0627035064713857e-06,
"loss": 0.4159836769104004,
"step": 2102
},
{
"epoch": 4.438818565400844,
"grad_norm": 1.8618910312652588,
"learning_rate": 2.056340409522577e-06,
"loss": 0.36201441287994385,
"step": 2104
},
{
"epoch": 4.443037974683544,
"grad_norm": 2.5027105808258057,
"learning_rate": 2.049989995074746e-06,
"loss": 0.5959118008613586,
"step": 2106
},
{
"epoch": 4.447257383966245,
"grad_norm": 11.552289009094238,
"learning_rate": 2.043652297492583e-06,
"loss": 0.3659658432006836,
"step": 2108
},
{
"epoch": 4.451476793248945,
"grad_norm": 4.931119441986084,
"learning_rate": 2.037327351071963e-06,
"loss": 0.48589879274368286,
"step": 2110
},
{
"epoch": 4.455696202531645,
"grad_norm": 4.232883930206299,
"learning_rate": 2.031015190039759e-06,
"loss": 0.5243382453918457,
"step": 2112
},
{
"epoch": 4.459915611814346,
"grad_norm": 0.3998461961746216,
"learning_rate": 2.0247158485536565e-06,
"loss": 0.5077897310256958,
"step": 2114
},
{
"epoch": 4.4641350210970465,
"grad_norm": 1.7971662282943726,
"learning_rate": 2.0184293607019707e-06,
"loss": 0.2606506943702698,
"step": 2116
},
{
"epoch": 4.468354430379747,
"grad_norm": 2.3619842529296875,
"learning_rate": 2.012155760503458e-06,
"loss": 0.543289065361023,
"step": 2118
},
{
"epoch": 4.472573839662447,
"grad_norm": 1.1135996580123901,
"learning_rate": 2.0058950819071384e-06,
"loss": 0.08294013142585754,
"step": 2120
},
{
"epoch": 4.476793248945148,
"grad_norm": 6.450394630432129,
"learning_rate": 1.999647358792103e-06,
"loss": 0.27434927225112915,
"step": 2122
},
{
"epoch": 4.481012658227848,
"grad_norm": 9.028851509094238,
"learning_rate": 1.993412624967339e-06,
"loss": 0.18550115823745728,
"step": 2124
},
{
"epoch": 4.485232067510548,
"grad_norm": 3.7954587936401367,
"learning_rate": 1.9871909141715433e-06,
"loss": 0.25095483660697937,
"step": 2126
},
{
"epoch": 4.489451476793249,
"grad_norm": 2.933171033859253,
"learning_rate": 1.980982260072936e-06,
"loss": 0.29782503843307495,
"step": 2128
},
{
"epoch": 4.493670886075949,
"grad_norm": 5.5410475730896,
"learning_rate": 1.9747866962690864e-06,
"loss": 0.37597131729125977,
"step": 2130
},
{
"epoch": 4.4978902953586495,
"grad_norm": 7.844871997833252,
"learning_rate": 1.9686042562867247e-06,
"loss": 0.591028094291687,
"step": 2132
},
{
"epoch": 4.5021097046413505,
"grad_norm": 5.038850784301758,
"learning_rate": 1.962434973581564e-06,
"loss": 0.45768237113952637,
"step": 2134
},
{
"epoch": 4.506329113924051,
"grad_norm": 6.212744235992432,
"learning_rate": 1.9562788815381164e-06,
"loss": 0.11174334585666656,
"step": 2136
},
{
"epoch": 4.510548523206751,
"grad_norm": 1.0894521474838257,
"learning_rate": 1.950136013469515e-06,
"loss": 0.1324283480644226,
"step": 2138
},
{
"epoch": 4.514767932489452,
"grad_norm": 5.448882579803467,
"learning_rate": 1.944006402617333e-06,
"loss": 0.13975661993026733,
"step": 2140
},
{
"epoch": 4.518987341772152,
"grad_norm": 0.6249382495880127,
"learning_rate": 1.937890082151403e-06,
"loss": 0.32427144050598145,
"step": 2142
},
{
"epoch": 4.523206751054852,
"grad_norm": 11.115077018737793,
"learning_rate": 1.9317870851696356e-06,
"loss": 0.10621624439954758,
"step": 2144
},
{
"epoch": 4.527426160337553,
"grad_norm": 3.9892995357513428,
"learning_rate": 1.9256974446978464e-06,
"loss": 0.38272783160209656,
"step": 2146
},
{
"epoch": 4.531645569620253,
"grad_norm": 2.471816301345825,
"learning_rate": 1.919621193689569e-06,
"loss": 0.3882204294204712,
"step": 2148
},
{
"epoch": 4.5358649789029535,
"grad_norm": 0.016054954379796982,
"learning_rate": 1.9135583650258873e-06,
"loss": 0.2680031657218933,
"step": 2150
},
{
"epoch": 4.540084388185654,
"grad_norm": 2.9162821769714355,
"learning_rate": 1.9075089915152464e-06,
"loss": 0.3421184718608856,
"step": 2152
},
{
"epoch": 4.544303797468355,
"grad_norm": 3.484391212463379,
"learning_rate": 1.9014731058932827e-06,
"loss": 0.5047986507415771,
"step": 2154
},
{
"epoch": 4.548523206751055,
"grad_norm": 1.9593169689178467,
"learning_rate": 1.8954507408226409e-06,
"loss": 0.46260231733322144,
"step": 2156
},
{
"epoch": 4.552742616033755,
"grad_norm": 2.716538667678833,
"learning_rate": 1.8894419288928027e-06,
"loss": 0.5966385006904602,
"step": 2158
},
{
"epoch": 4.556962025316456,
"grad_norm": 3.1801514625549316,
"learning_rate": 1.883446702619909e-06,
"loss": 0.37797853350639343,
"step": 2160
},
{
"epoch": 4.561181434599156,
"grad_norm": 2.5519282817840576,
"learning_rate": 1.8774650944465816e-06,
"loss": 0.4353446960449219,
"step": 2162
},
{
"epoch": 4.565400843881856,
"grad_norm": 3.090348243713379,
"learning_rate": 1.8714971367417503e-06,
"loss": 0.36761924624443054,
"step": 2164
},
{
"epoch": 4.569620253164557,
"grad_norm": 2.526357889175415,
"learning_rate": 1.8655428618004757e-06,
"loss": 0.5436191558837891,
"step": 2166
},
{
"epoch": 4.5738396624472575,
"grad_norm": 6.702995777130127,
"learning_rate": 1.8596023018437756e-06,
"loss": 0.5698112845420837,
"step": 2168
},
{
"epoch": 4.578059071729958,
"grad_norm": 21.6278133392334,
"learning_rate": 1.8536754890184514e-06,
"loss": 0.12127143144607544,
"step": 2170
},
{
"epoch": 4.582278481012658,
"grad_norm": 2.4644062519073486,
"learning_rate": 1.8477624553969126e-06,
"loss": 0.3572949767112732,
"step": 2172
},
{
"epoch": 4.586497890295359,
"grad_norm": 4.449887275695801,
"learning_rate": 1.8418632329770014e-06,
"loss": 0.4991232752799988,
"step": 2174
},
{
"epoch": 4.590717299578059,
"grad_norm": 2.306753396987915,
"learning_rate": 1.8359778536818252e-06,
"loss": 0.6089332103729248,
"step": 2176
},
{
"epoch": 4.594936708860759,
"grad_norm": 9.263266563415527,
"learning_rate": 1.8301063493595794e-06,
"loss": 0.44372105598449707,
"step": 2178
},
{
"epoch": 4.59915611814346,
"grad_norm": 1.82095205783844,
"learning_rate": 1.824248751783377e-06,
"loss": 0.3401510715484619,
"step": 2180
},
{
"epoch": 4.60337552742616,
"grad_norm": 2.3795061111450195,
"learning_rate": 1.8184050926510743e-06,
"loss": 0.5080521106719971,
"step": 2182
},
{
"epoch": 4.6075949367088604,
"grad_norm": 29.6896915435791,
"learning_rate": 1.8125754035851018e-06,
"loss": 0.0813543051481247,
"step": 2184
},
{
"epoch": 4.6118143459915615,
"grad_norm": 3.2905502319335938,
"learning_rate": 1.806759716132293e-06,
"loss": 0.5500208139419556,
"step": 2186
},
{
"epoch": 4.616033755274262,
"grad_norm": 2.1505532264709473,
"learning_rate": 1.800958061763712e-06,
"loss": 0.26043060421943665,
"step": 2188
},
{
"epoch": 4.620253164556962,
"grad_norm": 2.0198612213134766,
"learning_rate": 1.7951704718744841e-06,
"loss": 0.6140601634979248,
"step": 2190
},
{
"epoch": 4.624472573839663,
"grad_norm": 2.324085235595703,
"learning_rate": 1.7893969777836265e-06,
"loss": 0.20785805583000183,
"step": 2192
},
{
"epoch": 4.628691983122363,
"grad_norm": 2.0707149505615234,
"learning_rate": 1.7836376107338783e-06,
"loss": 0.5573110580444336,
"step": 2194
},
{
"epoch": 4.632911392405063,
"grad_norm": 3.6579232215881348,
"learning_rate": 1.7778924018915302e-06,
"loss": 0.2335490882396698,
"step": 2196
},
{
"epoch": 4.637130801687764,
"grad_norm": 2.841978073120117,
"learning_rate": 1.772161382346259e-06,
"loss": 0.3419453501701355,
"step": 2198
},
{
"epoch": 4.641350210970464,
"grad_norm": 2.595341682434082,
"learning_rate": 1.7664445831109566e-06,
"loss": 0.535962700843811,
"step": 2200
},
{
"epoch": 4.6455696202531644,
"grad_norm": 2.8027384281158447,
"learning_rate": 1.7607420351215616e-06,
"loss": 0.4780561923980713,
"step": 2202
},
{
"epoch": 4.649789029535865,
"grad_norm": 0.4611937701702118,
"learning_rate": 1.7550537692368942e-06,
"loss": 0.3059866428375244,
"step": 2204
},
{
"epoch": 4.654008438818566,
"grad_norm": 1.5873767137527466,
"learning_rate": 1.74937981623849e-06,
"loss": 0.46250712871551514,
"step": 2206
},
{
"epoch": 4.658227848101266,
"grad_norm": 1.6936619281768799,
"learning_rate": 1.7437202068304287e-06,
"loss": 0.452869713306427,
"step": 2208
},
{
"epoch": 4.662447257383966,
"grad_norm": 2.697862386703491,
"learning_rate": 1.7380749716391737e-06,
"loss": 0.5035865306854248,
"step": 2210
},
{
"epoch": 4.666666666666667,
"grad_norm": 3.739734649658203,
"learning_rate": 1.7324441412134013e-06,
"loss": 0.3993757367134094,
"step": 2212
},
{
"epoch": 4.670886075949367,
"grad_norm": 7.6267852783203125,
"learning_rate": 1.7268277460238397e-06,
"loss": 0.3390964865684509,
"step": 2214
},
{
"epoch": 4.675105485232067,
"grad_norm": 1.8734283447265625,
"learning_rate": 1.7212258164631027e-06,
"loss": 0.5280478000640869,
"step": 2216
},
{
"epoch": 4.679324894514768,
"grad_norm": 6.668360710144043,
"learning_rate": 1.7156383828455204e-06,
"loss": 0.4059964418411255,
"step": 2218
},
{
"epoch": 4.6835443037974684,
"grad_norm": 2.475369930267334,
"learning_rate": 1.710065475406983e-06,
"loss": 0.4801621735095978,
"step": 2220
},
{
"epoch": 4.687763713080169,
"grad_norm": 2.3857297897338867,
"learning_rate": 1.7045071243047728e-06,
"loss": 0.0963069349527359,
"step": 2222
},
{
"epoch": 4.691983122362869,
"grad_norm": 2.433400869369507,
"learning_rate": 1.6989633596174029e-06,
"loss": 0.47518980503082275,
"step": 2224
},
{
"epoch": 4.69620253164557,
"grad_norm": 2.3119516372680664,
"learning_rate": 1.6934342113444524e-06,
"loss": 0.2933182120323181,
"step": 2226
},
{
"epoch": 4.70042194092827,
"grad_norm": 12.671791076660156,
"learning_rate": 1.6879197094064043e-06,
"loss": 0.08877721428871155,
"step": 2228
},
{
"epoch": 4.70464135021097,
"grad_norm": 2.207108497619629,
"learning_rate": 1.6824198836444858e-06,
"loss": 0.622957706451416,
"step": 2230
},
{
"epoch": 4.708860759493671,
"grad_norm": 6.14840030670166,
"learning_rate": 1.676934763820503e-06,
"loss": 0.5102095603942871,
"step": 2232
},
{
"epoch": 4.713080168776371,
"grad_norm": 4.767087936401367,
"learning_rate": 1.6714643796166835e-06,
"loss": 0.5292322635650635,
"step": 2234
},
{
"epoch": 4.717299578059071,
"grad_norm": 0.11136994510889053,
"learning_rate": 1.6660087606355153e-06,
"loss": 0.31627708673477173,
"step": 2236
},
{
"epoch": 4.7215189873417724,
"grad_norm": 4.295990467071533,
"learning_rate": 1.6605679363995848e-06,
"loss": 0.33531737327575684,
"step": 2238
},
{
"epoch": 4.725738396624473,
"grad_norm": 6.078010559082031,
"learning_rate": 1.6551419363514182e-06,
"loss": 0.43265092372894287,
"step": 2240
},
{
"epoch": 4.729957805907173,
"grad_norm": 3.5457258224487305,
"learning_rate": 1.6497307898533218e-06,
"loss": 0.6657654047012329,
"step": 2242
},
{
"epoch": 4.734177215189874,
"grad_norm": 0.8174402713775635,
"learning_rate": 1.6443345261872228e-06,
"loss": 0.05635060369968414,
"step": 2244
},
{
"epoch": 4.738396624472574,
"grad_norm": 2.277449369430542,
"learning_rate": 1.6389531745545138e-06,
"loss": 0.40952473878860474,
"step": 2246
},
{
"epoch": 4.742616033755274,
"grad_norm": 2.1519405841827393,
"learning_rate": 1.6335867640758876e-06,
"loss": 0.6268118023872375,
"step": 2248
},
{
"epoch": 4.746835443037975,
"grad_norm": 10.723348617553711,
"learning_rate": 1.6282353237911881e-06,
"loss": 0.08097459375858307,
"step": 2250
},
{
"epoch": 4.751054852320675,
"grad_norm": 6.452489852905273,
"learning_rate": 1.6228988826592484e-06,
"loss": 0.5121550559997559,
"step": 2252
},
{
"epoch": 4.755274261603375,
"grad_norm": 3.1199183464050293,
"learning_rate": 1.617577469557735e-06,
"loss": 0.417529433965683,
"step": 2254
},
{
"epoch": 4.759493670886076,
"grad_norm": 1.670754075050354,
"learning_rate": 1.6122711132829917e-06,
"loss": 0.23685501515865326,
"step": 2256
},
{
"epoch": 4.763713080168777,
"grad_norm": 3.9786603450775146,
"learning_rate": 1.606979842549883e-06,
"loss": 0.08441432565450668,
"step": 2258
},
{
"epoch": 4.767932489451477,
"grad_norm": 3.4749438762664795,
"learning_rate": 1.60170368599164e-06,
"loss": 0.1604347825050354,
"step": 2260
},
{
"epoch": 4.772151898734177,
"grad_norm": 2.590517044067383,
"learning_rate": 1.5964426721597048e-06,
"loss": 0.3043164014816284,
"step": 2262
},
{
"epoch": 4.776371308016878,
"grad_norm": 3.320221185684204,
"learning_rate": 1.5911968295235756e-06,
"loss": 0.5432933568954468,
"step": 2264
},
{
"epoch": 4.780590717299578,
"grad_norm": 6.788969993591309,
"learning_rate": 1.5859661864706533e-06,
"loss": 0.4840553402900696,
"step": 2266
},
{
"epoch": 4.784810126582278,
"grad_norm": 5.413600444793701,
"learning_rate": 1.5807507713060879e-06,
"loss": 0.6614431142807007,
"step": 2268
},
{
"epoch": 4.789029535864979,
"grad_norm": 3.4263274669647217,
"learning_rate": 1.5755506122526248e-06,
"loss": 0.4286192059516907,
"step": 2270
},
{
"epoch": 4.793248945147679,
"grad_norm": 3.0580050945281982,
"learning_rate": 1.5703657374504516e-06,
"loss": 0.7800706624984741,
"step": 2272
},
{
"epoch": 4.7974683544303796,
"grad_norm": 6.522762775421143,
"learning_rate": 1.565196174957049e-06,
"loss": 0.2070183902978897,
"step": 2274
},
{
"epoch": 4.80168776371308,
"grad_norm": 1.439235806465149,
"learning_rate": 1.5600419527470331e-06,
"loss": 0.10173705220222473,
"step": 2276
},
{
"epoch": 4.805907172995781,
"grad_norm": 3.4041426181793213,
"learning_rate": 1.5549030987120095e-06,
"loss": 0.3341836929321289,
"step": 2278
},
{
"epoch": 4.810126582278481,
"grad_norm": 3.3857309818267822,
"learning_rate": 1.5497796406604202e-06,
"loss": 0.20992735028266907,
"step": 2280
},
{
"epoch": 4.814345991561181,
"grad_norm": 2.2873666286468506,
"learning_rate": 1.5446716063173935e-06,
"loss": 0.424138605594635,
"step": 2282
},
{
"epoch": 4.818565400843882,
"grad_norm": 8.471506118774414,
"learning_rate": 1.5395790233245924e-06,
"loss": 0.5139745473861694,
"step": 2284
},
{
"epoch": 4.822784810126582,
"grad_norm": 2.079385757446289,
"learning_rate": 1.5345019192400677e-06,
"loss": 0.494828999042511,
"step": 2286
},
{
"epoch": 4.827004219409282,
"grad_norm": 2.116379737854004,
"learning_rate": 1.529440321538107e-06,
"loss": 0.12557630240917206,
"step": 2288
},
{
"epoch": 4.831223628691983,
"grad_norm": 5.065046787261963,
"learning_rate": 1.5243942576090872e-06,
"loss": 0.6678446531295776,
"step": 2290
},
{
"epoch": 4.8354430379746836,
"grad_norm": 2.4034457206726074,
"learning_rate": 1.5193637547593231e-06,
"loss": 0.4627326428890228,
"step": 2292
},
{
"epoch": 4.839662447257384,
"grad_norm": 2.859379291534424,
"learning_rate": 1.5143488402109239e-06,
"loss": 0.44882258772850037,
"step": 2294
},
{
"epoch": 4.843881856540085,
"grad_norm": 1.7705358266830444,
"learning_rate": 1.509349541101646e-06,
"loss": 0.4356788694858551,
"step": 2296
},
{
"epoch": 4.848101265822785,
"grad_norm": 2.958854913711548,
"learning_rate": 1.5043658844847414e-06,
"loss": 0.7101269960403442,
"step": 2298
},
{
"epoch": 4.852320675105485,
"grad_norm": 5.127024173736572,
"learning_rate": 1.499397897328815e-06,
"loss": 0.6652213931083679,
"step": 2300
},
{
"epoch": 4.856540084388186,
"grad_norm": 0.7210382223129272,
"learning_rate": 1.4944456065176785e-06,
"loss": 0.23934832215309143,
"step": 2302
},
{
"epoch": 4.860759493670886,
"grad_norm": 2.468538284301758,
"learning_rate": 1.4895090388502043e-06,
"loss": 0.26671305298805237,
"step": 2304
},
{
"epoch": 4.864978902953586,
"grad_norm": 2.5370748043060303,
"learning_rate": 1.4845882210401776e-06,
"loss": 0.4928842782974243,
"step": 2306
},
{
"epoch": 4.869198312236287,
"grad_norm": 2.779625654220581,
"learning_rate": 1.479683179716159e-06,
"loss": 0.2867523729801178,
"step": 2308
},
{
"epoch": 4.8734177215189876,
"grad_norm": 4.043989658355713,
"learning_rate": 1.4747939414213334e-06,
"loss": 0.4452981948852539,
"step": 2310
},
{
"epoch": 4.877637130801688,
"grad_norm": 2.612654209136963,
"learning_rate": 1.4699205326133696e-06,
"loss": 0.47218436002731323,
"step": 2312
},
{
"epoch": 4.881856540084388,
"grad_norm": 4.8351593017578125,
"learning_rate": 1.4650629796642774e-06,
"loss": 0.5447877049446106,
"step": 2314
},
{
"epoch": 4.886075949367089,
"grad_norm": 2.5699872970581055,
"learning_rate": 1.460221308860262e-06,
"loss": 0.5671508312225342,
"step": 2316
},
{
"epoch": 4.890295358649789,
"grad_norm": 3.086909055709839,
"learning_rate": 1.4553955464015868e-06,
"loss": 0.39557531476020813,
"step": 2318
},
{
"epoch": 4.894514767932489,
"grad_norm": 5.661040782928467,
"learning_rate": 1.4505857184024262e-06,
"loss": 0.44218748807907104,
"step": 2320
},
{
"epoch": 4.89873417721519,
"grad_norm": 3.8085811138153076,
"learning_rate": 1.4457918508907268e-06,
"loss": 0.3575529456138611,
"step": 2322
},
{
"epoch": 4.90295358649789,
"grad_norm": 2.3151283264160156,
"learning_rate": 1.441013969808068e-06,
"loss": 0.5917726755142212,
"step": 2324
},
{
"epoch": 4.9071729957805905,
"grad_norm": 3.560556650161743,
"learning_rate": 1.4362521010095186e-06,
"loss": 0.33830514550209045,
"step": 2326
},
{
"epoch": 4.911392405063291,
"grad_norm": 2.334346294403076,
"learning_rate": 1.4315062702634997e-06,
"loss": 0.4876287281513214,
"step": 2328
},
{
"epoch": 4.915611814345992,
"grad_norm": 2.1452908515930176,
"learning_rate": 1.426776503251643e-06,
"loss": 0.6366673111915588,
"step": 2330
},
{
"epoch": 4.919831223628692,
"grad_norm": 0.41547343134880066,
"learning_rate": 1.4220628255686533e-06,
"loss": 0.25237974524497986,
"step": 2332
},
{
"epoch": 4.924050632911392,
"grad_norm": 54.84702682495117,
"learning_rate": 1.4173652627221686e-06,
"loss": 0.43499624729156494,
"step": 2334
},
{
"epoch": 4.928270042194093,
"grad_norm": 2.2124152183532715,
"learning_rate": 1.4126838401326243e-06,
"loss": 0.5627238750457764,
"step": 2336
},
{
"epoch": 4.932489451476793,
"grad_norm": 7.185441017150879,
"learning_rate": 1.4080185831331126e-06,
"loss": 0.25834035873413086,
"step": 2338
},
{
"epoch": 4.936708860759493,
"grad_norm": 5.250967502593994,
"learning_rate": 1.4033695169692485e-06,
"loss": 0.2957782447338104,
"step": 2340
},
{
"epoch": 4.940928270042194,
"grad_norm": 6.259135723114014,
"learning_rate": 1.398736666799031e-06,
"loss": 0.6378402709960938,
"step": 2342
},
{
"epoch": 4.9451476793248945,
"grad_norm": 16.73641586303711,
"learning_rate": 1.3941200576927088e-06,
"loss": 0.35595816373825073,
"step": 2344
},
{
"epoch": 4.949367088607595,
"grad_norm": 3.1287739276885986,
"learning_rate": 1.3895197146326414e-06,
"loss": 0.7204777002334595,
"step": 2346
},
{
"epoch": 4.953586497890296,
"grad_norm": 2.49485445022583,
"learning_rate": 1.3849356625131692e-06,
"loss": 0.3135877847671509,
"step": 2348
},
{
"epoch": 4.957805907172996,
"grad_norm": 3.383075714111328,
"learning_rate": 1.3803679261404716e-06,
"loss": 0.49237698316574097,
"step": 2350
},
{
"epoch": 4.962025316455696,
"grad_norm": 2.2556025981903076,
"learning_rate": 1.3758165302324397e-06,
"loss": 0.16111743450164795,
"step": 2352
},
{
"epoch": 4.966244725738397,
"grad_norm": 4.6603546142578125,
"learning_rate": 1.3712814994185395e-06,
"loss": 0.6392441987991333,
"step": 2354
},
{
"epoch": 4.970464135021097,
"grad_norm": 12.963358879089355,
"learning_rate": 1.366762858239679e-06,
"loss": 0.35483598709106445,
"step": 2356
},
{
"epoch": 4.974683544303797,
"grad_norm": 3.961883068084717,
"learning_rate": 1.3622606311480729e-06,
"loss": 0.5934839248657227,
"step": 2358
},
{
"epoch": 4.978902953586498,
"grad_norm": 1.0137317180633545,
"learning_rate": 1.3577748425071152e-06,
"loss": 0.28861305117607117,
"step": 2360
},
{
"epoch": 4.9831223628691985,
"grad_norm": 3.8444621562957764,
"learning_rate": 1.3533055165912433e-06,
"loss": 0.5528509616851807,
"step": 2362
},
{
"epoch": 4.987341772151899,
"grad_norm": 12.442330360412598,
"learning_rate": 1.3488526775858087e-06,
"loss": 0.6871875524520874,
"step": 2364
},
{
"epoch": 4.991561181434599,
"grad_norm": 2.018998622894287,
"learning_rate": 1.3444163495869444e-06,
"loss": 0.6601129770278931,
"step": 2366
},
{
"epoch": 4.9957805907173,
"grad_norm": 4.588871002197266,
"learning_rate": 1.3399965566014363e-06,
"loss": 0.3472335934638977,
"step": 2368
},
{
"epoch": 5.0,
"grad_norm": 20.997011184692383,
"learning_rate": 1.3355933225465938e-06,
"loss": 0.1488598883152008,
"step": 2370
},
{
"epoch": 5.0042194092827,
"grad_norm": 1.974225401878357,
"learning_rate": 1.3312066712501176e-06,
"loss": 0.4649539589881897,
"step": 2372
},
{
"epoch": 5.008438818565401,
"grad_norm": 2.101741075515747,
"learning_rate": 1.3268366264499723e-06,
"loss": 0.40653684735298157,
"step": 2374
},
{
"epoch": 5.012658227848101,
"grad_norm": 3.2043211460113525,
"learning_rate": 1.322483211794259e-06,
"loss": 0.20105722546577454,
"step": 2376
},
{
"epoch": 5.0168776371308015,
"grad_norm": 3.830211877822876,
"learning_rate": 1.3181464508410858e-06,
"loss": 0.4869913160800934,
"step": 2378
},
{
"epoch": 5.0210970464135025,
"grad_norm": 3.2576708793640137,
"learning_rate": 1.3138263670584392e-06,
"loss": 0.3144640028476715,
"step": 2380
},
{
"epoch": 5.025316455696203,
"grad_norm": 2.3268511295318604,
"learning_rate": 1.309522983824061e-06,
"loss": 0.4795665144920349,
"step": 2382
},
{
"epoch": 5.029535864978903,
"grad_norm": 4.2797112464904785,
"learning_rate": 1.3052363244253188e-06,
"loss": 0.303976833820343,
"step": 2384
},
{
"epoch": 5.033755274261603,
"grad_norm": 2.950766086578369,
"learning_rate": 1.3009664120590806e-06,
"loss": 0.2566067576408386,
"step": 2386
},
{
"epoch": 5.037974683544304,
"grad_norm": 4.176736354827881,
"learning_rate": 1.296713269831589e-06,
"loss": 0.33072197437286377,
"step": 2388
},
{
"epoch": 5.042194092827004,
"grad_norm": 0.061758268624544144,
"learning_rate": 1.2924769207583368e-06,
"loss": 0.17066842317581177,
"step": 2390
},
{
"epoch": 5.046413502109704,
"grad_norm": 4.471809387207031,
"learning_rate": 1.2882573877639427e-06,
"loss": 0.24980589747428894,
"step": 2392
},
{
"epoch": 5.050632911392405,
"grad_norm": 4.955497741699219,
"learning_rate": 1.2840546936820263e-06,
"loss": 0.2576749622821808,
"step": 2394
},
{
"epoch": 5.0548523206751055,
"grad_norm": 0.7454743981361389,
"learning_rate": 1.2798688612550838e-06,
"loss": 0.040055617690086365,
"step": 2396
},
{
"epoch": 5.059071729957806,
"grad_norm": 2.270764112472534,
"learning_rate": 1.2756999131343677e-06,
"loss": 0.4545499086380005,
"step": 2398
},
{
"epoch": 5.063291139240507,
"grad_norm": 8.608718872070312,
"learning_rate": 1.271547871879762e-06,
"loss": 0.46691781282424927,
"step": 2400
},
{
"epoch": 5.067510548523207,
"grad_norm": 4.590333938598633,
"learning_rate": 1.267412759959661e-06,
"loss": 0.3534661829471588,
"step": 2402
},
{
"epoch": 5.071729957805907,
"grad_norm": 6.176363468170166,
"learning_rate": 1.2632945997508469e-06,
"loss": 0.03008463606238365,
"step": 2404
},
{
"epoch": 5.075949367088608,
"grad_norm": 4.096558570861816,
"learning_rate": 1.25919341353837e-06,
"loss": 0.4609118103981018,
"step": 2406
},
{
"epoch": 5.080168776371308,
"grad_norm": 1.3339556455612183,
"learning_rate": 1.2551092235154265e-06,
"loss": 0.25634127855300903,
"step": 2408
},
{
"epoch": 5.084388185654008,
"grad_norm": 3.1009860038757324,
"learning_rate": 1.2510420517832399e-06,
"loss": 0.3237183690071106,
"step": 2410
},
{
"epoch": 5.0886075949367084,
"grad_norm": 6.112014293670654,
"learning_rate": 1.2469919203509406e-06,
"loss": 0.45163053274154663,
"step": 2412
},
{
"epoch": 5.0928270042194095,
"grad_norm": 1.8072830438613892,
"learning_rate": 1.2429588511354468e-06,
"loss": 0.3245161175727844,
"step": 2414
},
{
"epoch": 5.09704641350211,
"grad_norm": 1.5899354219436646,
"learning_rate": 1.2389428659613465e-06,
"loss": 0.09791871905326843,
"step": 2416
},
{
"epoch": 5.10126582278481,
"grad_norm": 2.595155954360962,
"learning_rate": 1.2349439865607783e-06,
"loss": 0.20728906989097595,
"step": 2418
},
{
"epoch": 5.105485232067511,
"grad_norm": 2.6492655277252197,
"learning_rate": 1.2309622345733153e-06,
"loss": 0.52880859375,
"step": 2420
},
{
"epoch": 5.109704641350211,
"grad_norm": 3.113187789916992,
"learning_rate": 1.226997631545846e-06,
"loss": 0.34188008308410645,
"step": 2422
},
{
"epoch": 5.113924050632911,
"grad_norm": 0.3923889100551605,
"learning_rate": 1.2230501989324606e-06,
"loss": 0.39657163619995117,
"step": 2424
},
{
"epoch": 5.118143459915612,
"grad_norm": 2.9111595153808594,
"learning_rate": 1.219119958094331e-06,
"loss": 0.37215137481689453,
"step": 2426
},
{
"epoch": 5.122362869198312,
"grad_norm": 5.908500671386719,
"learning_rate": 1.215206930299599e-06,
"loss": 0.28079548478126526,
"step": 2428
},
{
"epoch": 5.1265822784810124,
"grad_norm": 0.15474487841129303,
"learning_rate": 1.2113111367232582e-06,
"loss": 0.16562075912952423,
"step": 2430
},
{
"epoch": 5.1308016877637135,
"grad_norm": 2.4118106365203857,
"learning_rate": 1.2074325984470428e-06,
"loss": 0.3783321678638458,
"step": 2432
},
{
"epoch": 5.135021097046414,
"grad_norm": 2.499565601348877,
"learning_rate": 1.2035713364593102e-06,
"loss": 0.4123075604438782,
"step": 2434
},
{
"epoch": 5.139240506329114,
"grad_norm": 3.3785624504089355,
"learning_rate": 1.1997273716549284e-06,
"loss": 0.25959959626197815,
"step": 2436
},
{
"epoch": 5.143459915611814,
"grad_norm": 3.4707491397857666,
"learning_rate": 1.195900724835164e-06,
"loss": 0.03673313558101654,
"step": 2438
},
{
"epoch": 5.147679324894515,
"grad_norm": 2.67907977104187,
"learning_rate": 1.1920914167075696e-06,
"loss": 0.2947133779525757,
"step": 2440
},
{
"epoch": 5.151898734177215,
"grad_norm": 3.3348686695098877,
"learning_rate": 1.1882994678858675e-06,
"loss": 0.3776189684867859,
"step": 2442
},
{
"epoch": 5.156118143459915,
"grad_norm": 3.6222927570343018,
"learning_rate": 1.1845248988898464e-06,
"loss": 0.2443552017211914,
"step": 2444
},
{
"epoch": 5.160337552742616,
"grad_norm": 3.27504301071167,
"learning_rate": 1.1807677301452437e-06,
"loss": 0.5414304733276367,
"step": 2446
},
{
"epoch": 5.1645569620253164,
"grad_norm": 3.2731869220733643,
"learning_rate": 1.1770279819836355e-06,
"loss": 0.18883806467056274,
"step": 2448
},
{
"epoch": 5.168776371308017,
"grad_norm": 3.275451421737671,
"learning_rate": 1.1733056746423304e-06,
"loss": 0.37931862473487854,
"step": 2450
},
{
"epoch": 5.172995780590718,
"grad_norm": 0.034749243408441544,
"learning_rate": 1.1696008282642559e-06,
"loss": 0.20449881255626678,
"step": 2452
},
{
"epoch": 5.177215189873418,
"grad_norm": 4.930509567260742,
"learning_rate": 1.165913462897852e-06,
"loss": 0.035537637770175934,
"step": 2454
},
{
"epoch": 5.181434599156118,
"grad_norm": 1.7042666673660278,
"learning_rate": 1.1622435984969602e-06,
"loss": 0.20552217960357666,
"step": 2456
},
{
"epoch": 5.185654008438819,
"grad_norm": 14.41930103302002,
"learning_rate": 1.1585912549207196e-06,
"loss": 0.3709006607532501,
"step": 2458
},
{
"epoch": 5.189873417721519,
"grad_norm": 6.651240348815918,
"learning_rate": 1.1549564519334556e-06,
"loss": 0.18409161269664764,
"step": 2460
},
{
"epoch": 5.194092827004219,
"grad_norm": 2.1087753772735596,
"learning_rate": 1.1513392092045736e-06,
"loss": 0.39856773614883423,
"step": 2462
},
{
"epoch": 5.198312236286919,
"grad_norm": 2.5043351650238037,
"learning_rate": 1.147739546308455e-06,
"loss": 0.3595339059829712,
"step": 2464
},
{
"epoch": 5.2025316455696204,
"grad_norm": 3.3729639053344727,
"learning_rate": 1.1441574827243478e-06,
"loss": 0.17214104533195496,
"step": 2466
},
{
"epoch": 5.206751054852321,
"grad_norm": 2.227221965789795,
"learning_rate": 1.1405930378362648e-06,
"loss": 0.3033697009086609,
"step": 2468
},
{
"epoch": 5.210970464135021,
"grad_norm": 5.146759510040283,
"learning_rate": 1.1370462309328743e-06,
"loss": 0.36619800329208374,
"step": 2470
},
{
"epoch": 5.215189873417722,
"grad_norm": 3.0241358280181885,
"learning_rate": 1.1335170812073999e-06,
"loss": 0.30589285492897034,
"step": 2472
},
{
"epoch": 5.219409282700422,
"grad_norm": 2.229212522506714,
"learning_rate": 1.1300056077575154e-06,
"loss": 0.3369923233985901,
"step": 2474
},
{
"epoch": 5.223628691983122,
"grad_norm": 2.9154298305511475,
"learning_rate": 1.1265118295852404e-06,
"loss": 0.19644200801849365,
"step": 2476
},
{
"epoch": 5.227848101265823,
"grad_norm": 5.483319282531738,
"learning_rate": 1.1230357655968371e-06,
"loss": 0.06274639070034027,
"step": 2478
},
{
"epoch": 5.232067510548523,
"grad_norm": 0.5124228596687317,
"learning_rate": 1.119577434602711e-06,
"loss": 0.20770075917243958,
"step": 2480
},
{
"epoch": 5.236286919831223,
"grad_norm": 2.4156696796417236,
"learning_rate": 1.116136855317307e-06,
"loss": 0.29468050599098206,
"step": 2482
},
{
"epoch": 5.2405063291139244,
"grad_norm": 1.4421368837356567,
"learning_rate": 1.1127140463590055e-06,
"loss": 0.23361340165138245,
"step": 2484
},
{
"epoch": 5.244725738396625,
"grad_norm": 6.403635025024414,
"learning_rate": 1.1093090262500266e-06,
"loss": 0.4423346519470215,
"step": 2486
},
{
"epoch": 5.248945147679325,
"grad_norm": 4.7648606300354,
"learning_rate": 1.105921813416328e-06,
"loss": 0.5721250772476196,
"step": 2488
},
{
"epoch": 5.253164556962025,
"grad_norm": 4.072231292724609,
"learning_rate": 1.1025524261875041e-06,
"loss": 0.5335391163825989,
"step": 2490
},
{
"epoch": 5.257383966244726,
"grad_norm": 2.6612138748168945,
"learning_rate": 1.0992008827966874e-06,
"loss": 0.5658106803894043,
"step": 2492
},
{
"epoch": 5.261603375527426,
"grad_norm": 2.578683614730835,
"learning_rate": 1.095867201380451e-06,
"loss": 0.41335171461105347,
"step": 2494
},
{
"epoch": 5.265822784810126,
"grad_norm": 5.410678863525391,
"learning_rate": 1.0925513999787086e-06,
"loss": 0.15254725515842438,
"step": 2496
},
{
"epoch": 5.270042194092827,
"grad_norm": 3.9477524757385254,
"learning_rate": 1.0892534965346192e-06,
"loss": 0.44648611545562744,
"step": 2498
},
{
"epoch": 5.274261603375527,
"grad_norm": 7.086328506469727,
"learning_rate": 1.0859735088944868e-06,
"loss": 0.16064085066318512,
"step": 2500
},
{
"epoch": 5.2784810126582276,
"grad_norm": 2.0906810760498047,
"learning_rate": 1.0827114548076663e-06,
"loss": 0.2642805874347687,
"step": 2502
},
{
"epoch": 5.282700421940929,
"grad_norm": 2.104074716567993,
"learning_rate": 1.0794673519264675e-06,
"loss": 0.24389728903770447,
"step": 2504
},
{
"epoch": 5.286919831223629,
"grad_norm": 6.062882423400879,
"learning_rate": 1.0762412178060587e-06,
"loss": 0.31626439094543457,
"step": 2506
},
{
"epoch": 5.291139240506329,
"grad_norm": 2.444314956665039,
"learning_rate": 1.0730330699043717e-06,
"loss": 0.4520007371902466,
"step": 2508
},
{
"epoch": 5.29535864978903,
"grad_norm": 5.062936305999756,
"learning_rate": 1.0698429255820068e-06,
"loss": 0.09191440790891647,
"step": 2510
},
{
"epoch": 5.29957805907173,
"grad_norm": 2.516993761062622,
"learning_rate": 1.0666708021021406e-06,
"loss": 0.21818026900291443,
"step": 2512
},
{
"epoch": 5.30379746835443,
"grad_norm": 3.7304656505584717,
"learning_rate": 1.063516716630432e-06,
"loss": 0.33304083347320557,
"step": 2514
},
{
"epoch": 5.308016877637131,
"grad_norm": 2.554382562637329,
"learning_rate": 1.0603806862349255e-06,
"loss": 0.3670189380645752,
"step": 2516
},
{
"epoch": 5.312236286919831,
"grad_norm": 2.6083078384399414,
"learning_rate": 1.0572627278859675e-06,
"loss": 0.4783245027065277,
"step": 2518
},
{
"epoch": 5.3164556962025316,
"grad_norm": 4.347960472106934,
"learning_rate": 1.0541628584561052e-06,
"loss": 0.4460408687591553,
"step": 2520
},
{
"epoch": 5.320675105485232,
"grad_norm": 4.647004127502441,
"learning_rate": 1.0510810947200003e-06,
"loss": 0.3045784533023834,
"step": 2522
},
{
"epoch": 5.324894514767933,
"grad_norm": 3.011627197265625,
"learning_rate": 1.0480174533543372e-06,
"loss": 0.33729833364486694,
"step": 2524
},
{
"epoch": 5.329113924050633,
"grad_norm": 2.8627593517303467,
"learning_rate": 1.044971950937734e-06,
"loss": 0.5005810260772705,
"step": 2526
},
{
"epoch": 5.333333333333333,
"grad_norm": 4.140803337097168,
"learning_rate": 1.041944603950649e-06,
"loss": 0.44916412234306335,
"step": 2528
},
{
"epoch": 5.337552742616034,
"grad_norm": 3.677198886871338,
"learning_rate": 1.038935428775296e-06,
"loss": 0.5101116299629211,
"step": 2530
},
{
"epoch": 5.341772151898734,
"grad_norm": 3.9427692890167236,
"learning_rate": 1.0359444416955528e-06,
"loss": 0.3052045702934265,
"step": 2532
},
{
"epoch": 5.345991561181434,
"grad_norm": 3.024719715118408,
"learning_rate": 1.0329716588968745e-06,
"loss": 0.2897722125053406,
"step": 2534
},
{
"epoch": 5.350210970464135,
"grad_norm": 2.671980619430542,
"learning_rate": 1.030017096466205e-06,
"loss": 0.3393900692462921,
"step": 2536
},
{
"epoch": 5.3544303797468356,
"grad_norm": 2.4032411575317383,
"learning_rate": 1.027080770391891e-06,
"loss": 0.40280789136886597,
"step": 2538
},
{
"epoch": 5.358649789029536,
"grad_norm": 1.9715185165405273,
"learning_rate": 1.0241626965635942e-06,
"loss": 0.2567780017852783,
"step": 2540
},
{
"epoch": 5.362869198312236,
"grad_norm": 14.593756675720215,
"learning_rate": 1.0212628907722062e-06,
"loss": 0.04668917506933212,
"step": 2542
},
{
"epoch": 5.367088607594937,
"grad_norm": 0.3638130724430084,
"learning_rate": 1.0183813687097618e-06,
"loss": 0.16572636365890503,
"step": 2544
},
{
"epoch": 5.371308016877637,
"grad_norm": 4.365072250366211,
"learning_rate": 1.0155181459693565e-06,
"loss": 0.3552468717098236,
"step": 2546
},
{
"epoch": 5.375527426160337,
"grad_norm": 2.508363723754883,
"learning_rate": 1.0126732380450596e-06,
"loss": 0.38389939069747925,
"step": 2548
},
{
"epoch": 5.379746835443038,
"grad_norm": 3.3055357933044434,
"learning_rate": 1.0098466603318323e-06,
"loss": 0.31817764043807983,
"step": 2550
},
{
"epoch": 5.383966244725738,
"grad_norm": 3.0569851398468018,
"learning_rate": 1.0070384281254425e-06,
"loss": 0.12491938471794128,
"step": 2552
},
{
"epoch": 5.3881856540084385,
"grad_norm": 2.883695363998413,
"learning_rate": 1.0042485566223848e-06,
"loss": 0.4344925284385681,
"step": 2554
},
{
"epoch": 5.3924050632911396,
"grad_norm": 11.280695915222168,
"learning_rate": 1.0014770609197957e-06,
"loss": 0.3988388180732727,
"step": 2556
},
{
"epoch": 5.39662447257384,
"grad_norm": 8.63589096069336,
"learning_rate": 9.98723956015371e-07,
"loss": 0.17392376065254211,
"step": 2558
},
{
"epoch": 5.40084388185654,
"grad_norm": 10.322222709655762,
"learning_rate": 9.959892568072881e-07,
"loss": 0.08735622465610504,
"step": 2560
},
{
"epoch": 5.405063291139241,
"grad_norm": 3.0921213626861572,
"learning_rate": 9.932729780941237e-07,
"loss": 0.20220640301704407,
"step": 2562
},
{
"epoch": 5.409282700421941,
"grad_norm": 2.6708106994628906,
"learning_rate": 9.905751345747734e-07,
"loss": 0.5822624564170837,
"step": 2564
},
{
"epoch": 5.413502109704641,
"grad_norm": 2.136763572692871,
"learning_rate": 9.878957408483718e-07,
"loss": 0.16230207681655884,
"step": 2566
},
{
"epoch": 5.417721518987342,
"grad_norm": 2.1476376056671143,
"learning_rate": 9.852348114142155e-07,
"loss": 0.3189689517021179,
"step": 2568
},
{
"epoch": 5.421940928270042,
"grad_norm": 27.836748123168945,
"learning_rate": 9.825923606716818e-07,
"loss": 0.05949246510863304,
"step": 2570
},
{
"epoch": 5.4261603375527425,
"grad_norm": 1.7536920309066772,
"learning_rate": 9.799684029201536e-07,
"loss": 0.23696368932724,
"step": 2572
},
{
"epoch": 5.430379746835443,
"grad_norm": 0.2607567608356476,
"learning_rate": 9.773629523589387e-07,
"loss": 0.014276674017310143,
"step": 2574
},
{
"epoch": 5.434599156118144,
"grad_norm": 2.661586046218872,
"learning_rate": 9.747760230871965e-07,
"loss": 0.27866894006729126,
"step": 2576
},
{
"epoch": 5.438818565400844,
"grad_norm": 6.1830010414123535,
"learning_rate": 9.722076291038605e-07,
"loss": 0.5345185399055481,
"step": 2578
},
{
"epoch": 5.443037974683544,
"grad_norm": 2.385190010070801,
"learning_rate": 9.696577843075608e-07,
"loss": 0.4049319624900818,
"step": 2580
},
{
"epoch": 5.447257383966245,
"grad_norm": 3.472625732421875,
"learning_rate": 9.671265024965509e-07,
"loss": 0.35417062044143677,
"step": 2582
},
{
"epoch": 5.451476793248945,
"grad_norm": 2.2873806953430176,
"learning_rate": 9.646137973686324e-07,
"loss": 0.22758211195468903,
"step": 2584
},
{
"epoch": 5.455696202531645,
"grad_norm": 3.527923345565796,
"learning_rate": 9.621196825210814e-07,
"loss": 0.332139790058136,
"step": 2586
},
{
"epoch": 5.459915611814346,
"grad_norm": 2.947986364364624,
"learning_rate": 9.596441714505732e-07,
"loss": 0.07135351002216339,
"step": 2588
},
{
"epoch": 5.4641350210970465,
"grad_norm": 4.305266857147217,
"learning_rate": 9.57187277553111e-07,
"loss": 0.5696512460708618,
"step": 2590
},
{
"epoch": 5.468354430379747,
"grad_norm": 3.028513193130493,
"learning_rate": 9.547490141239534e-07,
"loss": 0.19437383115291595,
"step": 2592
},
{
"epoch": 5.472573839662447,
"grad_norm": 2.7946903705596924,
"learning_rate": 9.523293943575414e-07,
"loss": 0.15654590725898743,
"step": 2594
},
{
"epoch": 5.476793248945148,
"grad_norm": 6.811203956604004,
"learning_rate": 9.499284313474276e-07,
"loss": 0.11999380588531494,
"step": 2596
},
{
"epoch": 5.481012658227848,
"grad_norm": 2.8398826122283936,
"learning_rate": 9.475461380862047e-07,
"loss": 0.04623116925358772,
"step": 2598
},
{
"epoch": 5.485232067510548,
"grad_norm": 0.62392657995224,
"learning_rate": 9.451825274654373e-07,
"loss": 0.1718018651008606,
"step": 2600
},
{
"epoch": 5.489451476793249,
"grad_norm": 1.799946904182434,
"learning_rate": 9.428376122755884e-07,
"loss": 0.2459963858127594,
"step": 2602
},
{
"epoch": 5.493670886075949,
"grad_norm": 3.90554141998291,
"learning_rate": 9.405114052059541e-07,
"loss": 0.23852768540382385,
"step": 2604
},
{
"epoch": 5.4978902953586495,
"grad_norm": 6.165667533874512,
"learning_rate": 9.382039188445925e-07,
"loss": 0.05722271651029587,
"step": 2606
},
{
"epoch": 5.5021097046413505,
"grad_norm": 2.0570311546325684,
"learning_rate": 9.359151656782567e-07,
"loss": 0.19151735305786133,
"step": 2608
},
{
"epoch": 5.506329113924051,
"grad_norm": 1.9145231246948242,
"learning_rate": 9.336451580923262e-07,
"loss": 0.03127627447247505,
"step": 2610
},
{
"epoch": 5.510548523206751,
"grad_norm": 3.4458625316619873,
"learning_rate": 9.313939083707413e-07,
"loss": 0.1748735010623932,
"step": 2612
},
{
"epoch": 5.514767932489452,
"grad_norm": 3.4101099967956543,
"learning_rate": 9.291614286959349e-07,
"loss": 0.382763147354126,
"step": 2614
},
{
"epoch": 5.518987341772152,
"grad_norm": 0.40910443663597107,
"learning_rate": 9.269477311487686e-07,
"loss": 0.1556778848171234,
"step": 2616
},
{
"epoch": 5.523206751054852,
"grad_norm": 3.5669357776641846,
"learning_rate": 9.247528277084645e-07,
"loss": 0.1594393253326416,
"step": 2618
},
{
"epoch": 5.527426160337553,
"grad_norm": 3.370866537094116,
"learning_rate": 9.225767302525441e-07,
"loss": 0.4137956500053406,
"step": 2620
},
{
"epoch": 5.531645569620253,
"grad_norm": 0.21743591129779816,
"learning_rate": 9.20419450556761e-07,
"loss": 0.4230045676231384,
"step": 2622
},
{
"epoch": 5.5358649789029535,
"grad_norm": 2.6428186893463135,
"learning_rate": 9.182810002950378e-07,
"loss": 0.42899954319000244,
"step": 2624
},
{
"epoch": 5.540084388185654,
"grad_norm": 6.077566623687744,
"learning_rate": 9.16161391039404e-07,
"loss": 0.20327959954738617,
"step": 2626
},
{
"epoch": 5.544303797468355,
"grad_norm": 5.41641902923584,
"learning_rate": 9.140606342599332e-07,
"loss": 0.44856715202331543,
"step": 2628
},
{
"epoch": 5.548523206751055,
"grad_norm": 4.853996753692627,
"learning_rate": 9.119787413246795e-07,
"loss": 0.4271373748779297,
"step": 2630
},
{
"epoch": 5.552742616033755,
"grad_norm": 4.13206672668457,
"learning_rate": 9.099157234996173e-07,
"loss": 0.23560848832130432,
"step": 2632
},
{
"epoch": 5.556962025316456,
"grad_norm": 2.2997729778289795,
"learning_rate": 9.078715919485798e-07,
"loss": 0.23265743255615234,
"step": 2634
},
{
"epoch": 5.561181434599156,
"grad_norm": 1.88694167137146,
"learning_rate": 9.058463577331999e-07,
"loss": 0.15787991881370544,
"step": 2636
},
{
"epoch": 5.565400843881856,
"grad_norm": 2.646512746810913,
"learning_rate": 9.03840031812848e-07,
"loss": 0.1242508590221405,
"step": 2638
},
{
"epoch": 5.569620253164557,
"grad_norm": 0.22765684127807617,
"learning_rate": 9.018526250445747e-07,
"loss": 0.07518874108791351,
"step": 2640
},
{
"epoch": 5.5738396624472575,
"grad_norm": 2.321202278137207,
"learning_rate": 8.998841481830515e-07,
"loss": 0.30490678548812866,
"step": 2642
},
{
"epoch": 5.578059071729958,
"grad_norm": 4.0073418617248535,
"learning_rate": 8.97934611880512e-07,
"loss": 0.611823558807373,
"step": 2644
},
{
"epoch": 5.582278481012658,
"grad_norm": 18.506275177001953,
"learning_rate": 8.960040266866948e-07,
"loss": 0.3300861120223999,
"step": 2646
},
{
"epoch": 5.586497890295359,
"grad_norm": 2.6837949752807617,
"learning_rate": 8.94092403048786e-07,
"loss": 0.018273882567882538,
"step": 2648
},
{
"epoch": 5.590717299578059,
"grad_norm": 2.3034257888793945,
"learning_rate": 8.921997513113637e-07,
"loss": 0.25158876180648804,
"step": 2650
},
{
"epoch": 5.594936708860759,
"grad_norm": 2.5194528102874756,
"learning_rate": 8.903260817163402e-07,
"loss": 0.18762826919555664,
"step": 2652
},
{
"epoch": 5.59915611814346,
"grad_norm": 4.4369096755981445,
"learning_rate": 8.884714044029092e-07,
"loss": 0.06930024921894073,
"step": 2654
},
{
"epoch": 5.60337552742616,
"grad_norm": 1.652228832244873,
"learning_rate": 8.86635729407488e-07,
"loss": 0.2389906346797943,
"step": 2656
},
{
"epoch": 5.6075949367088604,
"grad_norm": 5.807703971862793,
"learning_rate": 8.848190666636651e-07,
"loss": 0.10554240643978119,
"step": 2658
},
{
"epoch": 5.6118143459915615,
"grad_norm": 3.1773393154144287,
"learning_rate": 8.830214260021459e-07,
"loss": 0.16849491000175476,
"step": 2660
},
{
"epoch": 5.616033755274262,
"grad_norm": 23.979202270507812,
"learning_rate": 8.812428171506998e-07,
"loss": 0.04333914816379547,
"step": 2662
},
{
"epoch": 5.620253164556962,
"grad_norm": 0.8357203602790833,
"learning_rate": 8.794832497341065e-07,
"loss": 0.30027642846107483,
"step": 2664
},
{
"epoch": 5.624472573839663,
"grad_norm": 2.281597375869751,
"learning_rate": 8.77742733274106e-07,
"loss": 0.027927353978157043,
"step": 2666
},
{
"epoch": 5.628691983122363,
"grad_norm": 2.4539661407470703,
"learning_rate": 8.760212771893442e-07,
"loss": 0.10624615103006363,
"step": 2668
},
{
"epoch": 5.632911392405063,
"grad_norm": 0.6876718401908875,
"learning_rate": 8.743188907953251e-07,
"loss": 0.3938605487346649,
"step": 2670
},
{
"epoch": 5.637130801687764,
"grad_norm": 3.4516823291778564,
"learning_rate": 8.726355833043575e-07,
"loss": 0.3330395519733429,
"step": 2672
},
{
"epoch": 5.641350210970464,
"grad_norm": 3.539989471435547,
"learning_rate": 8.709713638255074e-07,
"loss": 0.6006532907485962,
"step": 2674
},
{
"epoch": 5.6455696202531644,
"grad_norm": 3.0337464809417725,
"learning_rate": 8.693262413645464e-07,
"loss": 0.3575003445148468,
"step": 2676
},
{
"epoch": 5.649789029535865,
"grad_norm": 3.694211721420288,
"learning_rate": 8.677002248239066e-07,
"loss": 0.2969297766685486,
"step": 2678
},
{
"epoch": 5.654008438818566,
"grad_norm": 1.0974704027175903,
"learning_rate": 8.660933230026276e-07,
"loss": 0.05868370085954666,
"step": 2680
},
{
"epoch": 5.658227848101266,
"grad_norm": 2.3777191638946533,
"learning_rate": 8.645055445963135e-07,
"loss": 0.31508857011795044,
"step": 2682
},
{
"epoch": 5.662447257383966,
"grad_norm": 0.7402134537696838,
"learning_rate": 8.629368981970822e-07,
"loss": 0.04464399069547653,
"step": 2684
},
{
"epoch": 5.666666666666667,
"grad_norm": 1.678879737854004,
"learning_rate": 8.613873922935217e-07,
"loss": 0.4207780957221985,
"step": 2686
},
{
"epoch": 5.670886075949367,
"grad_norm": 0.16458748281002045,
"learning_rate": 8.598570352706425e-07,
"loss": 0.33038753271102905,
"step": 2688
},
{
"epoch": 5.675105485232067,
"grad_norm": 2.9141294956207275,
"learning_rate": 8.583458354098318e-07,
"loss": 0.471355140209198,
"step": 2690
},
{
"epoch": 5.679324894514768,
"grad_norm": 2.882704973220825,
"learning_rate": 8.56853800888812e-07,
"loss": 0.01554950326681137,
"step": 2692
},
{
"epoch": 5.6835443037974684,
"grad_norm": 2.264005422592163,
"learning_rate": 8.553809397815909e-07,
"loss": 0.5948341488838196,
"step": 2694
},
{
"epoch": 5.687763713080169,
"grad_norm": 5.996740341186523,
"learning_rate": 8.539272600584227e-07,
"loss": 0.2293516844511032,
"step": 2696
},
{
"epoch": 5.691983122362869,
"grad_norm": 3.936495542526245,
"learning_rate": 8.524927695857636e-07,
"loss": 0.4448416829109192,
"step": 2698
},
{
"epoch": 5.69620253164557,
"grad_norm": 2.383849859237671,
"learning_rate": 8.510774761262285e-07,
"loss": 0.3430967926979065,
"step": 2700
},
{
"epoch": 5.70042194092827,
"grad_norm": 3.2722134590148926,
"learning_rate": 8.496813873385494e-07,
"loss": 0.38816744089126587,
"step": 2702
},
{
"epoch": 5.70464135021097,
"grad_norm": 0.12035968899726868,
"learning_rate": 8.483045107775337e-07,
"loss": 0.2644461393356323,
"step": 2704
},
{
"epoch": 5.708860759493671,
"grad_norm": 0.15902626514434814,
"learning_rate": 8.469468538940241e-07,
"loss": 0.15841832756996155,
"step": 2706
},
{
"epoch": 5.713080168776371,
"grad_norm": 2.9765915870666504,
"learning_rate": 8.456084240348575e-07,
"loss": 0.03421106934547424,
"step": 2708
},
{
"epoch": 5.717299578059071,
"grad_norm": 0.5361355543136597,
"learning_rate": 8.44289228442825e-07,
"loss": 0.21108713746070862,
"step": 2710
},
{
"epoch": 5.7215189873417724,
"grad_norm": 3.1111233234405518,
"learning_rate": 8.429892742566344e-07,
"loss": 0.38604629039764404,
"step": 2712
},
{
"epoch": 5.725738396624473,
"grad_norm": 2.709472417831421,
"learning_rate": 8.417085685108695e-07,
"loss": 0.4284287095069885,
"step": 2714
},
{
"epoch": 5.729957805907173,
"grad_norm": 2.9085452556610107,
"learning_rate": 8.404471181359526e-07,
"loss": 0.2729555666446686,
"step": 2716
},
{
"epoch": 5.734177215189874,
"grad_norm": 13.333749771118164,
"learning_rate": 8.392049299581083e-07,
"loss": 0.49384695291519165,
"step": 2718
},
{
"epoch": 5.738396624472574,
"grad_norm": 2.3657379150390625,
"learning_rate": 8.379820106993253e-07,
"loss": 0.42707446217536926,
"step": 2720
},
{
"epoch": 5.742616033755274,
"grad_norm": 4.9322943687438965,
"learning_rate": 8.367783669773196e-07,
"loss": 0.4772263467311859,
"step": 2722
},
{
"epoch": 5.746835443037975,
"grad_norm": 2.417219638824463,
"learning_rate": 8.355940053054999e-07,
"loss": 0.11725395172834396,
"step": 2724
},
{
"epoch": 5.751054852320675,
"grad_norm": 3.8851850032806396,
"learning_rate": 8.344289320929321e-07,
"loss": 0.3932940363883972,
"step": 2726
},
{
"epoch": 5.755274261603375,
"grad_norm": 2.6408393383026123,
"learning_rate": 8.332831536443035e-07,
"loss": 0.3797783851623535,
"step": 2728
},
{
"epoch": 5.759493670886076,
"grad_norm": 4.485612392425537,
"learning_rate": 8.321566761598909e-07,
"loss": 0.28436335921287537,
"step": 2730
},
{
"epoch": 5.763713080168777,
"grad_norm": 2.002480983734131,
"learning_rate": 8.310495057355242e-07,
"loss": 0.5089020729064941,
"step": 2732
},
{
"epoch": 5.767932489451477,
"grad_norm": 2.7712652683258057,
"learning_rate": 8.299616483625561e-07,
"loss": 0.09954804182052612,
"step": 2734
},
{
"epoch": 5.772151898734177,
"grad_norm": 8.521866798400879,
"learning_rate": 8.288931099278275e-07,
"loss": 0.28571265935897827,
"step": 2736
},
{
"epoch": 5.776371308016878,
"grad_norm": 3.3578455448150635,
"learning_rate": 8.27843896213637e-07,
"loss": 0.4965103268623352,
"step": 2738
},
{
"epoch": 5.780590717299578,
"grad_norm": 2.0499563217163086,
"learning_rate": 8.2681401289771e-07,
"loss": 0.2735576629638672,
"step": 2740
},
{
"epoch": 5.784810126582278,
"grad_norm": 3.7373149394989014,
"learning_rate": 8.258034655531661e-07,
"loss": 0.4888134002685547,
"step": 2742
},
{
"epoch": 5.789029535864979,
"grad_norm": 2.028367757797241,
"learning_rate": 8.248122596484903e-07,
"loss": 0.16572898626327515,
"step": 2744
},
{
"epoch": 5.793248945147679,
"grad_norm": 2.7090587615966797,
"learning_rate": 8.23840400547503e-07,
"loss": 0.2500470280647278,
"step": 2746
},
{
"epoch": 5.7974683544303796,
"grad_norm": 5.770694255828857,
"learning_rate": 8.228878935093327e-07,
"loss": 0.6361812949180603,
"step": 2748
},
{
"epoch": 5.80168776371308,
"grad_norm": 5.137115478515625,
"learning_rate": 8.219547436883832e-07,
"loss": 0.25070175528526306,
"step": 2750
},
{
"epoch": 5.805907172995781,
"grad_norm": 7.0401716232299805,
"learning_rate": 8.210409561343112e-07,
"loss": 0.4003854990005493,
"step": 2752
},
{
"epoch": 5.810126582278481,
"grad_norm": 8.947293281555176,
"learning_rate": 8.201465357919941e-07,
"loss": 0.5776923894882202,
"step": 2754
},
{
"epoch": 5.814345991561181,
"grad_norm": 2.4191689491271973,
"learning_rate": 8.192714875015071e-07,
"loss": 0.21931633353233337,
"step": 2756
},
{
"epoch": 5.818565400843882,
"grad_norm": 3.477417469024658,
"learning_rate": 8.184158159980942e-07,
"loss": 0.034300077706575394,
"step": 2758
},
{
"epoch": 5.822784810126582,
"grad_norm": 2.9756200313568115,
"learning_rate": 8.175795259121438e-07,
"loss": 0.38680142164230347,
"step": 2760
},
{
"epoch": 5.827004219409282,
"grad_norm": 13.028040885925293,
"learning_rate": 8.167626217691641e-07,
"loss": 0.3836379647254944,
"step": 2762
},
{
"epoch": 5.831223628691983,
"grad_norm": 8.563456535339355,
"learning_rate": 8.15965107989757e-07,
"loss": 0.17900025844573975,
"step": 2764
},
{
"epoch": 5.8354430379746836,
"grad_norm": 3.2769968509674072,
"learning_rate": 8.151869888895971e-07,
"loss": 0.41699984669685364,
"step": 2766
},
{
"epoch": 5.839662447257384,
"grad_norm": 4.447607040405273,
"learning_rate": 8.144282686794042e-07,
"loss": 0.3173035979270935,
"step": 2768
},
{
"epoch": 5.843881856540085,
"grad_norm": 3.0378546714782715,
"learning_rate": 8.136889514649242e-07,
"loss": 0.40285831689834595,
"step": 2770
},
{
"epoch": 5.848101265822785,
"grad_norm": 2.4900171756744385,
"learning_rate": 8.129690412469046e-07,
"loss": 0.2346557378768921,
"step": 2772
},
{
"epoch": 5.852320675105485,
"grad_norm": 4.628073215484619,
"learning_rate": 8.122685419210748e-07,
"loss": 0.34134355187416077,
"step": 2774
},
{
"epoch": 5.856540084388186,
"grad_norm": 2.439875602722168,
"learning_rate": 8.11587457278123e-07,
"loss": 0.32176950573921204,
"step": 2776
},
{
"epoch": 5.860759493670886,
"grad_norm": 2.549513816833496,
"learning_rate": 8.109257910036767e-07,
"loss": 0.4297516345977783,
"step": 2778
},
{
"epoch": 5.864978902953586,
"grad_norm": 2.4646005630493164,
"learning_rate": 8.102835466782829e-07,
"loss": 0.3611939251422882,
"step": 2780
},
{
"epoch": 5.869198312236287,
"grad_norm": 2.1484479904174805,
"learning_rate": 8.096607277773885e-07,
"loss": 0.3919060528278351,
"step": 2782
},
{
"epoch": 5.8734177215189876,
"grad_norm": 4.247211456298828,
"learning_rate": 8.090573376713214e-07,
"loss": 0.4738028943538666,
"step": 2784
},
{
"epoch": 5.877637130801688,
"grad_norm": 12.467236518859863,
"learning_rate": 8.084733796252727e-07,
"loss": 0.14323553442955017,
"step": 2786
},
{
"epoch": 5.881856540084388,
"grad_norm": 2.3769068717956543,
"learning_rate": 8.079088567992778e-07,
"loss": 0.3547300100326538,
"step": 2788
},
{
"epoch": 5.886075949367089,
"grad_norm": 1.2352712154388428,
"learning_rate": 8.073637722482008e-07,
"loss": 0.028360096737742424,
"step": 2790
},
{
"epoch": 5.890295358649789,
"grad_norm": 2.2478432655334473,
"learning_rate": 8.068381289217173e-07,
"loss": 0.13877378404140472,
"step": 2792
},
{
"epoch": 5.894514767932489,
"grad_norm": 2.8125381469726562,
"learning_rate": 8.063319296642983e-07,
"loss": 0.40060657262802124,
"step": 2794
},
{
"epoch": 5.89873417721519,
"grad_norm": 5.122012615203857,
"learning_rate": 8.058451772151953e-07,
"loss": 0.40660685300827026,
"step": 2796
},
{
"epoch": 5.90295358649789,
"grad_norm": 3.058626890182495,
"learning_rate": 8.05377874208425e-07,
"loss": 0.2716779410839081,
"step": 2798
},
{
"epoch": 5.9071729957805905,
"grad_norm": 3.8683018684387207,
"learning_rate": 8.049300231727548e-07,
"loss": 0.3559970259666443,
"step": 2800
},
{
"epoch": 5.911392405063291,
"grad_norm": 3.477356433868408,
"learning_rate": 8.045016265316904e-07,
"loss": 0.29196757078170776,
"step": 2802
},
{
"epoch": 5.915611814345992,
"grad_norm": 3.347072124481201,
"learning_rate": 8.04092686603461e-07,
"loss": 0.24977904558181763,
"step": 2804
},
{
"epoch": 5.919831223628692,
"grad_norm": 2.251091957092285,
"learning_rate": 8.037032056010077e-07,
"loss": 0.03224069997668266,
"step": 2806
},
{
"epoch": 5.924050632911392,
"grad_norm": 7.518223762512207,
"learning_rate": 8.03333185631972e-07,
"loss": 0.3484126329421997,
"step": 2808
},
{
"epoch": 5.928270042194093,
"grad_norm": 3.4275622367858887,
"learning_rate": 8.02982628698683e-07,
"loss": 0.059894442558288574,
"step": 2810
},
{
"epoch": 5.932489451476793,
"grad_norm": 2.064310073852539,
"learning_rate": 8.026515366981481e-07,
"loss": 0.12616072595119476,
"step": 2812
},
{
"epoch": 5.936708860759493,
"grad_norm": 0.0712597668170929,
"learning_rate": 8.023399114220411e-07,
"loss": 0.2311958521604538,
"step": 2814
},
{
"epoch": 5.940928270042194,
"grad_norm": 2.0846590995788574,
"learning_rate": 8.020477545566941e-07,
"loss": 0.27708864212036133,
"step": 2816
},
{
"epoch": 5.9451476793248945,
"grad_norm": 2.385714530944824,
"learning_rate": 8.017750676830876e-07,
"loss": 0.19477054476737976,
"step": 2818
},
{
"epoch": 5.949367088607595,
"grad_norm": 0.2615872025489807,
"learning_rate": 8.015218522768414e-07,
"loss": 0.12333428859710693,
"step": 2820
},
{
"epoch": 5.953586497890296,
"grad_norm": 2.914166212081909,
"learning_rate": 8.012881097082082e-07,
"loss": 0.3903350234031677,
"step": 2822
},
{
"epoch": 5.957805907172996,
"grad_norm": 2.832524538040161,
"learning_rate": 8.010738412420643e-07,
"loss": 0.25948387384414673,
"step": 2824
},
{
"epoch": 5.962025316455696,
"grad_norm": 2.6522233486175537,
"learning_rate": 8.008790480379041e-07,
"loss": 0.42445188760757446,
"step": 2826
},
{
"epoch": 5.966244725738397,
"grad_norm": 0.424772173166275,
"learning_rate": 8.007037311498337e-07,
"loss": 0.18149511516094208,
"step": 2828
},
{
"epoch": 5.970464135021097,
"grad_norm": 2.7508718967437744,
"learning_rate": 8.005478915265643e-07,
"loss": 0.08192159235477448,
"step": 2830
},
{
"epoch": 5.974683544303797,
"grad_norm": 7.296056270599365,
"learning_rate": 8.004115300114071e-07,
"loss": 0.3574886918067932,
"step": 2832
},
{
"epoch": 5.978902953586498,
"grad_norm": 0.17789922654628754,
"learning_rate": 8.002946473422713e-07,
"loss": 0.2169741988182068,
"step": 2834
},
{
"epoch": 5.9831223628691985,
"grad_norm": 5.66543436050415,
"learning_rate": 8.001972441516558e-07,
"loss": 0.06217677891254425,
"step": 2836
},
{
"epoch": 5.987341772151899,
"grad_norm": 2.6673474311828613,
"learning_rate": 8.001193209666501e-07,
"loss": 0.3183894753456116,
"step": 2838
},
{
"epoch": 5.991561181434599,
"grad_norm": 0.4517356753349304,
"learning_rate": 8.000608782089275e-07,
"loss": 0.26500433683395386,
"step": 2840
},
{
"epoch": 5.9957805907173,
"grad_norm": 13.812853813171387,
"learning_rate": 8.000219161947466e-07,
"loss": 0.4387038052082062,
"step": 2842
},
{
"epoch": 6.0,
"grad_norm": 2.1090548038482666,
"learning_rate": 8.000024351349457e-07,
"loss": 0.4745343327522278,
"step": 2844
},
{
"epoch": 6.0,
"step": 2844,
"total_flos": 5.392281114922451e+18,
"train_loss": 0.6896465788091076,
"train_runtime": 6887.9733,
"train_samples_per_second": 12.387,
"train_steps_per_second": 0.413
}
],
"logging_steps": 2,
"max_steps": 2844,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.392281114922451e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}