{
"best_metric": 0.009820309467613697,
"best_model_checkpoint": "/workspace/previous_works/RadFM/output/RadFM-Llama3-8B-pretrain-0002-embed_tokens-depth32-lora-10ep/checkpoint-10000",
"epoch": 3.0,
"eval_steps": 10000,
"global_step": 14319,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0031426775612822125,
"grad_norm": 38.333740234375,
"learning_rate": 3.488372093023256e-06,
"loss": 2.6324,
"step": 15
},
{
"epoch": 0.006285355122564425,
"grad_norm": 23.8914794921875,
"learning_rate": 6.976744186046512e-06,
"loss": 2.3565,
"step": 30
},
{
"epoch": 0.009428032683846637,
"grad_norm": 6.890503883361816,
"learning_rate": 1.0465116279069768e-05,
"loss": 1.8897,
"step": 45
},
{
"epoch": 0.01257071024512885,
"grad_norm": 3.9464468955993652,
"learning_rate": 1.3953488372093024e-05,
"loss": 1.3707,
"step": 60
},
{
"epoch": 0.01571338780641106,
"grad_norm": 4.443431854248047,
"learning_rate": 1.744186046511628e-05,
"loss": 1.055,
"step": 75
},
{
"epoch": 0.018856065367693273,
"grad_norm": 3.5747361183166504,
"learning_rate": 2.0930232558139536e-05,
"loss": 0.9048,
"step": 90
},
{
"epoch": 0.02199874292897549,
"grad_norm": 4.540731430053711,
"learning_rate": 2.441860465116279e-05,
"loss": 0.9143,
"step": 105
},
{
"epoch": 0.0251414204902577,
"grad_norm": 4.121450424194336,
"learning_rate": 2.7906976744186048e-05,
"loss": 0.7641,
"step": 120
},
{
"epoch": 0.028284098051539912,
"grad_norm": 3.1179299354553223,
"learning_rate": 3.13953488372093e-05,
"loss": 0.7784,
"step": 135
},
{
"epoch": 0.03142677561282212,
"grad_norm": 2.9703869819641113,
"learning_rate": 3.488372093023256e-05,
"loss": 0.7299,
"step": 150
},
{
"epoch": 0.034569453174104335,
"grad_norm": 2.706854820251465,
"learning_rate": 3.837209302325582e-05,
"loss": 0.6778,
"step": 165
},
{
"epoch": 0.03771213073538655,
"grad_norm": 3.361267328262329,
"learning_rate": 4.186046511627907e-05,
"loss": 0.7222,
"step": 180
},
{
"epoch": 0.04085480829666876,
"grad_norm": 4.040229797363281,
"learning_rate": 4.5348837209302326e-05,
"loss": 0.6684,
"step": 195
},
{
"epoch": 0.04399748585795098,
"grad_norm": 2.817627429962158,
"learning_rate": 4.883720930232558e-05,
"loss": 0.7458,
"step": 210
},
{
"epoch": 0.04714016341923319,
"grad_norm": 2.8800182342529297,
"learning_rate": 5.232558139534884e-05,
"loss": 0.6338,
"step": 225
},
{
"epoch": 0.0502828409805154,
"grad_norm": 2.436993360519409,
"learning_rate": 5.5813953488372095e-05,
"loss": 0.6299,
"step": 240
},
{
"epoch": 0.05342551854179761,
"grad_norm": 3.5814456939697266,
"learning_rate": 5.9302325581395356e-05,
"loss": 0.5728,
"step": 255
},
{
"epoch": 0.056568196103079824,
"grad_norm": 2.8744938373565674,
"learning_rate": 6.27906976744186e-05,
"loss": 0.59,
"step": 270
},
{
"epoch": 0.059710873664362035,
"grad_norm": 2.679749011993408,
"learning_rate": 6.627906976744186e-05,
"loss": 0.6016,
"step": 285
},
{
"epoch": 0.06285355122564425,
"grad_norm": 3.1333463191986084,
"learning_rate": 6.976744186046513e-05,
"loss": 0.6569,
"step": 300
},
{
"epoch": 0.06599622878692646,
"grad_norm": 2.2865939140319824,
"learning_rate": 7.325581395348837e-05,
"loss": 0.6385,
"step": 315
},
{
"epoch": 0.06913890634820867,
"grad_norm": 2.9787251949310303,
"learning_rate": 7.674418604651163e-05,
"loss": 0.6307,
"step": 330
},
{
"epoch": 0.07228158390949088,
"grad_norm": 2.078509569168091,
"learning_rate": 8.023255813953489e-05,
"loss": 0.5454,
"step": 345
},
{
"epoch": 0.0754242614707731,
"grad_norm": 2.6606740951538086,
"learning_rate": 8.372093023255814e-05,
"loss": 0.6211,
"step": 360
},
{
"epoch": 0.0785669390320553,
"grad_norm": 1.9346429109573364,
"learning_rate": 8.72093023255814e-05,
"loss": 0.5954,
"step": 375
},
{
"epoch": 0.08170961659333752,
"grad_norm": 2.2432360649108887,
"learning_rate": 9.069767441860465e-05,
"loss": 0.5385,
"step": 390
},
{
"epoch": 0.08485229415461974,
"grad_norm": 2.1645498275756836,
"learning_rate": 9.418604651162792e-05,
"loss": 0.592,
"step": 405
},
{
"epoch": 0.08799497171590195,
"grad_norm": 2.1806533336639404,
"learning_rate": 9.767441860465116e-05,
"loss": 0.5372,
"step": 420
},
{
"epoch": 0.09113764927718417,
"grad_norm": 2.445610761642456,
"learning_rate": 9.999996802299678e-05,
"loss": 0.6487,
"step": 435
},
{
"epoch": 0.09428032683846638,
"grad_norm": 2.3592734336853027,
"learning_rate": 9.999948836876656e-05,
"loss": 0.5957,
"step": 450
},
{
"epoch": 0.09742300439974859,
"grad_norm": 2.3027069568634033,
"learning_rate": 9.999843313485898e-05,
"loss": 0.5835,
"step": 465
},
{
"epoch": 0.1005656819610308,
"grad_norm": 2.6429057121276855,
"learning_rate": 9.999680233342161e-05,
"loss": 0.592,
"step": 480
},
{
"epoch": 0.10370835952231301,
"grad_norm": 2.0832202434539795,
"learning_rate": 9.999459598322778e-05,
"loss": 0.6203,
"step": 495
},
{
"epoch": 0.10685103708359522,
"grad_norm": 2.481870412826538,
"learning_rate": 9.999181410967633e-05,
"loss": 0.5428,
"step": 510
},
{
"epoch": 0.10999371464487744,
"grad_norm": 1.9621151685714722,
"learning_rate": 9.99884567447914e-05,
"loss": 0.6101,
"step": 525
},
{
"epoch": 0.11313639220615965,
"grad_norm": 2.8833186626434326,
"learning_rate": 9.998452392722198e-05,
"loss": 0.5577,
"step": 540
},
{
"epoch": 0.11627906976744186,
"grad_norm": 2.4447429180145264,
"learning_rate": 9.998001570224158e-05,
"loss": 0.566,
"step": 555
},
{
"epoch": 0.11942174732872407,
"grad_norm": 2.141496419906616,
"learning_rate": 9.997493212174753e-05,
"loss": 0.6211,
"step": 570
},
{
"epoch": 0.12256442489000628,
"grad_norm": 2.389796495437622,
"learning_rate": 9.996927324426057e-05,
"loss": 0.5937,
"step": 585
},
{
"epoch": 0.1257071024512885,
"grad_norm": 2.1194262504577637,
"learning_rate": 9.996303913492408e-05,
"loss": 0.5847,
"step": 600
},
{
"epoch": 0.12884978001257072,
"grad_norm": 1.7767274379730225,
"learning_rate": 9.99562298655033e-05,
"loss": 0.518,
"step": 615
},
{
"epoch": 0.13199245757385292,
"grad_norm": 2.0348453521728516,
"learning_rate": 9.994884551438458e-05,
"loss": 0.5941,
"step": 630
},
{
"epoch": 0.13513513513513514,
"grad_norm": 1.443819284439087,
"learning_rate": 9.994088616657444e-05,
"loss": 0.5022,
"step": 645
},
{
"epoch": 0.13827781269641734,
"grad_norm": 2.1748251914978027,
"learning_rate": 9.993235191369861e-05,
"loss": 0.5369,
"step": 660
},
{
"epoch": 0.14142049025769957,
"grad_norm": 1.9295774698257446,
"learning_rate": 9.99232428540009e-05,
"loss": 0.607,
"step": 675
},
{
"epoch": 0.14456316781898176,
"grad_norm": 1.7530088424682617,
"learning_rate": 9.991355909234224e-05,
"loss": 0.5417,
"step": 690
},
{
"epoch": 0.147705845380264,
"grad_norm": 10.02226448059082,
"learning_rate": 9.990330074019925e-05,
"loss": 0.5901,
"step": 705
},
{
"epoch": 0.1508485229415462,
"grad_norm": 1.3864644765853882,
"learning_rate": 9.989246791566314e-05,
"loss": 0.678,
"step": 720
},
{
"epoch": 0.1539912005028284,
"grad_norm": 1.6103929281234741,
"learning_rate": 9.988106074343823e-05,
"loss": 0.4741,
"step": 735
},
{
"epoch": 0.1571338780641106,
"grad_norm": 1.5933347940444946,
"learning_rate": 9.986907935484064e-05,
"loss": 0.5391,
"step": 750
},
{
"epoch": 0.16027655562539284,
"grad_norm": 1.5971338748931885,
"learning_rate": 9.985652388779663e-05,
"loss": 0.5782,
"step": 765
},
{
"epoch": 0.16341923318667503,
"grad_norm": 1.559793472290039,
"learning_rate": 9.984339448684113e-05,
"loss": 0.5227,
"step": 780
},
{
"epoch": 0.16656191074795726,
"grad_norm": 1.3077164888381958,
"learning_rate": 9.982969130311597e-05,
"loss": 0.5203,
"step": 795
},
{
"epoch": 0.16970458830923948,
"grad_norm": 1.6828336715698242,
"learning_rate": 9.98154144943683e-05,
"loss": 0.5471,
"step": 810
},
{
"epoch": 0.17284726587052168,
"grad_norm": 1.387099266052246,
"learning_rate": 9.98005642249486e-05,
"loss": 0.5399,
"step": 825
},
{
"epoch": 0.1759899434318039,
"grad_norm": 1.723253607749939,
"learning_rate": 9.978514066580886e-05,
"loss": 0.5606,
"step": 840
},
{
"epoch": 0.1791326209930861,
"grad_norm": 1.22931706905365,
"learning_rate": 9.976914399450068e-05,
"loss": 0.5024,
"step": 855
},
{
"epoch": 0.18227529855436833,
"grad_norm": 1.4278538227081299,
"learning_rate": 9.97525743951731e-05,
"loss": 0.5983,
"step": 870
},
{
"epoch": 0.18541797611565053,
"grad_norm": 1.4029372930526733,
"learning_rate": 9.973543205857057e-05,
"loss": 0.5699,
"step": 885
},
{
"epoch": 0.18856065367693275,
"grad_norm": 1.3018133640289307,
"learning_rate": 9.971771718203072e-05,
"loss": 0.4936,
"step": 900
},
{
"epoch": 0.19170333123821495,
"grad_norm": 1.3082265853881836,
"learning_rate": 9.969942996948209e-05,
"loss": 0.5025,
"step": 915
},
{
"epoch": 0.19484600879949718,
"grad_norm": 1.2923167943954468,
"learning_rate": 9.968057063144182e-05,
"loss": 0.5779,
"step": 930
},
{
"epoch": 0.19798868636077938,
"grad_norm": 1.2902971506118774,
"learning_rate": 9.966113938501313e-05,
"loss": 0.5373,
"step": 945
},
{
"epoch": 0.2011313639220616,
"grad_norm": 1.391560673713684,
"learning_rate": 9.964113645388293e-05,
"loss": 0.5858,
"step": 960
},
{
"epoch": 0.2042740414833438,
"grad_norm": 1.3245513439178467,
"learning_rate": 9.96205620683192e-05,
"loss": 0.6043,
"step": 975
},
{
"epoch": 0.20741671904462602,
"grad_norm": 1.4998241662979126,
"learning_rate": 9.95994164651683e-05,
"loss": 0.5785,
"step": 990
},
{
"epoch": 0.21055939660590822,
"grad_norm": 1.090804934501648,
"learning_rate": 9.957769988785236e-05,
"loss": 0.6439,
"step": 1005
},
{
"epoch": 0.21370207416719045,
"grad_norm": 1.1564654111862183,
"learning_rate": 9.955541258636631e-05,
"loss": 0.5091,
"step": 1020
},
{
"epoch": 0.21684475172847265,
"grad_norm": 1.1778066158294678,
"learning_rate": 9.953255481727513e-05,
"loss": 0.5456,
"step": 1035
},
{
"epoch": 0.21998742928975487,
"grad_norm": 1.3568626642227173,
"learning_rate": 9.950912684371088e-05,
"loss": 0.5208,
"step": 1050
},
{
"epoch": 0.2231301068510371,
"grad_norm": 1.804425597190857,
"learning_rate": 9.948512893536961e-05,
"loss": 0.4956,
"step": 1065
},
{
"epoch": 0.2262727844123193,
"grad_norm": 1.226159930229187,
"learning_rate": 9.946056136850833e-05,
"loss": 0.5812,
"step": 1080
},
{
"epoch": 0.22941546197360152,
"grad_norm": 1.1530790328979492,
"learning_rate": 9.943542442594177e-05,
"loss": 0.4742,
"step": 1095
},
{
"epoch": 0.23255813953488372,
"grad_norm": 1.390417218208313,
"learning_rate": 9.940971839703916e-05,
"loss": 0.619,
"step": 1110
},
{
"epoch": 0.23570081709616594,
"grad_norm": 1.4010789394378662,
"learning_rate": 9.938344357772087e-05,
"loss": 0.6086,
"step": 1125
},
{
"epoch": 0.23884349465744814,
"grad_norm": 1.6488044261932373,
"learning_rate": 9.935660027045506e-05,
"loss": 0.551,
"step": 1140
},
{
"epoch": 0.24198617221873037,
"grad_norm": 1.0560044050216675,
"learning_rate": 9.932918878425412e-05,
"loss": 0.532,
"step": 1155
},
{
"epoch": 0.24512884978001256,
"grad_norm": 1.0651888847351074,
"learning_rate": 9.930120943467117e-05,
"loss": 0.5012,
"step": 1170
},
{
"epoch": 0.2482715273412948,
"grad_norm": 1.0553079843521118,
"learning_rate": 9.927266254379642e-05,
"loss": 0.5576,
"step": 1185
},
{
"epoch": 0.251414204902577,
"grad_norm": 1.007480263710022,
"learning_rate": 9.924354844025339e-05,
"loss": 0.4839,
"step": 1200
},
{
"epoch": 0.2545568824638592,
"grad_norm": 1.0924334526062012,
"learning_rate": 9.921386745919528e-05,
"loss": 0.595,
"step": 1215
},
{
"epoch": 0.25769956002514144,
"grad_norm": 1.3309390544891357,
"learning_rate": 9.918361994230097e-05,
"loss": 0.5224,
"step": 1230
},
{
"epoch": 0.2608422375864236,
"grad_norm": 0.9702763557434082,
"learning_rate": 9.915280623777114e-05,
"loss": 0.4871,
"step": 1245
},
{
"epoch": 0.26398491514770583,
"grad_norm": 1.0511876344680786,
"learning_rate": 9.912142670032427e-05,
"loss": 0.5861,
"step": 1260
},
{
"epoch": 0.26712759270898806,
"grad_norm": 1.396050214767456,
"learning_rate": 9.908948169119251e-05,
"loss": 0.4651,
"step": 1275
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.985396683216095,
"learning_rate": 9.905697157811761e-05,
"loss": 0.4302,
"step": 1290
},
{
"epoch": 0.27341294783155246,
"grad_norm": 0.9169828295707703,
"learning_rate": 9.902389673534659e-05,
"loss": 0.5212,
"step": 1305
},
{
"epoch": 0.2765556253928347,
"grad_norm": 0.9107710123062134,
"learning_rate": 9.899025754362751e-05,
"loss": 0.4941,
"step": 1320
},
{
"epoch": 0.2796983029541169,
"grad_norm": 0.9720286726951599,
"learning_rate": 9.8956054390205e-05,
"loss": 0.5169,
"step": 1335
},
{
"epoch": 0.28284098051539913,
"grad_norm": 1.1490366458892822,
"learning_rate": 9.892128766881596e-05,
"loss": 0.4973,
"step": 1350
},
{
"epoch": 0.28598365807668136,
"grad_norm": 1.2628952264785767,
"learning_rate": 9.888595777968479e-05,
"loss": 0.5194,
"step": 1365
},
{
"epoch": 0.2891263356379635,
"grad_norm": 1.1610651016235352,
"learning_rate": 9.885006512951897e-05,
"loss": 0.4994,
"step": 1380
},
{
"epoch": 0.29226901319924575,
"grad_norm": 1.054768681526184,
"learning_rate": 9.881361013150436e-05,
"loss": 0.4664,
"step": 1395
},
{
"epoch": 0.295411690760528,
"grad_norm": 1.0745666027069092,
"learning_rate": 9.877659320530037e-05,
"loss": 0.5306,
"step": 1410
},
{
"epoch": 0.2985543683218102,
"grad_norm": 1.3258591890335083,
"learning_rate": 9.873901477703516e-05,
"loss": 0.5076,
"step": 1425
},
{
"epoch": 0.3016970458830924,
"grad_norm": 1.222783088684082,
"learning_rate": 9.870087527930077e-05,
"loss": 0.4581,
"step": 1440
},
{
"epoch": 0.3048397234443746,
"grad_norm": 0.9374076724052429,
"learning_rate": 9.866217515114805e-05,
"loss": 0.4643,
"step": 1455
},
{
"epoch": 0.3079824010056568,
"grad_norm": 1.3485162258148193,
"learning_rate": 9.862291483808173e-05,
"loss": 0.5551,
"step": 1470
},
{
"epoch": 0.31112507856693905,
"grad_norm": 0.9162548780441284,
"learning_rate": 9.858309479205519e-05,
"loss": 0.5592,
"step": 1485
},
{
"epoch": 0.3142677561282212,
"grad_norm": 1.1385138034820557,
"learning_rate": 9.854271547146531e-05,
"loss": 0.477,
"step": 1500
},
{
"epoch": 0.31741043368950345,
"grad_norm": 1.0023164749145508,
"learning_rate": 9.850177734114718e-05,
"loss": 0.4972,
"step": 1515
},
{
"epoch": 0.32055311125078567,
"grad_norm": 2.540215492248535,
"learning_rate": 9.846028087236873e-05,
"loss": 0.5007,
"step": 1530
},
{
"epoch": 0.3236957888120679,
"grad_norm": 1.2012773752212524,
"learning_rate": 9.841822654282533e-05,
"loss": 0.5481,
"step": 1545
},
{
"epoch": 0.32683846637335007,
"grad_norm": 0.9517608284950256,
"learning_rate": 9.837561483663429e-05,
"loss": 0.567,
"step": 1560
},
{
"epoch": 0.3299811439346323,
"grad_norm": 1.0308321714401245,
"learning_rate": 9.833244624432927e-05,
"loss": 0.4856,
"step": 1575
},
{
"epoch": 0.3331238214959145,
"grad_norm": 1.118574857711792,
"learning_rate": 9.828872126285465e-05,
"loss": 0.465,
"step": 1590
},
{
"epoch": 0.33626649905719674,
"grad_norm": 1.0821537971496582,
"learning_rate": 9.824444039555977e-05,
"loss": 0.4394,
"step": 1605
},
{
"epoch": 0.33940917661847897,
"grad_norm": 0.8795451521873474,
"learning_rate": 9.81996041521932e-05,
"loss": 0.4383,
"step": 1620
},
{
"epoch": 0.34255185417976114,
"grad_norm": 1.1455141305923462,
"learning_rate": 9.815421304889687e-05,
"loss": 0.4805,
"step": 1635
},
{
"epoch": 0.34569453174104336,
"grad_norm": 1.1445369720458984,
"learning_rate": 9.81082676082e-05,
"loss": 0.5315,
"step": 1650
},
{
"epoch": 0.3488372093023256,
"grad_norm": 1.0800312757492065,
"learning_rate": 9.806176835901328e-05,
"loss": 0.5205,
"step": 1665
},
{
"epoch": 0.3519798868636078,
"grad_norm": 0.7038319706916809,
"learning_rate": 9.801471583662263e-05,
"loss": 0.515,
"step": 1680
},
{
"epoch": 0.35512256442489,
"grad_norm": 0.9790651202201843,
"learning_rate": 9.796711058268313e-05,
"loss": 0.504,
"step": 1695
},
{
"epoch": 0.3582652419861722,
"grad_norm": 1.1764894723892212,
"learning_rate": 9.791895314521267e-05,
"loss": 0.4806,
"step": 1710
},
{
"epoch": 0.36140791954745444,
"grad_norm": 0.9900022745132446,
"learning_rate": 9.787024407858582e-05,
"loss": 0.5358,
"step": 1725
},
{
"epoch": 0.36455059710873666,
"grad_norm": 0.8621386289596558,
"learning_rate": 9.782098394352725e-05,
"loss": 0.5494,
"step": 1740
},
{
"epoch": 0.36769327467001883,
"grad_norm": 0.8717844486236572,
"learning_rate": 9.777117330710547e-05,
"loss": 0.4967,
"step": 1755
},
{
"epoch": 0.37083595223130106,
"grad_norm": 0.9800569415092468,
"learning_rate": 9.772081274272611e-05,
"loss": 0.4538,
"step": 1770
},
{
"epoch": 0.3739786297925833,
"grad_norm": 0.9540134072303772,
"learning_rate": 9.766990283012544e-05,
"loss": 0.5149,
"step": 1785
},
{
"epoch": 0.3771213073538655,
"grad_norm": 1.0856047868728638,
"learning_rate": 9.761844415536372e-05,
"loss": 0.5042,
"step": 1800
},
{
"epoch": 0.3802639849151477,
"grad_norm": 1.0914040803909302,
"learning_rate": 9.756643731081833e-05,
"loss": 0.5059,
"step": 1815
},
{
"epoch": 0.3834066624764299,
"grad_norm": 1.2371134757995605,
"learning_rate": 9.751388289517704e-05,
"loss": 0.4506,
"step": 1830
},
{
"epoch": 0.38654934003771213,
"grad_norm": 1.0402591228485107,
"learning_rate": 9.746078151343116e-05,
"loss": 0.5535,
"step": 1845
},
{
"epoch": 0.38969201759899436,
"grad_norm": 0.6260209083557129,
"learning_rate": 9.740713377686843e-05,
"loss": 0.4436,
"step": 1860
},
{
"epoch": 0.3928346951602766,
"grad_norm": 0.9588780999183655,
"learning_rate": 9.735294030306611e-05,
"loss": 0.5573,
"step": 1875
},
{
"epoch": 0.39597737272155875,
"grad_norm": 1.0838474035263062,
"learning_rate": 9.729820171588384e-05,
"loss": 0.4627,
"step": 1890
},
{
"epoch": 0.399120050282841,
"grad_norm": 1.0682798624038696,
"learning_rate": 9.724291864545643e-05,
"loss": 0.4893,
"step": 1905
},
{
"epoch": 0.4022627278441232,
"grad_norm": 0.9129301309585571,
"learning_rate": 9.718709172818661e-05,
"loss": 0.4898,
"step": 1920
},
{
"epoch": 0.40540540540540543,
"grad_norm": 1.0116883516311646,
"learning_rate": 9.713072160673777e-05,
"loss": 0.4615,
"step": 1935
},
{
"epoch": 0.4085480829666876,
"grad_norm": 1.057822823524475,
"learning_rate": 9.707380893002646e-05,
"loss": 0.4899,
"step": 1950
},
{
"epoch": 0.4116907605279698,
"grad_norm": 0.6419869661331177,
"learning_rate": 9.7016354353215e-05,
"loss": 0.4348,
"step": 1965
},
{
"epoch": 0.41483343808925205,
"grad_norm": 0.961713433265686,
"learning_rate": 9.695835853770387e-05,
"loss": 0.4921,
"step": 1980
},
{
"epoch": 0.4179761156505343,
"grad_norm": 0.9473373889923096,
"learning_rate": 9.689982215112417e-05,
"loss": 0.4926,
"step": 1995
},
{
"epoch": 0.42111879321181644,
"grad_norm": 1.2034335136413574,
"learning_rate": 9.684074586732987e-05,
"loss": 0.5042,
"step": 2010
},
{
"epoch": 0.42426147077309867,
"grad_norm": 0.9373855590820312,
"learning_rate": 9.678113036639014e-05,
"loss": 0.5076,
"step": 2025
},
{
"epoch": 0.4274041483343809,
"grad_norm": 1.016756296157837,
"learning_rate": 9.672097633458136e-05,
"loss": 0.4805,
"step": 2040
},
{
"epoch": 0.4305468258956631,
"grad_norm": 0.7454690337181091,
"learning_rate": 9.666028446437942e-05,
"loss": 0.5382,
"step": 2055
},
{
"epoch": 0.4336895034569453,
"grad_norm": 0.8196286559104919,
"learning_rate": 9.659905545445159e-05,
"loss": 0.4613,
"step": 2070
},
{
"epoch": 0.4368321810182275,
"grad_norm": 0.9132091403007507,
"learning_rate": 9.653729000964857e-05,
"loss": 0.4595,
"step": 2085
},
{
"epoch": 0.43997485857950974,
"grad_norm": 0.8063992857933044,
"learning_rate": 9.647498884099633e-05,
"loss": 0.4139,
"step": 2100
},
{
"epoch": 0.44311753614079197,
"grad_norm": 0.9756997227668762,
"learning_rate": 9.641215266568794e-05,
"loss": 0.3941,
"step": 2115
},
{
"epoch": 0.4462602137020742,
"grad_norm": 0.6542510390281677,
"learning_rate": 9.634878220707531e-05,
"loss": 0.4768,
"step": 2130
},
{
"epoch": 0.44940289126335636,
"grad_norm": 0.9039008617401123,
"learning_rate": 9.628487819466086e-05,
"loss": 0.4248,
"step": 2145
},
{
"epoch": 0.4525455688246386,
"grad_norm": 1.1151047945022583,
"learning_rate": 9.622044136408914e-05,
"loss": 0.5041,
"step": 2160
},
{
"epoch": 0.4556882463859208,
"grad_norm": 0.8580663800239563,
"learning_rate": 9.615547245713836e-05,
"loss": 0.4766,
"step": 2175
},
{
"epoch": 0.45883092394720304,
"grad_norm": 0.9799042344093323,
"learning_rate": 9.608997222171178e-05,
"loss": 0.4714,
"step": 2190
},
{
"epoch": 0.4619736015084852,
"grad_norm": 0.8485172986984253,
"learning_rate": 9.602394141182927e-05,
"loss": 0.4556,
"step": 2205
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.9632934927940369,
"learning_rate": 9.595738078761837e-05,
"loss": 0.4791,
"step": 2220
},
{
"epoch": 0.46825895663104966,
"grad_norm": 0.8843478560447693,
"learning_rate": 9.589029111530586e-05,
"loss": 0.4603,
"step": 2235
},
{
"epoch": 0.4714016341923319,
"grad_norm": 1.1230348348617554,
"learning_rate": 9.582267316720861e-05,
"loss": 0.491,
"step": 2250
},
{
"epoch": 0.47454431175361406,
"grad_norm": 0.8234013915061951,
"learning_rate": 9.575452772172495e-05,
"loss": 0.44,
"step": 2265
},
{
"epoch": 0.4776869893148963,
"grad_norm": 0.6838919520378113,
"learning_rate": 9.568585556332559e-05,
"loss": 0.4456,
"step": 2280
},
{
"epoch": 0.4808296668761785,
"grad_norm": 0.8424423336982727,
"learning_rate": 9.561665748254456e-05,
"loss": 0.4556,
"step": 2295
},
{
"epoch": 0.48397234443746073,
"grad_norm": 0.6735498905181885,
"learning_rate": 9.554693427597024e-05,
"loss": 0.5184,
"step": 2310
},
{
"epoch": 0.4871150219987429,
"grad_norm": 0.8868768811225891,
"learning_rate": 9.5476686746236e-05,
"loss": 0.5403,
"step": 2325
},
{
"epoch": 0.49025769956002513,
"grad_norm": 0.9957670569419861,
"learning_rate": 9.540591570201116e-05,
"loss": 0.4997,
"step": 2340
},
{
"epoch": 0.49340037712130735,
"grad_norm": 0.76320481300354,
"learning_rate": 9.533462195799157e-05,
"loss": 0.4534,
"step": 2355
},
{
"epoch": 0.4965430546825896,
"grad_norm": 0.8841500282287598,
"learning_rate": 9.526280633489018e-05,
"loss": 0.4724,
"step": 2370
},
{
"epoch": 0.4996857322438718,
"grad_norm": 0.8852142095565796,
"learning_rate": 9.519046965942776e-05,
"loss": 0.4655,
"step": 2385
},
{
"epoch": 0.502828409805154,
"grad_norm": 0.839430570602417,
"learning_rate": 9.511761276432321e-05,
"loss": 0.4386,
"step": 2400
},
{
"epoch": 0.5059710873664363,
"grad_norm": 0.7581266760826111,
"learning_rate": 9.50442364882841e-05,
"loss": 0.4774,
"step": 2415
},
{
"epoch": 0.5091137649277184,
"grad_norm": 0.8754017949104309,
"learning_rate": 9.497034167599691e-05,
"loss": 0.4744,
"step": 2430
},
{
"epoch": 0.5122564424890006,
"grad_norm": 0.9099476337432861,
"learning_rate": 9.48959291781174e-05,
"loss": 0.4292,
"step": 2445
},
{
"epoch": 0.5153991200502829,
"grad_norm": 0.9721155166625977,
"learning_rate": 9.482099985126079e-05,
"loss": 0.4137,
"step": 2460
},
{
"epoch": 0.518541797611565,
"grad_norm": 0.8385334014892578,
"learning_rate": 9.474555455799181e-05,
"loss": 0.471,
"step": 2475
},
{
"epoch": 0.5216844751728472,
"grad_norm": 0.9853966236114502,
"learning_rate": 9.466959416681495e-05,
"loss": 0.4233,
"step": 2490
},
{
"epoch": 0.5248271527341295,
"grad_norm": 1.1044224500656128,
"learning_rate": 9.459311955216428e-05,
"loss": 0.5188,
"step": 2505
},
{
"epoch": 0.5279698302954117,
"grad_norm": 0.870677649974823,
"learning_rate": 9.451613159439349e-05,
"loss": 0.4676,
"step": 2520
},
{
"epoch": 0.531112507856694,
"grad_norm": 0.8571140170097351,
"learning_rate": 9.443863117976573e-05,
"loss": 0.4863,
"step": 2535
},
{
"epoch": 0.5342551854179761,
"grad_norm": 1.0573495626449585,
"learning_rate": 9.436061920044341e-05,
"loss": 0.5057,
"step": 2550
},
{
"epoch": 0.5373978629792583,
"grad_norm": 0.9805963635444641,
"learning_rate": 9.42820965544779e-05,
"loss": 0.468,
"step": 2565
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.8198602199554443,
"learning_rate": 9.420306414579925e-05,
"loss": 0.5054,
"step": 2580
},
{
"epoch": 0.5436832181018227,
"grad_norm": 0.9718137979507446,
"learning_rate": 9.412352288420572e-05,
"loss": 0.4824,
"step": 2595
},
{
"epoch": 0.5468258956631049,
"grad_norm": 1.0223153829574585,
"learning_rate": 9.404347368535337e-05,
"loss": 0.4502,
"step": 2610
},
{
"epoch": 0.5499685732243872,
"grad_norm": 0.9398010969161987,
"learning_rate": 9.396291747074547e-05,
"loss": 0.4761,
"step": 2625
},
{
"epoch": 0.5531112507856694,
"grad_norm": 0.9091777801513672,
"learning_rate": 9.38818551677219e-05,
"loss": 0.4033,
"step": 2640
},
{
"epoch": 0.5562539283469516,
"grad_norm": 1.06580650806427,
"learning_rate": 9.380028770944849e-05,
"loss": 0.4052,
"step": 2655
},
{
"epoch": 0.5593966059082338,
"grad_norm": 0.7236329913139343,
"learning_rate": 9.371821603490627e-05,
"loss": 0.4677,
"step": 2670
},
{
"epoch": 0.562539283469516,
"grad_norm": 0.8263210654258728,
"learning_rate": 9.363564108888069e-05,
"loss": 0.4576,
"step": 2685
},
{
"epoch": 0.5656819610307983,
"grad_norm": 1.022448182106018,
"learning_rate": 9.355256382195068e-05,
"loss": 0.4963,
"step": 2700
},
{
"epoch": 0.5688246385920804,
"grad_norm": 0.9639766812324524,
"learning_rate": 9.346898519047775e-05,
"loss": 0.4113,
"step": 2715
},
{
"epoch": 0.5719673161533627,
"grad_norm": 1.1044561862945557,
"learning_rate": 9.338490615659499e-05,
"loss": 0.5023,
"step": 2730
},
{
"epoch": 0.5751099937146449,
"grad_norm": 0.8272239565849304,
"learning_rate": 9.330032768819596e-05,
"loss": 0.4699,
"step": 2745
},
{
"epoch": 0.578252671275927,
"grad_norm": 0.7692523002624512,
"learning_rate": 9.321525075892356e-05,
"loss": 0.4292,
"step": 2760
},
{
"epoch": 0.5813953488372093,
"grad_norm": 0.9032982587814331,
"learning_rate": 9.312967634815888e-05,
"loss": 0.4432,
"step": 2775
},
{
"epoch": 0.5845380263984915,
"grad_norm": 0.7676737904548645,
"learning_rate": 9.304360544100982e-05,
"loss": 0.4311,
"step": 2790
},
{
"epoch": 0.5876807039597737,
"grad_norm": 0.9019532799720764,
"learning_rate": 9.29570390282998e-05,
"loss": 0.4464,
"step": 2805
},
{
"epoch": 0.590823381521056,
"grad_norm": 0.9738386869430542,
"learning_rate": 9.286997810655638e-05,
"loss": 0.5019,
"step": 2820
},
{
"epoch": 0.5939660590823381,
"grad_norm": 0.7886769771575928,
"learning_rate": 9.278242367799978e-05,
"loss": 0.4919,
"step": 2835
},
{
"epoch": 0.5971087366436204,
"grad_norm": 0.9002622365951538,
"learning_rate": 9.269437675053129e-05,
"loss": 0.4695,
"step": 2850
},
{
"epoch": 0.6002514142049026,
"grad_norm": 0.7023227214813232,
"learning_rate": 9.260583833772172e-05,
"loss": 0.4338,
"step": 2865
},
{
"epoch": 0.6033940917661847,
"grad_norm": 0.9442479014396667,
"learning_rate": 9.251680945879975e-05,
"loss": 0.4907,
"step": 2880
},
{
"epoch": 0.606536769327467,
"grad_norm": 0.6304488778114319,
"learning_rate": 9.24272911386401e-05,
"loss": 0.4612,
"step": 2895
},
{
"epoch": 0.6096794468887492,
"grad_norm": 0.731960117816925,
"learning_rate": 9.233728440775185e-05,
"loss": 0.4207,
"step": 2910
},
{
"epoch": 0.6128221244500315,
"grad_norm": 1.083849549293518,
"learning_rate": 9.224679030226648e-05,
"loss": 0.4775,
"step": 2925
},
{
"epoch": 0.6159648020113137,
"grad_norm": 0.6792687177658081,
"learning_rate": 9.215580986392607e-05,
"loss": 0.4708,
"step": 2940
},
{
"epoch": 0.6191074795725958,
"grad_norm": 0.7582160830497742,
"learning_rate": 9.20643441400711e-05,
"loss": 0.4352,
"step": 2955
},
{
"epoch": 0.6222501571338781,
"grad_norm": 0.7785065174102783,
"learning_rate": 9.197239418362862e-05,
"loss": 0.4199,
"step": 2970
},
{
"epoch": 0.6253928346951603,
"grad_norm": 0.9076778292655945,
"learning_rate": 9.187996105309995e-05,
"loss": 0.4937,
"step": 2985
},
{
"epoch": 0.6285355122564424,
"grad_norm": 0.9189762473106384,
"learning_rate": 9.178704581254865e-05,
"loss": 0.4553,
"step": 3000
},
{
"epoch": 0.6316781898177247,
"grad_norm": 0.8485803008079529,
"learning_rate": 9.169364953158812e-05,
"loss": 0.4799,
"step": 3015
},
{
"epoch": 0.6348208673790069,
"grad_norm": 0.8296557068824768,
"learning_rate": 9.15997732853694e-05,
"loss": 0.4799,
"step": 3030
},
{
"epoch": 0.6379635449402892,
"grad_norm": 0.9346463680267334,
"learning_rate": 9.150541815456874e-05,
"loss": 0.4707,
"step": 3045
},
{
"epoch": 0.6411062225015713,
"grad_norm": 1.0045510530471802,
"learning_rate": 9.141058522537515e-05,
"loss": 0.5216,
"step": 3060
},
{
"epoch": 0.6442489000628535,
"grad_norm": 0.5840141773223877,
"learning_rate": 9.131527558947796e-05,
"loss": 0.429,
"step": 3075
},
{
"epoch": 0.6473915776241358,
"grad_norm": 0.8743481040000916,
"learning_rate": 9.121949034405417e-05,
"loss": 0.4734,
"step": 3090
},
{
"epoch": 0.650534255185418,
"grad_norm": 0.9631288051605225,
"learning_rate": 9.112323059175588e-05,
"loss": 0.4856,
"step": 3105
},
{
"epoch": 0.6536769327467001,
"grad_norm": 0.7583104372024536,
"learning_rate": 9.102649744069758e-05,
"loss": 0.4428,
"step": 3120
},
{
"epoch": 0.6568196103079824,
"grad_norm": 0.9227087497711182,
"learning_rate": 9.092929200444337e-05,
"loss": 0.4622,
"step": 3135
},
{
"epoch": 0.6599622878692646,
"grad_norm": 0.720124363899231,
"learning_rate": 9.083161540199417e-05,
"loss": 0.4136,
"step": 3150
},
{
"epoch": 0.6631049654305469,
"grad_norm": 0.6481117010116577,
"learning_rate": 9.073346875777487e-05,
"loss": 0.5445,
"step": 3165
},
{
"epoch": 0.666247642991829,
"grad_norm": 0.6970652937889099,
"learning_rate": 9.063485320162126e-05,
"loss": 0.4247,
"step": 3180
},
{
"epoch": 0.6693903205531112,
"grad_norm": 0.5132230520248413,
"learning_rate": 9.053576986876718e-05,
"loss": 0.4415,
"step": 3195
},
{
"epoch": 0.6725329981143935,
"grad_norm": 0.7673790454864502,
"learning_rate": 9.043621989983135e-05,
"loss": 0.5188,
"step": 3210
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.8441967368125916,
"learning_rate": 9.033620444080428e-05,
"loss": 0.4343,
"step": 3225
},
{
"epoch": 0.6788183532369579,
"grad_norm": 0.8746171593666077,
"learning_rate": 9.023572464303506e-05,
"loss": 0.4114,
"step": 3240
},
{
"epoch": 0.6819610307982401,
"grad_norm": 0.7494221925735474,
"learning_rate": 9.013478166321812e-05,
"loss": 0.4334,
"step": 3255
},
{
"epoch": 0.6851037083595223,
"grad_norm": 0.7263948917388916,
"learning_rate": 9.00333766633799e-05,
"loss": 0.4322,
"step": 3270
},
{
"epoch": 0.6882463859208046,
"grad_norm": 0.852172315120697,
"learning_rate": 8.99315108108655e-05,
"loss": 0.4506,
"step": 3285
},
{
"epoch": 0.6913890634820867,
"grad_norm": 0.7959320545196533,
"learning_rate": 8.98291852783252e-05,
"loss": 0.4456,
"step": 3300
},
{
"epoch": 0.6945317410433689,
"grad_norm": 0.5918748378753662,
"learning_rate": 8.9726401243701e-05,
"loss": 0.4181,
"step": 3315
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.9726805090904236,
"learning_rate": 8.962315989021304e-05,
"loss": 0.4964,
"step": 3330
},
{
"epoch": 0.7008170961659334,
"grad_norm": 0.8826568126678467,
"learning_rate": 8.951946240634596e-05,
"loss": 0.4702,
"step": 3345
},
{
"epoch": 0.7039597737272156,
"grad_norm": 0.7354099154472351,
"learning_rate": 8.941530998583527e-05,
"loss": 0.4258,
"step": 3360
},
{
"epoch": 0.7071024512884978,
"grad_norm": 0.9217835664749146,
"learning_rate": 8.931070382765359e-05,
"loss": 0.5185,
"step": 3375
},
{
"epoch": 0.71024512884978,
"grad_norm": 0.7444872260093689,
"learning_rate": 8.920564513599679e-05,
"loss": 0.4534,
"step": 3390
},
{
"epoch": 0.7133878064110623,
"grad_norm": 0.7847276926040649,
"learning_rate": 8.910013512027022e-05,
"loss": 0.4232,
"step": 3405
},
{
"epoch": 0.7165304839723444,
"grad_norm": 0.8024355173110962,
"learning_rate": 8.899417499507471e-05,
"loss": 0.4579,
"step": 3420
},
{
"epoch": 0.7196731615336267,
"grad_norm": 0.7088613510131836,
"learning_rate": 8.888776598019266e-05,
"loss": 0.4437,
"step": 3435
},
{
"epoch": 0.7228158390949089,
"grad_norm": 0.6009235382080078,
"learning_rate": 8.87809093005739e-05,
"loss": 0.397,
"step": 3450
},
{
"epoch": 0.725958516656191,
"grad_norm": 0.8743120431900024,
"learning_rate": 8.867360618632172e-05,
"loss": 0.5056,
"step": 3465
},
{
"epoch": 0.7291011942174733,
"grad_norm": 0.899148166179657,
"learning_rate": 8.856585787267856e-05,
"loss": 0.4521,
"step": 3480
},
{
"epoch": 0.7322438717787555,
"grad_norm": 0.8690171837806702,
"learning_rate": 8.845766560001193e-05,
"loss": 0.4708,
"step": 3495
},
{
"epoch": 0.7353865493400377,
"grad_norm": 0.9699186682701111,
"learning_rate": 8.834903061380002e-05,
"loss": 0.4534,
"step": 3510
},
{
"epoch": 0.73852922690132,
"grad_norm": 0.8577262163162231,
"learning_rate": 8.823995416461744e-05,
"loss": 0.4096,
"step": 3525
},
{
"epoch": 0.7416719044626021,
"grad_norm": 0.7458922266960144,
"learning_rate": 8.81304375081208e-05,
"loss": 0.46,
"step": 3540
},
{
"epoch": 0.7448145820238844,
"grad_norm": 0.7347140908241272,
"learning_rate": 8.802048190503423e-05,
"loss": 0.4684,
"step": 3555
},
{
"epoch": 0.7479572595851666,
"grad_norm": 0.7161451578140259,
"learning_rate": 8.79100886211349e-05,
"loss": 0.4715,
"step": 3570
},
{
"epoch": 0.7510999371464487,
"grad_norm": 0.8321588039398193,
"learning_rate": 8.779925892723842e-05,
"loss": 0.3598,
"step": 3585
},
{
"epoch": 0.754242614707731,
"grad_norm": 0.9462142586708069,
"learning_rate": 8.768799409918423e-05,
"loss": 0.4404,
"step": 3600
},
{
"epoch": 0.7573852922690132,
"grad_norm": 0.6842710971832275,
"learning_rate": 8.75762954178209e-05,
"loss": 0.4648,
"step": 3615
},
{
"epoch": 0.7605279698302954,
"grad_norm": 0.8573241829872131,
"learning_rate": 8.746416416899145e-05,
"loss": 0.4592,
"step": 3630
},
{
"epoch": 0.7636706473915776,
"grad_norm": 0.751291811466217,
"learning_rate": 8.735160164351841e-05,
"loss": 0.5319,
"step": 3645
},
{
"epoch": 0.7668133249528598,
"grad_norm": 0.731086790561676,
"learning_rate": 8.72386091371891e-05,
"loss": 0.4629,
"step": 3660
},
{
"epoch": 0.7699560025141421,
"grad_norm": 0.9289976358413696,
"learning_rate": 8.712518795074063e-05,
"loss": 0.4427,
"step": 3675
},
{
"epoch": 0.7730986800754243,
"grad_norm": 0.7036064267158508,
"learning_rate": 8.701133938984496e-05,
"loss": 0.4679,
"step": 3690
},
{
"epoch": 0.7762413576367064,
"grad_norm": 0.778161346912384,
"learning_rate": 8.689706476509385e-05,
"loss": 0.4489,
"step": 3705
},
{
"epoch": 0.7793840351979887,
"grad_norm": 0.8694556951522827,
"learning_rate": 8.678236539198382e-05,
"loss": 0.4048,
"step": 3720
},
{
"epoch": 0.7825267127592709,
"grad_norm": 0.5768362283706665,
"learning_rate": 8.666724259090092e-05,
"loss": 0.4434,
"step": 3735
},
{
"epoch": 0.7856693903205532,
"grad_norm": 0.604917585849762,
"learning_rate": 8.655169768710562e-05,
"loss": 0.4669,
"step": 3750
},
{
"epoch": 0.7888120678818353,
"grad_norm": 0.833985447883606,
"learning_rate": 8.643573201071748e-05,
"loss": 0.4267,
"step": 3765
},
{
"epoch": 0.7919547454431175,
"grad_norm": 0.7951568365097046,
"learning_rate": 8.631934689669992e-05,
"loss": 0.4028,
"step": 3780
},
{
"epoch": 0.7950974230043998,
"grad_norm": 0.7703410983085632,
"learning_rate": 8.620254368484474e-05,
"loss": 0.4153,
"step": 3795
},
{
"epoch": 0.798240100565682,
"grad_norm": 0.8545910716056824,
"learning_rate": 8.608532371975684e-05,
"loss": 0.4949,
"step": 3810
},
{
"epoch": 0.8013827781269641,
"grad_norm": 0.8206099271774292,
"learning_rate": 8.59676883508386e-05,
"loss": 0.4714,
"step": 3825
},
{
"epoch": 0.8045254556882464,
"grad_norm": 0.7841479182243347,
"learning_rate": 8.584963893227442e-05,
"loss": 0.4888,
"step": 3840
},
{
"epoch": 0.8076681332495286,
"grad_norm": 0.7417731285095215,
"learning_rate": 8.573117682301514e-05,
"loss": 0.4951,
"step": 3855
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.9013925194740295,
"learning_rate": 8.561230338676239e-05,
"loss": 0.4542,
"step": 3870
},
{
"epoch": 0.813953488372093,
"grad_norm": 1.2146642208099365,
"learning_rate": 8.549301999195283e-05,
"loss": 0.4606,
"step": 3885
},
{
"epoch": 0.8170961659333752,
"grad_norm": 0.8740483522415161,
"learning_rate": 8.537332801174245e-05,
"loss": 0.4562,
"step": 3900
},
{
"epoch": 0.8202388434946575,
"grad_norm": 0.7769590020179749,
"learning_rate": 8.525322882399082e-05,
"loss": 0.4385,
"step": 3915
},
{
"epoch": 0.8233815210559396,
"grad_norm": 0.7966271042823792,
"learning_rate": 8.513272381124511e-05,
"loss": 0.4011,
"step": 3930
},
{
"epoch": 0.8265241986172219,
"grad_norm": 0.6132526397705078,
"learning_rate": 8.501181436072422e-05,
"loss": 0.393,
"step": 3945
},
{
"epoch": 0.8296668761785041,
"grad_norm": 0.6438138484954834,
"learning_rate": 8.489050186430285e-05,
"loss": 0.4226,
"step": 3960
},
{
"epoch": 0.8328095537397863,
"grad_norm": 0.8362025022506714,
"learning_rate": 8.476878771849545e-05,
"loss": 0.4216,
"step": 3975
},
{
"epoch": 0.8359522313010685,
"grad_norm": 0.770706057548523,
"learning_rate": 8.464667332444012e-05,
"loss": 0.4278,
"step": 3990
},
{
"epoch": 0.8390949088623507,
"grad_norm": 0.8944802284240723,
"learning_rate": 8.452416008788254e-05,
"loss": 0.4609,
"step": 4005
},
{
"epoch": 0.8422375864236329,
"grad_norm": 0.9292035102844238,
"learning_rate": 8.440124941915972e-05,
"loss": 0.4124,
"step": 4020
},
{
"epoch": 0.8453802639849152,
"grad_norm": 0.6450730562210083,
"learning_rate": 8.427794273318377e-05,
"loss": 0.4124,
"step": 4035
},
{
"epoch": 0.8485229415461973,
"grad_norm": 1.0732468366622925,
"learning_rate": 8.415424144942569e-05,
"loss": 0.4678,
"step": 4050
},
{
"epoch": 0.8516656191074796,
"grad_norm": 0.900360107421875,
"learning_rate": 8.403014699189892e-05,
"loss": 0.4299,
"step": 4065
},
{
"epoch": 0.8548082966687618,
"grad_norm": 0.7163972854614258,
"learning_rate": 8.39056607891431e-05,
"loss": 0.4651,
"step": 4080
},
{
"epoch": 0.857950974230044,
"grad_norm": 0.6078224182128906,
"learning_rate": 8.378078427420739e-05,
"loss": 0.4612,
"step": 4095
},
{
"epoch": 0.8610936517913262,
"grad_norm": 0.7975668907165527,
"learning_rate": 8.365551888463423e-05,
"loss": 0.4521,
"step": 4110
},
{
"epoch": 0.8642363293526084,
"grad_norm": 0.7620348930358887,
"learning_rate": 8.352986606244262e-05,
"loss": 0.4527,
"step": 4125
},
{
"epoch": 0.8673790069138906,
"grad_norm": 0.7811437249183655,
"learning_rate": 8.340382725411155e-05,
"loss": 0.4639,
"step": 4140
},
{
"epoch": 0.8705216844751729,
"grad_norm": 0.46538805961608887,
"learning_rate": 8.327740391056343e-05,
"loss": 0.3793,
"step": 4155
},
{
"epoch": 0.873664362036455,
"grad_norm": 0.893225371837616,
"learning_rate": 8.315059748714728e-05,
"loss": 0.4824,
"step": 4170
},
{
"epoch": 0.8768070395977373,
"grad_norm": 0.8325145244598389,
"learning_rate": 8.302340944362205e-05,
"loss": 0.4623,
"step": 4185
},
{
"epoch": 0.8799497171590195,
"grad_norm": 0.7328510880470276,
"learning_rate": 8.289584124413978e-05,
"loss": 0.4075,
"step": 4200
},
{
"epoch": 0.8830923947203017,
"grad_norm": 0.35754507780075073,
"learning_rate": 8.276789435722875e-05,
"loss": 0.3328,
"step": 4215
},
{
"epoch": 0.8862350722815839,
"grad_norm": 0.78349369764328,
"learning_rate": 8.263957025577663e-05,
"loss": 0.4962,
"step": 4230
},
{
"epoch": 0.8893777498428661,
"grad_norm": 0.644481360912323,
"learning_rate": 8.251087041701339e-05,
"loss": 0.3977,
"step": 4245
},
{
"epoch": 0.8925204274041484,
"grad_norm": 0.618881344795227,
"learning_rate": 8.238179632249443e-05,
"loss": 0.3967,
"step": 4260
},
{
"epoch": 0.8956631049654306,
"grad_norm": 0.7603642344474792,
"learning_rate": 8.22523494580835e-05,
"loss": 0.4413,
"step": 4275
},
{
"epoch": 0.8988057825267127,
"grad_norm": 0.6301630735397339,
"learning_rate": 8.212253131393549e-05,
"loss": 0.4333,
"step": 4290
},
{
"epoch": 0.901948460087995,
"grad_norm": 0.7729358077049255,
"learning_rate": 8.199234338447942e-05,
"loss": 0.4633,
"step": 4305
},
{
"epoch": 0.9050911376492772,
"grad_norm": 0.9121199250221252,
"learning_rate": 8.186178716840118e-05,
"loss": 0.4411,
"step": 4320
},
{
"epoch": 0.9082338152105593,
"grad_norm": 0.5462374091148376,
"learning_rate": 8.17308641686262e-05,
"loss": 0.4659,
"step": 4335
},
{
"epoch": 0.9113764927718416,
"grad_norm": 0.7599003911018372,
"learning_rate": 8.15995758923023e-05,
"loss": 0.4015,
"step": 4350
},
{
"epoch": 0.9145191703331238,
"grad_norm": 0.8557884693145752,
"learning_rate": 8.14679238507822e-05,
"loss": 0.4574,
"step": 4365
},
{
"epoch": 0.9176618478944061,
"grad_norm": 0.7987812757492065,
"learning_rate": 8.133590955960619e-05,
"loss": 0.4501,
"step": 4380
},
{
"epoch": 0.9208045254556882,
"grad_norm": 0.8603717088699341,
"learning_rate": 8.120353453848471e-05,
"loss": 0.4201,
"step": 4395
},
{
"epoch": 0.9239472030169704,
"grad_norm": 0.7066472768783569,
"learning_rate": 8.107080031128078e-05,
"loss": 0.4035,
"step": 4410
},
{
"epoch": 0.9270898805782527,
"grad_norm": 0.6430373191833496,
"learning_rate": 8.09377084059925e-05,
"loss": 0.4141,
"step": 4425
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.6911259889602661,
"learning_rate": 8.080426035473549e-05,
"loss": 0.4431,
"step": 4440
},
{
"epoch": 0.933375235700817,
"grad_norm": 0.8445611000061035,
"learning_rate": 8.067045769372515e-05,
"loss": 0.4469,
"step": 4455
},
{
"epoch": 0.9365179132620993,
"grad_norm": 0.9317618012428284,
"learning_rate": 8.053630196325914e-05,
"loss": 0.4051,
"step": 4470
},
{
"epoch": 0.9396605908233815,
"grad_norm": 0.8286532163619995,
"learning_rate": 8.040179470769946e-05,
"loss": 0.4158,
"step": 4485
},
{
"epoch": 0.9428032683846638,
"grad_norm": 0.7000495195388794,
"learning_rate": 8.026693747545486e-05,
"loss": 0.4202,
"step": 4500
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.8104173541069031,
"learning_rate": 8.013173181896283e-05,
"loss": 0.4369,
"step": 4515
},
{
"epoch": 0.9490886235072281,
"grad_norm": 0.864750862121582,
"learning_rate": 7.999617929467187e-05,
"loss": 0.4152,
"step": 4530
},
{
"epoch": 0.9522313010685104,
"grad_norm": 0.7788864970207214,
"learning_rate": 7.98602814630235e-05,
"loss": 0.492,
"step": 4545
},
{
"epoch": 0.9553739786297926,
"grad_norm": 0.707156777381897,
"learning_rate": 7.972403988843435e-05,
"loss": 0.4105,
"step": 4560
},
{
"epoch": 0.9585166561910748,
"grad_norm": 0.8454593420028687,
"learning_rate": 7.958745613927809e-05,
"loss": 0.4622,
"step": 4575
},
{
"epoch": 0.961659333752357,
"grad_norm": 0.8026373982429504,
"learning_rate": 7.945053178786744e-05,
"loss": 0.4236,
"step": 4590
},
{
"epoch": 0.9648020113136392,
"grad_norm": 0.786409318447113,
"learning_rate": 7.931326841043596e-05,
"loss": 0.4677,
"step": 4605
},
{
"epoch": 0.9679446888749215,
"grad_norm": 0.5381405353546143,
"learning_rate": 7.917566758712005e-05,
"loss": 0.443,
"step": 4620
},
{
"epoch": 0.9710873664362036,
"grad_norm": 0.6609058380126953,
"learning_rate": 7.903773090194069e-05,
"loss": 0.4573,
"step": 4635
},
{
"epoch": 0.9742300439974858,
"grad_norm": 0.7192760705947876,
"learning_rate": 7.889945994278514e-05,
"loss": 0.4387,
"step": 4650
},
{
"epoch": 0.9773727215587681,
"grad_norm": 0.7502164244651794,
"learning_rate": 7.87608563013888e-05,
"loss": 0.399,
"step": 4665
},
{
"epoch": 0.9805153991200503,
"grad_norm": 0.7829092144966125,
"learning_rate": 7.86219215733168e-05,
"loss": 0.3705,
"step": 4680
},
{
"epoch": 0.9836580766813325,
"grad_norm": 0.791359007358551,
"learning_rate": 7.848265735794558e-05,
"loss": 0.4434,
"step": 4695
},
{
"epoch": 0.9868007542426147,
"grad_norm": 0.7627493739128113,
"learning_rate": 7.834306525844461e-05,
"loss": 0.4496,
"step": 4710
},
{
"epoch": 0.9899434318038969,
"grad_norm": 0.679959237575531,
"learning_rate": 7.820314688175784e-05,
"loss": 0.4815,
"step": 4725
},
{
"epoch": 0.9930861093651792,
"grad_norm": 0.8766529560089111,
"learning_rate": 7.806290383858523e-05,
"loss": 0.4704,
"step": 4740
},
{
"epoch": 0.9962287869264613,
"grad_norm": 1.1642574071884155,
"learning_rate": 7.792233774336423e-05,
"loss": 0.4974,
"step": 4755
},
{
"epoch": 0.9993714644877436,
"grad_norm": 0.7194317579269409,
"learning_rate": 7.778145021425114e-05,
"loss": 0.4423,
"step": 4770
},
{
"epoch": 1.0025141420490258,
"grad_norm": 0.7814803719520569,
"learning_rate": 7.764024287310252e-05,
"loss": 0.4194,
"step": 4785
},
{
"epoch": 1.005656819610308,
"grad_norm": 0.8891781568527222,
"learning_rate": 7.749871734545652e-05,
"loss": 0.3977,
"step": 4800
},
{
"epoch": 1.0087994971715901,
"grad_norm": 0.7444355487823486,
"learning_rate": 7.735687526051418e-05,
"loss": 0.3924,
"step": 4815
},
{
"epoch": 1.0119421747328725,
"grad_norm": 0.9248786568641663,
"learning_rate": 7.721471825112062e-05,
"loss": 0.4273,
"step": 4830
},
{
"epoch": 1.0150848522941547,
"grad_norm": 0.6513450741767883,
"learning_rate": 7.70722479537463e-05,
"loss": 0.3909,
"step": 4845
},
{
"epoch": 1.0182275298554369,
"grad_norm": 0.8597205877304077,
"learning_rate": 7.692946600846818e-05,
"loss": 0.4027,
"step": 4860
},
{
"epoch": 1.021370207416719,
"grad_norm": 0.9086320996284485,
"learning_rate": 7.678637405895076e-05,
"loss": 0.4225,
"step": 4875
},
{
"epoch": 1.0245128849780012,
"grad_norm": 0.8219915628433228,
"learning_rate": 7.66429737524273e-05,
"loss": 0.4055,
"step": 4890
},
{
"epoch": 1.0276555625392834,
"grad_norm": 0.9232605695724487,
"learning_rate": 7.649926673968069e-05,
"loss": 0.3801,
"step": 4905
},
{
"epoch": 1.0307982401005658,
"grad_norm": 0.8866775035858154,
"learning_rate": 7.635525467502462e-05,
"loss": 0.3887,
"step": 4920
},
{
"epoch": 1.033940917661848,
"grad_norm": 0.6395006775856018,
"learning_rate": 7.62109392162844e-05,
"loss": 0.4018,
"step": 4935
},
{
"epoch": 1.03708359522313,
"grad_norm": 0.8276055455207825,
"learning_rate": 7.60663220247779e-05,
"loss": 0.3875,
"step": 4950
},
{
"epoch": 1.0402262727844123,
"grad_norm": 0.8251763582229614,
"learning_rate": 7.592140476529652e-05,
"loss": 0.3912,
"step": 4965
},
{
"epoch": 1.0433689503456944,
"grad_norm": 0.8321304321289062,
"learning_rate": 7.577618910608591e-05,
"loss": 0.4317,
"step": 4980
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.6474670171737671,
"learning_rate": 7.56306767188268e-05,
"loss": 0.4594,
"step": 4995
},
{
"epoch": 1.049654305468259,
"grad_norm": 0.6989348530769348,
"learning_rate": 7.548486927861582e-05,
"loss": 0.3744,
"step": 5010
},
{
"epoch": 1.0527969830295412,
"grad_norm": 0.8184515237808228,
"learning_rate": 7.533876846394613e-05,
"loss": 0.3364,
"step": 5025
},
{
"epoch": 1.0559396605908233,
"grad_norm": 0.7965102195739746,
"learning_rate": 7.519237595668811e-05,
"loss": 0.3934,
"step": 5040
},
{
"epoch": 1.0590823381521055,
"grad_norm": 0.731299638748169,
"learning_rate": 7.504569344207007e-05,
"loss": 0.4161,
"step": 5055
},
{
"epoch": 1.062225015713388,
"grad_norm": 0.9074578881263733,
"learning_rate": 7.489872260865877e-05,
"loss": 0.4103,
"step": 5070
},
{
"epoch": 1.06536769327467,
"grad_norm": 0.8735909461975098,
"learning_rate": 7.475146514834001e-05,
"loss": 0.3686,
"step": 5085
},
{
"epoch": 1.0685103708359522,
"grad_norm": 0.7814076542854309,
"learning_rate": 7.460392275629918e-05,
"loss": 0.3943,
"step": 5100
},
{
"epoch": 1.0716530483972344,
"grad_norm": 0.8307476043701172,
"learning_rate": 7.445609713100171e-05,
"loss": 0.3999,
"step": 5115
},
{
"epoch": 1.0747957259585166,
"grad_norm": 0.7908287048339844,
"learning_rate": 7.430798997417353e-05,
"loss": 0.4104,
"step": 5130
},
{
"epoch": 1.077938403519799,
"grad_norm": 0.8598707914352417,
"learning_rate": 7.415960299078143e-05,
"loss": 0.3976,
"step": 5145
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.5163241028785706,
"learning_rate": 7.40109378890136e-05,
"loss": 0.3506,
"step": 5160
},
{
"epoch": 1.0842237586423633,
"grad_norm": 0.8642787933349609,
"learning_rate": 7.386199638025973e-05,
"loss": 0.31,
"step": 5175
},
{
"epoch": 1.0873664362036455,
"grad_norm": 0.7603743076324463,
"learning_rate": 7.371278017909148e-05,
"loss": 0.4695,
"step": 5190
},
{
"epoch": 1.0905091137649277,
"grad_norm": 0.7949853539466858,
"learning_rate": 7.356329100324273e-05,
"loss": 0.4076,
"step": 5205
},
{
"epoch": 1.0936517913262098,
"grad_norm": 0.8560110926628113,
"learning_rate": 7.341353057358966e-05,
"loss": 0.3833,
"step": 5220
},
{
"epoch": 1.0967944688874922,
"grad_norm": 0.632763147354126,
"learning_rate": 7.326350061413114e-05,
"loss": 0.4128,
"step": 5235
},
{
"epoch": 1.0999371464487744,
"grad_norm": 0.9416031837463379,
"learning_rate": 7.311320285196875e-05,
"loss": 0.3665,
"step": 5250
},
{
"epoch": 1.1030798240100566,
"grad_norm": 0.6195524334907532,
"learning_rate": 7.296263901728694e-05,
"loss": 0.362,
"step": 5265
},
{
"epoch": 1.1062225015713387,
"grad_norm": 0.8545498251914978,
"learning_rate": 7.281181084333311e-05,
"loss": 0.361,
"step": 5280
},
{
"epoch": 1.109365179132621,
"grad_norm": 0.75226229429245,
"learning_rate": 7.26607200663977e-05,
"loss": 0.3948,
"step": 5295
},
{
"epoch": 1.1125078566939033,
"grad_norm": 0.877756655216217,
"learning_rate": 7.250936842579407e-05,
"loss": 0.4061,
"step": 5310
},
{
"epoch": 1.1156505342551855,
"grad_norm": 0.5953283309936523,
"learning_rate": 7.235775766383862e-05,
"loss": 0.3273,
"step": 5325
},
{
"epoch": 1.1187932118164676,
"grad_norm": 0.8206706643104553,
"learning_rate": 7.220588952583071e-05,
"loss": 0.3757,
"step": 5340
},
{
"epoch": 1.1219358893777498,
"grad_norm": 0.7466344237327576,
"learning_rate": 7.205376576003247e-05,
"loss": 0.3892,
"step": 5355
},
{
"epoch": 1.125078566939032,
"grad_norm": 0.8034494519233704,
"learning_rate": 7.190138811764882e-05,
"loss": 0.4043,
"step": 5370
},
{
"epoch": 1.1282212445003144,
"grad_norm": 0.9050668478012085,
"learning_rate": 7.174875835280716e-05,
"loss": 0.3812,
"step": 5385
},
{
"epoch": 1.1313639220615965,
"grad_norm": 0.8540876507759094,
"learning_rate": 7.159587822253733e-05,
"loss": 0.3645,
"step": 5400
},
{
"epoch": 1.1345065996228787,
"grad_norm": 0.7688354849815369,
"learning_rate": 7.14427494867512e-05,
"loss": 0.3683,
"step": 5415
},
{
"epoch": 1.1376492771841609,
"grad_norm": 0.6950829029083252,
"learning_rate": 7.128937390822261e-05,
"loss": 0.3347,
"step": 5430
},
{
"epoch": 1.140791954745443,
"grad_norm": 0.8212427496910095,
"learning_rate": 7.113575325256694e-05,
"loss": 0.3775,
"step": 5445
},
{
"epoch": 1.1439346323067254,
"grad_norm": 0.8312988877296448,
"learning_rate": 7.098188928822084e-05,
"loss": 0.4325,
"step": 5460
},
{
"epoch": 1.1470773098680076,
"grad_norm": 0.9646623134613037,
"learning_rate": 7.082778378642184e-05,
"loss": 0.3898,
"step": 5475
},
{
"epoch": 1.1502199874292898,
"grad_norm": 0.8333424925804138,
"learning_rate": 7.0673438521188e-05,
"loss": 0.4068,
"step": 5490
},
{
"epoch": 1.153362664990572,
"grad_norm": 0.918892502784729,
"learning_rate": 7.051885526929747e-05,
"loss": 0.3968,
"step": 5505
},
{
"epoch": 1.156505342551854,
"grad_norm": 0.5460782647132874,
"learning_rate": 7.0364035810268e-05,
"loss": 0.3672,
"step": 5520
},
{
"epoch": 1.1596480201131363,
"grad_norm": 0.876811683177948,
"learning_rate": 7.020898192633655e-05,
"loss": 0.408,
"step": 5535
},
{
"epoch": 1.1627906976744187,
"grad_norm": 0.6740222573280334,
"learning_rate": 7.005369540243864e-05,
"loss": 0.2995,
"step": 5550
},
{
"epoch": 1.1659333752357008,
"grad_norm": 0.8702965378761292,
"learning_rate": 6.989817802618792e-05,
"loss": 0.3307,
"step": 5565
},
{
"epoch": 1.169076052796983,
"grad_norm": 0.8837511539459229,
"learning_rate": 6.974243158785554e-05,
"loss": 0.3864,
"step": 5580
},
{
"epoch": 1.1722187303582652,
"grad_norm": 0.4050454795360565,
"learning_rate": 6.958645788034952e-05,
"loss": 0.3525,
"step": 5595
},
{
"epoch": 1.1753614079195476,
"grad_norm": 0.8361005187034607,
"learning_rate": 6.943025869919418e-05,
"loss": 0.3747,
"step": 5610
},
{
"epoch": 1.1785040854808297,
"grad_norm": 0.841556191444397,
"learning_rate": 6.92738358425094e-05,
"loss": 0.406,
"step": 5625
},
{
"epoch": 1.181646763042112,
"grad_norm": 0.629443883895874,
"learning_rate": 6.911719111098996e-05,
"loss": 0.4175,
"step": 5640
},
{
"epoch": 1.184789440603394,
"grad_norm": 0.7146449685096741,
"learning_rate": 6.896032630788476e-05,
"loss": 0.3511,
"step": 5655
},
{
"epoch": 1.1879321181646763,
"grad_norm": 0.8358393311500549,
"learning_rate": 6.880324323897617e-05,
"loss": 0.3851,
"step": 5670
},
{
"epoch": 1.1910747957259584,
"grad_norm": 0.742857813835144,
"learning_rate": 6.864594371255913e-05,
"loss": 0.3821,
"step": 5685
},
{
"epoch": 1.1942174732872408,
"grad_norm": 0.7099196910858154,
"learning_rate": 6.848842953942036e-05,
"loss": 0.3789,
"step": 5700
},
{
"epoch": 1.197360150848523,
"grad_norm": 0.754542350769043,
"learning_rate": 6.83307025328176e-05,
"loss": 0.3472,
"step": 5715
},
{
"epoch": 1.2005028284098052,
"grad_norm": 0.7466986775398254,
"learning_rate": 6.817276450845856e-05,
"loss": 0.3393,
"step": 5730
},
{
"epoch": 1.2036455059710873,
"grad_norm": 0.7026840448379517,
"learning_rate": 6.801461728448022e-05,
"loss": 0.3891,
"step": 5745
},
{
"epoch": 1.2067881835323695,
"grad_norm": 1.1348669528961182,
"learning_rate": 6.785626268142777e-05,
"loss": 0.3802,
"step": 5760
},
{
"epoch": 1.2099308610936519,
"grad_norm": 0.7511578798294067,
"learning_rate": 6.769770252223369e-05,
"loss": 0.4252,
"step": 5775
},
{
"epoch": 1.213073538654934,
"grad_norm": 0.8412914276123047,
"learning_rate": 6.753893863219675e-05,
"loss": 0.3813,
"step": 5790
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.8765383958816528,
"learning_rate": 6.737997283896103e-05,
"loss": 0.3712,
"step": 5805
},
{
"epoch": 1.2193588937774984,
"grad_norm": 0.7843053340911865,
"learning_rate": 6.722080697249487e-05,
"loss": 0.3776,
"step": 5820
},
{
"epoch": 1.2225015713387806,
"grad_norm": 1.0745536088943481,
"learning_rate": 6.706144286506978e-05,
"loss": 0.3499,
"step": 5835
},
{
"epoch": 1.2256442489000627,
"grad_norm": 0.7722020745277405,
"learning_rate": 6.690188235123934e-05,
"loss": 0.4211,
"step": 5850
},
{
"epoch": 1.2287869264613451,
"grad_norm": 0.9631087183952332,
"learning_rate": 6.674212726781814e-05,
"loss": 0.3772,
"step": 5865
},
{
"epoch": 1.2319296040226273,
"grad_norm": 0.8981698751449585,
"learning_rate": 6.65821794538606e-05,
"loss": 0.4598,
"step": 5880
},
{
"epoch": 1.2350722815839095,
"grad_norm": 0.778362512588501,
"learning_rate": 6.642204075063974e-05,
"loss": 0.4179,
"step": 5895
},
{
"epoch": 1.2382149591451916,
"grad_norm": 0.8421118259429932,
"learning_rate": 6.626171300162615e-05,
"loss": 0.3583,
"step": 5910
},
{
"epoch": 1.241357636706474,
"grad_norm": 1.0227240324020386,
"learning_rate": 6.610119805246653e-05,
"loss": 0.3919,
"step": 5925
},
{
"epoch": 1.2445003142677562,
"grad_norm": 0.5748106837272644,
"learning_rate": 6.594049775096268e-05,
"loss": 0.3571,
"step": 5940
},
{
"epoch": 1.2476429918290384,
"grad_norm": 0.6924661993980408,
"learning_rate": 6.577961394705008e-05,
"loss": 0.3812,
"step": 5955
},
{
"epoch": 1.2507856693903205,
"grad_norm": 0.7702043056488037,
"learning_rate": 6.561854849277664e-05,
"loss": 0.331,
"step": 5970
},
{
"epoch": 1.2539283469516027,
"grad_norm": 0.6666329503059387,
"learning_rate": 6.545730324228136e-05,
"loss": 0.3266,
"step": 5985
},
{
"epoch": 1.2570710245128849,
"grad_norm": 0.9120034575462341,
"learning_rate": 6.529588005177305e-05,
"loss": 0.4188,
"step": 6000
},
{
"epoch": 1.260213702074167,
"grad_norm": 0.7251651287078857,
"learning_rate": 6.513428077950886e-05,
"loss": 0.4067,
"step": 6015
},
{
"epoch": 1.2633563796354494,
"grad_norm": 0.6845729947090149,
"learning_rate": 6.497250728577296e-05,
"loss": 0.4266,
"step": 6030
},
{
"epoch": 1.2664990571967316,
"grad_norm": 0.7530787587165833,
"learning_rate": 6.481056143285512e-05,
"loss": 0.3302,
"step": 6045
},
{
"epoch": 1.2696417347580138,
"grad_norm": 0.7474608421325684,
"learning_rate": 6.464844508502927e-05,
"loss": 0.4305,
"step": 6060
},
{
"epoch": 1.2727844123192962,
"grad_norm": 0.8672669529914856,
"learning_rate": 6.448616010853199e-05,
"loss": 0.4267,
"step": 6075
},
{
"epoch": 1.2759270898805783,
"grad_norm": 0.7703887224197388,
"learning_rate": 6.432370837154109e-05,
"loss": 0.3531,
"step": 6090
},
{
"epoch": 1.2790697674418605,
"grad_norm": 0.7432886958122253,
"learning_rate": 6.416109174415406e-05,
"loss": 0.3189,
"step": 6105
},
{
"epoch": 1.2822124450031427,
"grad_norm": 0.9600912928581238,
"learning_rate": 6.399831209836659e-05,
"loss": 0.4036,
"step": 6120
},
{
"epoch": 1.2853551225644249,
"grad_norm": 0.7727882862091064,
"learning_rate": 6.383537130805098e-05,
"loss": 0.3857,
"step": 6135
},
{
"epoch": 1.288497800125707,
"grad_norm": 0.7628008723258972,
"learning_rate": 6.367227124893455e-05,
"loss": 0.4229,
"step": 6150
},
{
"epoch": 1.2916404776869892,
"grad_norm": 0.9682219624519348,
"learning_rate": 6.350901379857814e-05,
"loss": 0.3544,
"step": 6165
},
{
"epoch": 1.2947831552482716,
"grad_norm": 0.7553837895393372,
"learning_rate": 6.334560083635434e-05,
"loss": 0.3968,
"step": 6180
},
{
"epoch": 1.2979258328095538,
"grad_norm": 0.7951422333717346,
"learning_rate": 6.318203424342605e-05,
"loss": 0.2946,
"step": 6195
},
{
"epoch": 1.301068510370836,
"grad_norm": 0.9351706504821777,
"learning_rate": 6.301831590272465e-05,
"loss": 0.4203,
"step": 6210
},
{
"epoch": 1.304211187932118,
"grad_norm": 0.8283166289329529,
"learning_rate": 6.28544476989284e-05,
"loss": 0.4166,
"step": 6225
},
{
"epoch": 1.3073538654934005,
"grad_norm": 0.7889246940612793,
"learning_rate": 6.269043151844081e-05,
"loss": 0.4084,
"step": 6240
},
{
"epoch": 1.3104965430546827,
"grad_norm": 0.7893148064613342,
"learning_rate": 6.252626924936876e-05,
"loss": 0.3327,
"step": 6255
},
{
"epoch": 1.3136392206159648,
"grad_norm": 0.9599968194961548,
"learning_rate": 6.236196278150092e-05,
"loss": 0.3987,
"step": 6270
},
{
"epoch": 1.316781898177247,
"grad_norm": 0.7326962351799011,
"learning_rate": 6.219751400628593e-05,
"loss": 0.3872,
"step": 6285
},
{
"epoch": 1.3199245757385292,
"grad_norm": 0.7666275501251221,
"learning_rate": 6.203292481681061e-05,
"loss": 0.2906,
"step": 6300
},
{
"epoch": 1.3230672532998113,
"grad_norm": 0.7648006081581116,
"learning_rate": 6.186819710777819e-05,
"loss": 0.4077,
"step": 6315
},
{
"epoch": 1.3262099308610937,
"grad_norm": 0.8993086218833923,
"learning_rate": 6.170333277548653e-05,
"loss": 0.3334,
"step": 6330
},
{
"epoch": 1.329352608422376,
"grad_norm": 0.8966405987739563,
"learning_rate": 6.153833371780622e-05,
"loss": 0.3772,
"step": 6345
},
{
"epoch": 1.332495285983658,
"grad_norm": 0.955697774887085,
"learning_rate": 6.137320183415877e-05,
"loss": 0.3652,
"step": 6360
},
{
"epoch": 1.3356379635449402,
"grad_norm": 0.913931667804718,
"learning_rate": 6.120793902549478e-05,
"loss": 0.3943,
"step": 6375
},
{
"epoch": 1.3387806411062226,
"grad_norm": 0.471160352230072,
"learning_rate": 6.1042547194272e-05,
"loss": 0.3656,
"step": 6390
},
{
"epoch": 1.3419233186675048,
"grad_norm": 0.7883521914482117,
"learning_rate": 6.0877028244433444e-05,
"loss": 0.3494,
"step": 6405
},
{
"epoch": 1.345065996228787,
"grad_norm": 0.8015203475952148,
"learning_rate": 6.071138408138547e-05,
"loss": 0.3498,
"step": 6420
},
{
"epoch": 1.3482086737900691,
"grad_norm": 0.8431302905082703,
"learning_rate": 6.0545616611975886e-05,
"loss": 0.3726,
"step": 6435
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.6410717964172363,
"learning_rate": 6.0379727744471936e-05,
"loss": 0.3793,
"step": 6450
},
{
"epoch": 1.3544940289126335,
"grad_norm": 0.8410218358039856,
"learning_rate": 6.021371938853839e-05,
"loss": 0.4294,
"step": 6465
},
{
"epoch": 1.3576367064739157,
"grad_norm": 0.622178852558136,
"learning_rate": 6.004759345521552e-05,
"loss": 0.3373,
"step": 6480
},
{
"epoch": 1.360779384035198,
"grad_norm": 0.8277848362922668,
"learning_rate": 5.988135185689712e-05,
"loss": 0.3796,
"step": 6495
},
{
"epoch": 1.3639220615964802,
"grad_norm": 0.799150824546814,
"learning_rate": 5.9714996507308465e-05,
"loss": 0.3361,
"step": 6510
},
{
"epoch": 1.3670647391577624,
"grad_norm": 0.8518102765083313,
"learning_rate": 5.954852932148433e-05,
"loss": 0.3913,
"step": 6525
},
{
"epoch": 1.3702074167190446,
"grad_norm": 0.7465687990188599,
"learning_rate": 5.9381952215746905e-05,
"loss": 0.3546,
"step": 6540
},
{
"epoch": 1.373350094280327,
"grad_norm": 0.7342978119850159,
"learning_rate": 5.921526710768376e-05,
"loss": 0.3832,
"step": 6555
},
{
"epoch": 1.3764927718416091,
"grad_norm": 0.6754856109619141,
"learning_rate": 5.9048475916125723e-05,
"loss": 0.4051,
"step": 6570
},
{
"epoch": 1.3796354494028913,
"grad_norm": 0.6392863988876343,
"learning_rate": 5.888158056112486e-05,
"loss": 0.3828,
"step": 6585
},
{
"epoch": 1.3827781269641735,
"grad_norm": 0.897132933139801,
"learning_rate": 5.871458296393231e-05,
"loss": 0.405,
"step": 6600
},
{
"epoch": 1.3859208045254556,
"grad_norm": 0.7124328017234802,
"learning_rate": 5.854748504697624e-05,
"loss": 0.3712,
"step": 6615
},
{
"epoch": 1.3890634820867378,
"grad_norm": 0.8436194062232971,
"learning_rate": 5.8380288733839585e-05,
"loss": 0.3773,
"step": 6630
},
{
"epoch": 1.3922061596480202,
"grad_norm": 0.780944287776947,
"learning_rate": 5.8212995949238083e-05,
"loss": 0.3529,
"step": 6645
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.0335406064987183,
"learning_rate": 5.804560861899795e-05,
"loss": 0.4262,
"step": 6660
},
{
"epoch": 1.3984915147705845,
"grad_norm": 0.7593971490859985,
"learning_rate": 5.7878128670033826e-05,
"loss": 0.4079,
"step": 6675
},
{
"epoch": 1.4016341923318667,
"grad_norm": 0.7240027189254761,
"learning_rate": 5.7710558030326545e-05,
"loss": 0.3835,
"step": 6690
},
{
"epoch": 1.404776869893149,
"grad_norm": 1.530868411064148,
"learning_rate": 5.754289862890093e-05,
"loss": 0.4294,
"step": 6705
},
{
"epoch": 1.4079195474544313,
"grad_norm": 0.6043078899383545,
"learning_rate": 5.7375152395803624e-05,
"loss": 0.3343,
"step": 6720
},
{
"epoch": 1.4110622250157134,
"grad_norm": 0.8058659434318542,
"learning_rate": 5.720732126208082e-05,
"loss": 0.4533,
"step": 6735
},
{
"epoch": 1.4142049025769956,
"grad_norm": 0.7185141444206238,
"learning_rate": 5.7039407159756106e-05,
"loss": 0.42,
"step": 6750
},
{
"epoch": 1.4173475801382778,
"grad_norm": 1.0086369514465332,
"learning_rate": 5.687141202180817e-05,
"loss": 0.3701,
"step": 6765
},
{
"epoch": 1.42049025769956,
"grad_norm": 1.0289742946624756,
"learning_rate": 5.67033377821485e-05,
"loss": 0.4565,
"step": 6780
},
{
"epoch": 1.4236329352608421,
"grad_norm": 1.1389039754867554,
"learning_rate": 5.6535186375599266e-05,
"loss": 0.3555,
"step": 6795
},
{
"epoch": 1.4267756128221245,
"grad_norm": 0.887610673904419,
"learning_rate": 5.636695973787093e-05,
"loss": 0.368,
"step": 6810
},
{
"epoch": 1.4299182903834067,
"grad_norm": 0.9625629186630249,
"learning_rate": 5.619865980553994e-05,
"loss": 0.3962,
"step": 6825
},
{
"epoch": 1.4330609679446888,
"grad_norm": 0.8793766498565674,
"learning_rate": 5.6030288516026564e-05,
"loss": 0.3979,
"step": 6840
},
{
"epoch": 1.436203645505971,
"grad_norm": 0.7626388669013977,
"learning_rate": 5.586184780757251e-05,
"loss": 0.345,
"step": 6855
},
{
"epoch": 1.4393463230672534,
"grad_norm": 1.109713077545166,
"learning_rate": 5.5693339619218534e-05,
"loss": 0.4446,
"step": 6870
},
{
"epoch": 1.4424890006285356,
"grad_norm": 0.9758956432342529,
"learning_rate": 5.552476589078231e-05,
"loss": 0.401,
"step": 6885
},
{
"epoch": 1.4456316781898177,
"grad_norm": 0.923329770565033,
"learning_rate": 5.5356128562835904e-05,
"loss": 0.385,
"step": 6900
},
{
"epoch": 1.4487743557511,
"grad_norm": 0.7539265155792236,
"learning_rate": 5.518742957668359e-05,
"loss": 0.3274,
"step": 6915
},
{
"epoch": 1.451917033312382,
"grad_norm": 0.8187793493270874,
"learning_rate": 5.5018670874339386e-05,
"loss": 0.3677,
"step": 6930
},
{
"epoch": 1.4550597108736643,
"grad_norm": 0.9522603750228882,
"learning_rate": 5.484985439850473e-05,
"loss": 0.3319,
"step": 6945
},
{
"epoch": 1.4582023884349467,
"grad_norm": 0.8808611631393433,
"learning_rate": 5.468098209254622e-05,
"loss": 0.4311,
"step": 6960
},
{
"epoch": 1.4613450659962288,
"grad_norm": 0.6949836611747742,
"learning_rate": 5.4512055900473035e-05,
"loss": 0.3679,
"step": 6975
},
{
"epoch": 1.464487743557511,
"grad_norm": 0.783545196056366,
"learning_rate": 5.434307776691479e-05,
"loss": 0.3552,
"step": 6990
},
{
"epoch": 1.4676304211187932,
"grad_norm": 0.8342312574386597,
"learning_rate": 5.417404963709894e-05,
"loss": 0.3755,
"step": 7005
},
{
"epoch": 1.4707730986800756,
"grad_norm": 0.7615540027618408,
"learning_rate": 5.400497345682857e-05,
"loss": 0.3605,
"step": 7020
},
{
"epoch": 1.4739157762413577,
"grad_norm": 0.8944594860076904,
"learning_rate": 5.3835851172459794e-05,
"loss": 0.3948,
"step": 7035
},
{
"epoch": 1.47705845380264,
"grad_norm": 0.8412215113639832,
"learning_rate": 5.36666847308796e-05,
"loss": 0.3658,
"step": 7050
},
{
"epoch": 1.480201131363922,
"grad_norm": 0.8457724452018738,
"learning_rate": 5.34974760794832e-05,
"loss": 0.4327,
"step": 7065
},
{
"epoch": 1.4833438089252042,
"grad_norm": 0.7231891751289368,
"learning_rate": 5.332822716615172e-05,
"loss": 0.3489,
"step": 7080
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.8975026607513428,
"learning_rate": 5.315893993922986e-05,
"loss": 0.331,
"step": 7095
},
{
"epoch": 1.4896291640477686,
"grad_norm": 0.871842086315155,
"learning_rate": 5.2989616347503244e-05,
"loss": 0.4056,
"step": 7110
},
{
"epoch": 1.492771841609051,
"grad_norm": 0.5846161246299744,
"learning_rate": 5.282025834017623e-05,
"loss": 0.381,
"step": 7125
},
{
"epoch": 1.4959145191703331,
"grad_norm": 0.6650387644767761,
"learning_rate": 5.265086786684929e-05,
"loss": 0.34,
"step": 7140
},
{
"epoch": 1.4990571967316153,
"grad_norm": 0.862241804599762,
"learning_rate": 5.2481446877496665e-05,
"loss": 0.354,
"step": 7155
},
{
"epoch": 1.5021998742928977,
"grad_norm": 0.8328828811645508,
"learning_rate": 5.231199732244386e-05,
"loss": 0.3772,
"step": 7170
},
{
"epoch": 1.5053425518541799,
"grad_norm": 0.5438669323921204,
"learning_rate": 5.214252115234527e-05,
"loss": 0.3493,
"step": 7185
},
{
"epoch": 1.508485229415462,
"grad_norm": 0.7722681760787964,
"learning_rate": 5.197302031816165e-05,
"loss": 0.3494,
"step": 7200
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.9693325161933899,
"learning_rate": 5.180349677113762e-05,
"loss": 0.3512,
"step": 7215
},
{
"epoch": 1.5147705845380264,
"grad_norm": 1.0208348035812378,
"learning_rate": 5.163395246277938e-05,
"loss": 0.2772,
"step": 7230
},
{
"epoch": 1.5179132620993085,
"grad_norm": 0.8255509734153748,
"learning_rate": 5.1464389344832024e-05,
"loss": 0.3491,
"step": 7245
},
{
"epoch": 1.5210559396605907,
"grad_norm": 0.723574697971344,
"learning_rate": 5.1294809369257244e-05,
"loss": 0.3894,
"step": 7260
},
{
"epoch": 1.5241986172218729,
"grad_norm": 0.8955418467521667,
"learning_rate": 5.112521448821076e-05,
"loss": 0.3722,
"step": 7275
},
{
"epoch": 1.5273412947831553,
"grad_norm": 0.9446234703063965,
"learning_rate": 5.0955606654019895e-05,
"loss": 0.3602,
"step": 7290
},
{
"epoch": 1.5304839723444374,
"grad_norm": 0.7256786227226257,
"learning_rate": 5.078598781916107e-05,
"loss": 0.3488,
"step": 7305
},
{
"epoch": 1.5336266499057196,
"grad_norm": 0.775834858417511,
"learning_rate": 5.0616359936237355e-05,
"loss": 0.3983,
"step": 7320
},
{
"epoch": 1.536769327467002,
"grad_norm": 0.7684575915336609,
"learning_rate": 5.044672495795598e-05,
"loss": 0.3992,
"step": 7335
},
{
"epoch": 1.5399120050282842,
"grad_norm": 0.7569010853767395,
"learning_rate": 5.0277084837105826e-05,
"loss": 0.352,
"step": 7350
},
{
"epoch": 1.5430546825895664,
"grad_norm": 0.7330282926559448,
"learning_rate": 5.010744152653501e-05,
"loss": 0.3486,
"step": 7365
},
{
"epoch": 1.5461973601508485,
"grad_norm": 0.8921106457710266,
"learning_rate": 4.993779697912837e-05,
"loss": 0.3107,
"step": 7380
},
{
"epoch": 1.5493400377121307,
"grad_norm": 0.7190592288970947,
"learning_rate": 4.976815314778493e-05,
"loss": 0.3429,
"step": 7395
},
{
"epoch": 1.5524827152734129,
"grad_norm": 0.8145999312400818,
"learning_rate": 4.9598511985395535e-05,
"loss": 0.3455,
"step": 7410
},
{
"epoch": 1.555625392834695,
"grad_norm": 0.7628950476646423,
"learning_rate": 4.942887544482029e-05,
"loss": 0.3362,
"step": 7425
},
{
"epoch": 1.5587680703959774,
"grad_norm": 0.5859194993972778,
"learning_rate": 4.925924547886603e-05,
"loss": 0.3723,
"step": 7440
},
{
"epoch": 1.5619107479572596,
"grad_norm": 0.7906526327133179,
"learning_rate": 4.9089624040264013e-05,
"loss": 0.3511,
"step": 7455
},
{
"epoch": 1.5650534255185418,
"grad_norm": 0.7591722011566162,
"learning_rate": 4.892001308164727e-05,
"loss": 0.4439,
"step": 7470
},
{
"epoch": 1.5681961030798242,
"grad_norm": 0.9237760901451111,
"learning_rate": 4.875041455552817e-05,
"loss": 0.3638,
"step": 7485
},
{
"epoch": 1.5713387806411063,
"grad_norm": 0.734752893447876,
"learning_rate": 4.858083041427599e-05,
"loss": 0.4047,
"step": 7500
},
{
"epoch": 1.5744814582023885,
"grad_norm": 0.676703155040741,
"learning_rate": 4.8411262610094445e-05,
"loss": 0.3566,
"step": 7515
},
{
"epoch": 1.5776241357636707,
"grad_norm": 0.8751126527786255,
"learning_rate": 4.824171309499913e-05,
"loss": 0.3743,
"step": 7530
},
{
"epoch": 1.5807668133249528,
"grad_norm": 0.6884835958480835,
"learning_rate": 4.807218382079511e-05,
"loss": 0.3821,
"step": 7545
},
{
"epoch": 1.583909490886235,
"grad_norm": 0.8230961561203003,
"learning_rate": 4.790267673905447e-05,
"loss": 0.3193,
"step": 7560
},
{
"epoch": 1.5870521684475172,
"grad_norm": 0.8046270608901978,
"learning_rate": 4.7733193801093803e-05,
"loss": 0.3714,
"step": 7575
},
{
"epoch": 1.5901948460087993,
"grad_norm": 0.895897626876831,
"learning_rate": 4.756373695795177e-05,
"loss": 0.386,
"step": 7590
},
{
"epoch": 1.5933375235700817,
"grad_norm": 0.8858537077903748,
"learning_rate": 4.7394308160366617e-05,
"loss": 0.3755,
"step": 7605
},
{
"epoch": 1.596480201131364,
"grad_norm": 0.6874979138374329,
"learning_rate": 4.722490935875377e-05,
"loss": 0.3547,
"step": 7620
},
{
"epoch": 1.5996228786926463,
"grad_norm": 0.8027022480964661,
"learning_rate": 4.705554250318335e-05,
"loss": 0.3702,
"step": 7635
},
{
"epoch": 1.6027655562539285,
"grad_norm": 0.9383290410041809,
"learning_rate": 4.688620954335766e-05,
"loss": 0.4038,
"step": 7650
},
{
"epoch": 1.6059082338152106,
"grad_norm": 0.8475779294967651,
"learning_rate": 4.671691242858891e-05,
"loss": 0.3257,
"step": 7665
},
{
"epoch": 1.6090509113764928,
"grad_norm": 0.702893853187561,
"learning_rate": 4.654765310777659e-05,
"loss": 0.3642,
"step": 7680
},
{
"epoch": 1.612193588937775,
"grad_norm": 0.7762289047241211,
"learning_rate": 4.6378433529385157e-05,
"loss": 0.3859,
"step": 7695
},
{
"epoch": 1.6153362664990571,
"grad_norm": 0.7309826016426086,
"learning_rate": 4.620925564142151e-05,
"loss": 0.3427,
"step": 7710
},
{
"epoch": 1.6184789440603393,
"grad_norm": 0.655974805355072,
"learning_rate": 4.60401213914127e-05,
"loss": 0.3893,
"step": 7725
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.7434260845184326,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.3528,
"step": 7740
},
{
"epoch": 1.6247642991829039,
"grad_norm": 0.981696605682373,
"learning_rate": 4.570199159283345e-05,
"loss": 0.3792,
"step": 7755
},
{
"epoch": 1.627906976744186,
"grad_norm": 0.5884058475494385,
"learning_rate": 4.553299993671567e-05,
"loss": 0.3082,
"step": 7770
},
{
"epoch": 1.6310496543054682,
"grad_norm": 0.9349349737167358,
"learning_rate": 4.536405970341317e-05,
"loss": 0.3736,
"step": 7785
},
{
"epoch": 1.6341923318667506,
"grad_norm": 0.8422302603721619,
"learning_rate": 4.519517283771717e-05,
"loss": 0.3897,
"step": 7800
},
{
"epoch": 1.6373350094280328,
"grad_norm": 0.7569222450256348,
"learning_rate": 4.502634128380448e-05,
"loss": 0.3581,
"step": 7815
},
{
"epoch": 1.640477686989315,
"grad_norm": 0.8034069538116455,
"learning_rate": 4.4857566985215276e-05,
"loss": 0.3542,
"step": 7830
},
{
"epoch": 1.6436203645505971,
"grad_norm": 0.5547857284545898,
"learning_rate": 4.4688851884830516e-05,
"loss": 0.3089,
"step": 7845
},
{
"epoch": 1.6467630421118793,
"grad_norm": 0.8145669102668762,
"learning_rate": 4.452019792484975e-05,
"loss": 0.3391,
"step": 7860
},
{
"epoch": 1.6499057196731615,
"grad_norm": 0.672332227230072,
"learning_rate": 4.4351607046768704e-05,
"loss": 0.3866,
"step": 7875
},
{
"epoch": 1.6530483972344436,
"grad_norm": 0.7952318787574768,
"learning_rate": 4.418308119135686e-05,
"loss": 0.4221,
"step": 7890
},
{
"epoch": 1.6561910747957258,
"grad_norm": 0.7489158511161804,
"learning_rate": 4.401462229863526e-05,
"loss": 0.3687,
"step": 7905
},
{
"epoch": 1.6593337523570082,
"grad_norm": 0.8457122445106506,
"learning_rate": 4.3846232307854e-05,
"loss": 0.3888,
"step": 7920
},
{
"epoch": 1.6624764299182904,
"grad_norm": 0.7040199637413025,
"learning_rate": 4.36779131574701e-05,
"loss": 0.3437,
"step": 7935
},
{
"epoch": 1.6656191074795728,
"grad_norm": 1.0369516611099243,
"learning_rate": 4.3509666785125005e-05,
"loss": 0.3557,
"step": 7950
},
{
"epoch": 1.668761785040855,
"grad_norm": 0.7418217062950134,
"learning_rate": 4.334149512762238e-05,
"loss": 0.351,
"step": 7965
},
{
"epoch": 1.671904462602137,
"grad_norm": 0.6527841687202454,
"learning_rate": 4.3173400120905824e-05,
"loss": 0.3286,
"step": 7980
},
{
"epoch": 1.6750471401634193,
"grad_norm": 0.9062017798423767,
"learning_rate": 4.3005383700036525e-05,
"loss": 0.3828,
"step": 7995
},
{
"epoch": 1.6781898177247014,
"grad_norm": 0.6981047987937927,
"learning_rate": 4.283744779917102e-05,
"loss": 0.3689,
"step": 8010
},
{
"epoch": 1.6813324952859836,
"grad_norm": 0.8865767121315002,
"learning_rate": 4.26695943515389e-05,
"loss": 0.3912,
"step": 8025
},
{
"epoch": 1.6844751728472658,
"grad_norm": 0.5835604667663574,
"learning_rate": 4.250182528942065e-05,
"loss": 0.317,
"step": 8040
},
{
"epoch": 1.687617850408548,
"grad_norm": 0.869529128074646,
"learning_rate": 4.233414254412525e-05,
"loss": 0.4031,
"step": 8055
},
{
"epoch": 1.6907605279698303,
"grad_norm": 0.7666299939155579,
"learning_rate": 4.216654804596808e-05,
"loss": 0.3635,
"step": 8070
},
{
"epoch": 1.6939032055311125,
"grad_norm": 0.6868289709091187,
"learning_rate": 4.199904372424858e-05,
"loss": 0.3554,
"step": 8085
},
{
"epoch": 1.6970458830923947,
"grad_norm": 0.7406291961669922,
"learning_rate": 4.183163150722822e-05,
"loss": 0.3216,
"step": 8100
},
{
"epoch": 1.700188560653677,
"grad_norm": 0.7962248921394348,
"learning_rate": 4.166431332210807e-05,
"loss": 0.3398,
"step": 8115
},
{
"epoch": 1.7033312382149592,
"grad_norm": 1.02495276927948,
"learning_rate": 4.149709109500678e-05,
"loss": 0.3817,
"step": 8130
},
{
"epoch": 1.7064739157762414,
"grad_norm": 0.7741113305091858,
"learning_rate": 4.13299667509384e-05,
"loss": 0.4072,
"step": 8145
},
{
"epoch": 1.7096165933375236,
"grad_norm": 0.7952526807785034,
"learning_rate": 4.1162942213790086e-05,
"loss": 0.3441,
"step": 8160
},
{
"epoch": 1.7127592708988058,
"grad_norm": 0.7849689722061157,
"learning_rate": 4.0996019406300126e-05,
"loss": 0.3417,
"step": 8175
},
{
"epoch": 1.715901948460088,
"grad_norm": 0.7431788444519043,
"learning_rate": 4.082920025003567e-05,
"loss": 0.3995,
"step": 8190
},
{
"epoch": 1.71904462602137,
"grad_norm": 0.7709872126579285,
"learning_rate": 4.0662486665370734e-05,
"loss": 0.4069,
"step": 8205
},
{
"epoch": 1.7221873035826523,
"grad_norm": 0.6013693809509277,
"learning_rate": 4.049588057146394e-05,
"loss": 0.3877,
"step": 8220
},
{
"epoch": 1.7253299811439347,
"grad_norm": 0.7985032796859741,
"learning_rate": 4.032938388623657e-05,
"loss": 0.3407,
"step": 8235
},
{
"epoch": 1.7284726587052168,
"grad_norm": 0.6259362101554871,
"learning_rate": 4.01629985263504e-05,
"loss": 0.3167,
"step": 8250
},
{
"epoch": 1.7316153362664992,
"grad_norm": 0.7632457613945007,
"learning_rate": 3.999672640718567e-05,
"loss": 0.365,
"step": 8265
},
{
"epoch": 1.7347580138277814,
"grad_norm": 0.9532593488693237,
"learning_rate": 3.983056944281901e-05,
"loss": 0.427,
"step": 8280
},
{
"epoch": 1.7379006913890636,
"grad_norm": 0.7168596386909485,
"learning_rate": 3.966452954600142e-05,
"loss": 0.3776,
"step": 8295
},
{
"epoch": 1.7410433689503457,
"grad_norm": 0.753966748714447,
"learning_rate": 3.94986086281363e-05,
"loss": 0.3792,
"step": 8310
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.38063740730285645,
"learning_rate": 3.933280859925734e-05,
"loss": 0.3499,
"step": 8325
},
{
"epoch": 1.74732872407291,
"grad_norm": 0.8001086711883545,
"learning_rate": 3.916713136800659e-05,
"loss": 0.3491,
"step": 8340
},
{
"epoch": 1.7504714016341922,
"grad_norm": 0.7394033074378967,
"learning_rate": 3.900157884161255e-05,
"loss": 0.3383,
"step": 8355
},
{
"epoch": 1.7536140791954744,
"grad_norm": 0.7337818741798401,
"learning_rate": 3.8836152925868114e-05,
"loss": 0.3705,
"step": 8370
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.7671971917152405,
"learning_rate": 3.867085552510864e-05,
"loss": 0.3125,
"step": 8385
},
{
"epoch": 1.759899434318039,
"grad_norm": 0.8018542528152466,
"learning_rate": 3.850568854219011e-05,
"loss": 0.3678,
"step": 8400
},
{
"epoch": 1.7630421118793211,
"grad_norm": 0.8364083766937256,
"learning_rate": 3.834065387846718e-05,
"loss": 0.4179,
"step": 8415
},
{
"epoch": 1.7661847894406035,
"grad_norm": 0.8526837825775146,
"learning_rate": 3.817575343377122e-05,
"loss": 0.3881,
"step": 8430
},
{
"epoch": 1.7693274670018857,
"grad_norm": 0.6416676640510559,
"learning_rate": 3.8010989106388554e-05,
"loss": 0.3099,
"step": 8445
},
{
"epoch": 1.7724701445631679,
"grad_norm": 0.7990739941596985,
"learning_rate": 3.784636279303858e-05,
"loss": 0.3598,
"step": 8460
},
{
"epoch": 1.77561282212445,
"grad_norm": 0.8872657418251038,
"learning_rate": 3.76818763888519e-05,
"loss": 0.3882,
"step": 8475
},
{
"epoch": 1.7787554996857322,
"grad_norm": 0.8712546229362488,
"learning_rate": 3.7517531787348484e-05,
"loss": 0.3773,
"step": 8490
},
{
"epoch": 1.7818981772470144,
"grad_norm": 0.7423908710479736,
"learning_rate": 3.735333088041596e-05,
"loss": 0.3777,
"step": 8505
},
{
"epoch": 1.7850408548082966,
"grad_norm": 0.9166727066040039,
"learning_rate": 3.718927555828779e-05,
"loss": 0.4059,
"step": 8520
},
{
"epoch": 1.7881835323695787,
"grad_norm": 0.7207896113395691,
"learning_rate": 3.702536770952148e-05,
"loss": 0.3754,
"step": 8535
},
{
"epoch": 1.7913262099308611,
"grad_norm": 0.844727635383606,
"learning_rate": 3.6861609220976846e-05,
"loss": 0.3328,
"step": 8550
},
{
"epoch": 1.7944688874921433,
"grad_norm": 0.7674320340156555,
"learning_rate": 3.6698001977794366e-05,
"loss": 0.3806,
"step": 8565
},
{
"epoch": 1.7976115650534257,
"grad_norm": 0.6307094693183899,
"learning_rate": 3.6534547863373394e-05,
"loss": 0.3694,
"step": 8580
},
{
"epoch": 1.8007542426147078,
"grad_norm": 0.767432451248169,
"learning_rate": 3.63712487593505e-05,
"loss": 0.4028,
"step": 8595
},
{
"epoch": 1.80389692017599,
"grad_norm": 0.8937990665435791,
"learning_rate": 3.6208106545577824e-05,
"loss": 0.3372,
"step": 8610
},
{
"epoch": 1.8070395977372722,
"grad_norm": 0.590930163860321,
"learning_rate": 3.604512310010146e-05,
"loss": 0.3684,
"step": 8625
},
{
"epoch": 1.8101822752985544,
"grad_norm": 0.8184636831283569,
"learning_rate": 3.58823002991398e-05,
"loss": 0.373,
"step": 8640
},
{
"epoch": 1.8133249528598365,
"grad_norm": 0.9741955399513245,
"learning_rate": 3.5719640017061885e-05,
"loss": 0.3374,
"step": 8655
},
{
"epoch": 1.8164676304211187,
"grad_norm": 1.0014973878860474,
"learning_rate": 3.555714412636595e-05,
"loss": 0.3848,
"step": 8670
},
{
"epoch": 1.8196103079824009,
"grad_norm": 0.6335365772247314,
"learning_rate": 3.53948144976578e-05,
"loss": 0.3689,
"step": 8685
},
{
"epoch": 1.8227529855436833,
"grad_norm": 0.5687909722328186,
"learning_rate": 3.523265299962924e-05,
"loss": 0.4178,
"step": 8700
},
{
"epoch": 1.8258956631049654,
"grad_norm": 0.8622750043869019,
"learning_rate": 3.507066149903662e-05,
"loss": 0.3899,
"step": 8715
},
{
"epoch": 1.8290383406662476,
"grad_norm": 0.7984293699264526,
"learning_rate": 3.490884186067935e-05,
"loss": 0.4353,
"step": 8730
},
{
"epoch": 1.83218101822753,
"grad_norm": 0.7962972521781921,
"learning_rate": 3.474719594737842e-05,
"loss": 0.3324,
"step": 8745
},
{
"epoch": 1.8353236957888122,
"grad_norm": 0.7194257974624634,
"learning_rate": 3.4585725619954864e-05,
"loss": 0.3765,
"step": 8760
},
{
"epoch": 1.8384663733500943,
"grad_norm": 0.6931387782096863,
"learning_rate": 3.442443273720853e-05,
"loss": 0.3183,
"step": 8775
},
{
"epoch": 1.8416090509113765,
"grad_norm": 0.7540430426597595,
"learning_rate": 3.426331915589651e-05,
"loss": 0.3975,
"step": 8790
},
{
"epoch": 1.8447517284726587,
"grad_norm": 0.7310993671417236,
"learning_rate": 3.410238673071185e-05,
"loss": 0.3975,
"step": 8805
},
{
"epoch": 1.8478944060339408,
"grad_norm": 0.7351768612861633,
"learning_rate": 3.394163731426216e-05,
"loss": 0.3558,
"step": 8820
},
{
"epoch": 1.851037083595223,
"grad_norm": 0.7860934138298035,
"learning_rate": 3.378107275704834e-05,
"loss": 0.3601,
"step": 8835
},
{
"epoch": 1.8541797611565052,
"grad_norm": 0.6049594283103943,
"learning_rate": 3.362069490744322e-05,
"loss": 0.3692,
"step": 8850
},
{
"epoch": 1.8573224387177876,
"grad_norm": 0.9184178709983826,
"learning_rate": 3.346050561167029e-05,
"loss": 0.3518,
"step": 8865
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.7558075189590454,
"learning_rate": 3.3300506713782495e-05,
"loss": 0.3587,
"step": 8880
},
{
"epoch": 1.8636077938403521,
"grad_norm": 0.7545658349990845,
"learning_rate": 3.314070005564097e-05,
"loss": 0.3679,
"step": 8895
},
{
"epoch": 1.8667504714016343,
"grad_norm": 0.9135695695877075,
"learning_rate": 3.2981087476893853e-05,
"loss": 0.3725,
"step": 8910
},
{
"epoch": 1.8698931489629165,
"grad_norm": 0.9788998961448669,
"learning_rate": 3.2821670814955026e-05,
"loss": 0.3149,
"step": 8925
},
{
"epoch": 1.8730358265241986,
"grad_norm": 0.7953155636787415,
"learning_rate": 3.266245190498311e-05,
"loss": 0.3461,
"step": 8940
},
{
"epoch": 1.8761785040854808,
"grad_norm": 0.9166163802146912,
"learning_rate": 3.250343257986027e-05,
"loss": 0.3866,
"step": 8955
},
{
"epoch": 1.879321181646763,
"grad_norm": 0.9379754066467285,
"learning_rate": 3.2344614670171025e-05,
"loss": 0.3928,
"step": 8970
},
{
"epoch": 1.8824638592080452,
"grad_norm": 0.8782539963722229,
"learning_rate": 3.2186000004181314e-05,
"loss": 0.3959,
"step": 8985
},
{
"epoch": 1.8856065367693273,
"grad_norm": 0.7237117886543274,
"learning_rate": 3.2027590407817407e-05,
"loss": 0.3458,
"step": 9000
},
{
"epoch": 1.8887492143306097,
"grad_norm": 0.8787809014320374,
"learning_rate": 3.186938770464486e-05,
"loss": 0.4081,
"step": 9015
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.7628602981567383,
"learning_rate": 3.1711393715847476e-05,
"loss": 0.3928,
"step": 9030
},
{
"epoch": 1.895034569453174,
"grad_norm": 0.9172194600105286,
"learning_rate": 3.15536102602065e-05,
"loss": 0.3777,
"step": 9045
},
{
"epoch": 1.8981772470144564,
"grad_norm": 0.8413445353507996,
"learning_rate": 3.13960391540795e-05,
"loss": 0.36,
"step": 9060
},
{
"epoch": 1.9013199245757386,
"grad_norm": 0.9793257117271423,
"learning_rate": 3.1238682211379586e-05,
"loss": 0.3801,
"step": 9075
},
{
"epoch": 1.9044626021370208,
"grad_norm": 0.7620652318000793,
"learning_rate": 3.1081541243554427e-05,
"loss": 0.3689,
"step": 9090
},
{
"epoch": 1.907605279698303,
"grad_norm": 0.8353012800216675,
"learning_rate": 3.092461805956551e-05,
"loss": 0.3961,
"step": 9105
},
{
"epoch": 1.9107479572595851,
"grad_norm": 0.8704758882522583,
"learning_rate": 3.0767914465867246e-05,
"loss": 0.3168,
"step": 9120
},
{
"epoch": 1.9138906348208673,
"grad_norm": 0.6754759550094604,
"learning_rate": 3.061143226638611e-05,
"loss": 0.3407,
"step": 9135
},
{
"epoch": 1.9170333123821495,
"grad_norm": 0.9682889580726624,
"learning_rate": 3.0455173262500093e-05,
"loss": 0.4251,
"step": 9150
},
{
"epoch": 1.9201759899434316,
"grad_norm": 0.8114556670188904,
"learning_rate": 3.0299139253017695e-05,
"loss": 0.3397,
"step": 9165
},
{
"epoch": 1.923318667504714,
"grad_norm": 0.8123522996902466,
"learning_rate": 3.014333203415741e-05,
"loss": 0.3372,
"step": 9180
},
{
"epoch": 1.9264613450659962,
"grad_norm": 0.6080268025398254,
"learning_rate": 2.9987753399526934e-05,
"loss": 0.3506,
"step": 9195
},
{
"epoch": 1.9296040226272786,
"grad_norm": 0.8804168701171875,
"learning_rate": 2.9832405140102637e-05,
"loss": 0.3689,
"step": 9210
},
{
"epoch": 1.9327467001885608,
"grad_norm": 0.8579033613204956,
"learning_rate": 2.9677289044208833e-05,
"loss": 0.3875,
"step": 9225
},
{
"epoch": 1.935889377749843,
"grad_norm": 0.9520317316055298,
"learning_rate": 2.952240689749722e-05,
"loss": 0.422,
"step": 9240
},
{
"epoch": 1.939032055311125,
"grad_norm": 0.9517824053764343,
"learning_rate": 2.9367760482926393e-05,
"loss": 0.3917,
"step": 9255
},
{
"epoch": 1.9421747328724073,
"grad_norm": 0.8813058733940125,
"learning_rate": 2.921335158074122e-05,
"loss": 0.3551,
"step": 9270
},
{
"epoch": 1.9453174104336894,
"grad_norm": 0.8402652144432068,
"learning_rate": 2.905918196845242e-05,
"loss": 0.3468,
"step": 9285
},
{
"epoch": 1.9484600879949716,
"grad_norm": 0.855032205581665,
"learning_rate": 2.8905253420816035e-05,
"loss": 0.3534,
"step": 9300
},
{
"epoch": 1.9516027655562538,
"grad_norm": 0.7760915756225586,
"learning_rate": 2.875156770981311e-05,
"loss": 0.348,
"step": 9315
},
{
"epoch": 1.9547454431175362,
"grad_norm": 0.946934163570404,
"learning_rate": 2.8598126604629195e-05,
"loss": 0.3556,
"step": 9330
},
{
"epoch": 1.9578881206788183,
"grad_norm": 0.7589976191520691,
"learning_rate": 2.844493187163395e-05,
"loss": 0.3944,
"step": 9345
},
{
"epoch": 1.9610307982401005,
"grad_norm": 0.8831868171691895,
"learning_rate": 2.8291985274360983e-05,
"loss": 0.3192,
"step": 9360
},
{
"epoch": 1.964173475801383,
"grad_norm": 0.8260477781295776,
"learning_rate": 2.8139288573487337e-05,
"loss": 0.3476,
"step": 9375
},
{
"epoch": 1.967316153362665,
"grad_norm": 0.9583712816238403,
"learning_rate": 2.7986843526813343e-05,
"loss": 0.3112,
"step": 9390
},
{
"epoch": 1.9704588309239472,
"grad_norm": 0.8534590005874634,
"learning_rate": 2.783465188924239e-05,
"loss": 0.3738,
"step": 9405
},
{
"epoch": 1.9736015084852294,
"grad_norm": 0.8562766909599304,
"learning_rate": 2.7682715412760696e-05,
"loss": 0.3831,
"step": 9420
},
{
"epoch": 1.9767441860465116,
"grad_norm": 0.649868905544281,
"learning_rate": 2.7531035846417107e-05,
"loss": 0.379,
"step": 9435
},
{
"epoch": 1.9798868636077938,
"grad_norm": 0.7702896595001221,
"learning_rate": 2.7379614936302982e-05,
"loss": 0.3617,
"step": 9450
},
{
"epoch": 1.983029541169076,
"grad_norm": 0.9378584623336792,
"learning_rate": 2.7228454425532157e-05,
"loss": 0.3681,
"step": 9465
},
{
"epoch": 1.9861722187303583,
"grad_norm": 1.0069222450256348,
"learning_rate": 2.7077556054220804e-05,
"loss": 0.3356,
"step": 9480
},
{
"epoch": 1.9893148962916405,
"grad_norm": 0.9345496892929077,
"learning_rate": 2.6926921559467412e-05,
"loss": 0.3974,
"step": 9495
},
{
"epoch": 1.9924575738529227,
"grad_norm": 0.8090453147888184,
"learning_rate": 2.6776552675332768e-05,
"loss": 0.3397,
"step": 9510
},
{
"epoch": 1.995600251414205,
"grad_norm": 0.647416353225708,
"learning_rate": 2.6626451132820085e-05,
"loss": 0.3259,
"step": 9525
},
{
"epoch": 1.9987429289754872,
"grad_norm": 0.7810280323028564,
"learning_rate": 2.6476618659855023e-05,
"loss": 0.3234,
"step": 9540
},
{
"epoch": 2.0018856065367694,
"grad_norm": 0.7231355309486389,
"learning_rate": 2.6327056981265708e-05,
"loss": 0.3276,
"step": 9555
},
{
"epoch": 2.0050282840980516,
"grad_norm": 0.7072864174842834,
"learning_rate": 2.6177767818763062e-05,
"loss": 0.2683,
"step": 9570
},
{
"epoch": 2.0081709616593337,
"grad_norm": 0.8502817749977112,
"learning_rate": 2.6028752890920783e-05,
"loss": 0.2844,
"step": 9585
},
{
"epoch": 2.011313639220616,
"grad_norm": 0.6001257300376892,
"learning_rate": 2.5880013913155743e-05,
"loss": 0.2582,
"step": 9600
},
{
"epoch": 2.014456316781898,
"grad_norm": 1.037467360496521,
"learning_rate": 2.5731552597708086e-05,
"loss": 0.2666,
"step": 9615
},
{
"epoch": 2.0175989943431802,
"grad_norm": 0.990047812461853,
"learning_rate": 2.5583370653621652e-05,
"loss": 0.3042,
"step": 9630
},
{
"epoch": 2.0207416719044624,
"grad_norm": 1.0518317222595215,
"learning_rate": 2.5435469786724204e-05,
"loss": 0.2543,
"step": 9645
},
{
"epoch": 2.023884349465745,
"grad_norm": 1.225774884223938,
"learning_rate": 2.528785169960779e-05,
"loss": 0.3183,
"step": 9660
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.9525572061538696,
"learning_rate": 2.5140518091609256e-05,
"loss": 0.3426,
"step": 9675
},
{
"epoch": 2.0301697045883094,
"grad_norm": 1.0750566720962524,
"learning_rate": 2.4993470658790573e-05,
"loss": 0.3172,
"step": 9690
},
{
"epoch": 2.0333123821495915,
"grad_norm": 0.8268773555755615,
"learning_rate": 2.484671109391933e-05,
"loss": 0.31,
"step": 9705
},
{
"epoch": 2.0364550597108737,
"grad_norm": 0.679678201675415,
"learning_rate": 2.470024108644925e-05,
"loss": 0.2868,
"step": 9720
},
{
"epoch": 2.039597737272156,
"grad_norm": 0.997440755367279,
"learning_rate": 2.4554062322500797e-05,
"loss": 0.3291,
"step": 9735
},
{
"epoch": 2.042740414833438,
"grad_norm": 0.9968817830085754,
"learning_rate": 2.4408176484841732e-05,
"loss": 0.2664,
"step": 9750
},
{
"epoch": 2.04588309239472,
"grad_norm": 1.0939124822616577,
"learning_rate": 2.4262585252867686e-05,
"loss": 0.2895,
"step": 9765
},
{
"epoch": 2.0490257699560024,
"grad_norm": 1.0220900774002075,
"learning_rate": 2.4117290302582872e-05,
"loss": 0.3191,
"step": 9780
},
{
"epoch": 2.0521684475172846,
"grad_norm": 0.635898768901825,
"learning_rate": 2.397229330658084e-05,
"loss": 0.307,
"step": 9795
},
{
"epoch": 2.0553111250785667,
"grad_norm": 1.112257719039917,
"learning_rate": 2.382759593402517e-05,
"loss": 0.2748,
"step": 9810
},
{
"epoch": 2.0584538026398493,
"grad_norm": 0.9440275430679321,
"learning_rate": 2.3683199850630213e-05,
"loss": 0.2893,
"step": 9825
},
{
"epoch": 2.0615964802011315,
"grad_norm": 1.2118226289749146,
"learning_rate": 2.3539106718642034e-05,
"loss": 0.2791,
"step": 9840
},
{
"epoch": 2.0647391577624137,
"grad_norm": 1.1374374628067017,
"learning_rate": 2.339531819681914e-05,
"loss": 0.2777,
"step": 9855
},
{
"epoch": 2.067881835323696,
"grad_norm": 0.6932136416435242,
"learning_rate": 2.3251835940413517e-05,
"loss": 0.2828,
"step": 9870
},
{
"epoch": 2.071024512884978,
"grad_norm": 1.0308489799499512,
"learning_rate": 2.310866160115146e-05,
"loss": 0.2947,
"step": 9885
},
{
"epoch": 2.07416719044626,
"grad_norm": 1.063235878944397,
"learning_rate": 2.2965796827214665e-05,
"loss": 0.3204,
"step": 9900
},
{
"epoch": 2.0773098680075424,
"grad_norm": 1.1612193584442139,
"learning_rate": 2.282324326322115e-05,
"loss": 0.2976,
"step": 9915
},
{
"epoch": 2.0804525455688245,
"grad_norm": 0.8928938508033752,
"learning_rate": 2.2681002550206355e-05,
"loss": 0.2921,
"step": 9930
},
{
"epoch": 2.0835952231301067,
"grad_norm": 1.066124677658081,
"learning_rate": 2.253907632560439e-05,
"loss": 0.298,
"step": 9945
},
{
"epoch": 2.086737900691389,
"grad_norm": 0.8713576197624207,
"learning_rate": 2.2397466223228947e-05,
"loss": 0.275,
"step": 9960
},
{
"epoch": 2.0898805782526715,
"grad_norm": 1.1056296825408936,
"learning_rate": 2.2256173873254643e-05,
"loss": 0.3266,
"step": 9975
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.9172502160072327,
"learning_rate": 2.211520090219821e-05,
"loss": 0.2731,
"step": 9990
},
{
"epoch": 2.0951183741881416,
"eval_accuracy": 0.009820309467613697,
"eval_loss": 0.4190310835838318,
"eval_runtime": 424.9528,
"eval_samples_per_second": 11.26,
"eval_steps_per_second": 2.817,
"step": 10000
},
{
"epoch": 2.096165933375236,
"grad_norm": 0.9003602862358093,
"learning_rate": 2.1974548932899814e-05,
"loss": 0.2534,
"step": 10005
},
{
"epoch": 2.099308610936518,
"grad_norm": 1.0138850212097168,
"learning_rate": 2.1834219584504345e-05,
"loss": 0.2847,
"step": 10020
},
{
"epoch": 2.1024512884978,
"grad_norm": 0.8467048406600952,
"learning_rate": 2.169421447244272e-05,
"loss": 0.3011,
"step": 10035
},
{
"epoch": 2.1055939660590823,
"grad_norm": 1.1273193359375,
"learning_rate": 2.1554535208413406e-05,
"loss": 0.3181,
"step": 10050
},
{
"epoch": 2.1087366436203645,
"grad_norm": 1.1201776266098022,
"learning_rate": 2.1415183400363748e-05,
"loss": 0.3122,
"step": 10065
},
{
"epoch": 2.1118793211816467,
"grad_norm": 1.0749905109405518,
"learning_rate": 2.1276160652471555e-05,
"loss": 0.3357,
"step": 10080
},
{
"epoch": 2.115021998742929,
"grad_norm": 0.874462366104126,
"learning_rate": 2.1137468565126543e-05,
"loss": 0.3014,
"step": 10095
},
{
"epoch": 2.118164676304211,
"grad_norm": 1.0569285154342651,
"learning_rate": 2.099910873491202e-05,
"loss": 0.2945,
"step": 10110
},
{
"epoch": 2.121307353865493,
"grad_norm": 0.9067788124084473,
"learning_rate": 2.0861082754586382e-05,
"loss": 0.3218,
"step": 10125
},
{
"epoch": 2.124450031426776,
"grad_norm": 1.2187013626098633,
"learning_rate": 2.0723392213064884e-05,
"loss": 0.3065,
"step": 10140
},
{
"epoch": 2.127592708988058,
"grad_norm": 1.0931589603424072,
"learning_rate": 2.0586038695401317e-05,
"loss": 0.2792,
"step": 10155
},
{
"epoch": 2.13073538654934,
"grad_norm": 1.2825082540512085,
"learning_rate": 2.0449023782769706e-05,
"loss": 0.3138,
"step": 10170
},
{
"epoch": 2.1338780641106223,
"grad_norm": 1.0086079835891724,
"learning_rate": 2.031234905244618e-05,
"loss": 0.3079,
"step": 10185
},
{
"epoch": 2.1370207416719045,
"grad_norm": 0.7740280032157898,
"learning_rate": 2.017601607779074e-05,
"loss": 0.2704,
"step": 10200
},
{
"epoch": 2.1401634192331866,
"grad_norm": 0.7861264944076538,
"learning_rate": 2.0040026428229313e-05,
"loss": 0.296,
"step": 10215
},
{
"epoch": 2.143306096794469,
"grad_norm": 0.8179210424423218,
"learning_rate": 1.9904381669235456e-05,
"loss": 0.296,
"step": 10230
},
{
"epoch": 2.146448774355751,
"grad_norm": 1.410079002380371,
"learning_rate": 1.976908336231245e-05,
"loss": 0.2836,
"step": 10245
},
{
"epoch": 2.149591451917033,
"grad_norm": 1.082899570465088,
"learning_rate": 1.9634133064975402e-05,
"loss": 0.2848,
"step": 10260
},
{
"epoch": 2.1527341294783153,
"grad_norm": 0.9219628572463989,
"learning_rate": 1.9499532330733135e-05,
"loss": 0.3255,
"step": 10275
},
{
"epoch": 2.155876807039598,
"grad_norm": 0.9849101901054382,
"learning_rate": 1.9365282709070487e-05,
"loss": 0.3336,
"step": 10290
},
{
"epoch": 2.15901948460088,
"grad_norm": 0.8761511445045471,
"learning_rate": 1.9231385745430308e-05,
"loss": 0.3128,
"step": 10305
},
{
"epoch": 2.1621621621621623,
"grad_norm": 1.1564205884933472,
"learning_rate": 1.9097842981195834e-05,
"loss": 0.291,
"step": 10320
},
{
"epoch": 2.1653048397234445,
"grad_norm": 0.6984158158302307,
"learning_rate": 1.8964655953672784e-05,
"loss": 0.2761,
"step": 10335
},
{
"epoch": 2.1684475172847266,
"grad_norm": 0.7349433898925781,
"learning_rate": 1.883182619607179e-05,
"loss": 0.3066,
"step": 10350
},
{
"epoch": 2.171590194846009,
"grad_norm": 0.9663205742835999,
"learning_rate": 1.8699355237490694e-05,
"loss": 0.2644,
"step": 10365
},
{
"epoch": 2.174732872407291,
"grad_norm": 1.194226861000061,
"learning_rate": 1.856724460289692e-05,
"loss": 0.3112,
"step": 10380
},
{
"epoch": 2.177875549968573,
"grad_norm": 1.0187724828720093,
"learning_rate": 1.8435495813109938e-05,
"loss": 0.2779,
"step": 10395
},
{
"epoch": 2.1810182275298553,
"grad_norm": 0.7448340654373169,
"learning_rate": 1.8304110384783806e-05,
"loss": 0.2723,
"step": 10410
},
{
"epoch": 2.1841609050911375,
"grad_norm": 1.0969903469085693,
"learning_rate": 1.8173089830389662e-05,
"loss": 0.2824,
"step": 10425
},
{
"epoch": 2.1873035826524196,
"grad_norm": 1.0222073793411255,
"learning_rate": 1.8042435658198286e-05,
"loss": 0.303,
"step": 10440
},
{
"epoch": 2.1904462602137023,
"grad_norm": 0.9316915273666382,
"learning_rate": 1.7912149372262793e-05,
"loss": 0.2562,
"step": 10455
},
{
"epoch": 2.1935889377749844,
"grad_norm": 0.6998715996742249,
"learning_rate": 1.77822324724013e-05,
"loss": 0.298,
"step": 10470
},
{
"epoch": 2.1967316153362666,
"grad_norm": 0.9719591736793518,
"learning_rate": 1.7652686454179686e-05,
"loss": 0.2887,
"step": 10485
},
{
"epoch": 2.1998742928975488,
"grad_norm": 0.8645143508911133,
"learning_rate": 1.7523512808894288e-05,
"loss": 0.2532,
"step": 10500
},
{
"epoch": 2.203016970458831,
"grad_norm": 1.1070195436477661,
"learning_rate": 1.739471302355482e-05,
"loss": 0.2999,
"step": 10515
},
{
"epoch": 2.206159648020113,
"grad_norm": 0.8601672053337097,
"learning_rate": 1.7266288580867258e-05,
"loss": 0.3209,
"step": 10530
},
{
"epoch": 2.2093023255813953,
"grad_norm": 1.0818884372711182,
"learning_rate": 1.713824095921668e-05,
"loss": 0.3079,
"step": 10545
},
{
"epoch": 2.2124450031426774,
"grad_norm": 0.7250615954399109,
"learning_rate": 1.701057163265038e-05,
"loss": 0.3364,
"step": 10560
},
{
"epoch": 2.2155876807039596,
"grad_norm": 0.9716282486915588,
"learning_rate": 1.6883282070860763e-05,
"loss": 0.2898,
"step": 10575
},
{
"epoch": 2.218730358265242,
"grad_norm": 1.0294605493545532,
"learning_rate": 1.675637373916855e-05,
"loss": 0.3075,
"step": 10590
},
{
"epoch": 2.2218730358265244,
"grad_norm": 1.0724180936813354,
"learning_rate": 1.662984809850579e-05,
"loss": 0.3068,
"step": 10605
},
{
"epoch": 2.2250157133878066,
"grad_norm": 0.9719418883323669,
"learning_rate": 1.6503706605399156e-05,
"loss": 0.3153,
"step": 10620
},
{
"epoch": 2.2281583909490887,
"grad_norm": 0.8698229193687439,
"learning_rate": 1.6377950711953115e-05,
"loss": 0.2597,
"step": 10635
},
{
"epoch": 2.231301068510371,
"grad_norm": 0.9012719988822937,
"learning_rate": 1.6252581865833198e-05,
"loss": 0.3284,
"step": 10650
},
{
"epoch": 2.234443746071653,
"grad_norm": 0.8515365123748779,
"learning_rate": 1.612760151024936e-05,
"loss": 0.3147,
"step": 10665
},
{
"epoch": 2.2375864236329353,
"grad_norm": 1.1416083574295044,
"learning_rate": 1.6003011083939396e-05,
"loss": 0.2958,
"step": 10680
},
{
"epoch": 2.2407291011942174,
"grad_norm": 0.9006314873695374,
"learning_rate": 1.5878812021152334e-05,
"loss": 0.2757,
"step": 10695
},
{
"epoch": 2.2438717787554996,
"grad_norm": 1.1663639545440674,
"learning_rate": 1.5755005751631922e-05,
"loss": 0.3064,
"step": 10710
},
{
"epoch": 2.2470144563167818,
"grad_norm": 1.0664478540420532,
"learning_rate": 1.563159370060019e-05,
"loss": 0.2878,
"step": 10725
},
{
"epoch": 2.250157133878064,
"grad_norm": 0.7780718207359314,
"learning_rate": 1.5508577288741056e-05,
"loss": 0.3065,
"step": 10740
},
{
"epoch": 2.253299811439346,
"grad_norm": 1.1266307830810547,
"learning_rate": 1.5385957932183954e-05,
"loss": 0.3004,
"step": 10755
},
{
"epoch": 2.2564424890006287,
"grad_norm": 0.7767760157585144,
"learning_rate": 1.5263737042487514e-05,
"loss": 0.291,
"step": 10770
},
{
"epoch": 2.259585166561911,
"grad_norm": 0.6928930878639221,
"learning_rate": 1.514191602662332e-05,
"loss": 0.2945,
"step": 10785
},
{
"epoch": 2.262727844123193,
"grad_norm": 1.177262544631958,
"learning_rate": 1.5020496286959752e-05,
"loss": 0.3168,
"step": 10800
},
{
"epoch": 2.2658705216844752,
"grad_norm": 1.1784379482269287,
"learning_rate": 1.4899479221245827e-05,
"loss": 0.342,
"step": 10815
},
{
"epoch": 2.2690131992457574,
"grad_norm": 1.4985358715057373,
"learning_rate": 1.477886622259504e-05,
"loss": 0.3073,
"step": 10830
},
{
"epoch": 2.2721558768070396,
"grad_norm": 1.0009207725524902,
"learning_rate": 1.4658658679469445e-05,
"loss": 0.2888,
"step": 10845
},
{
"epoch": 2.2752985543683217,
"grad_norm": 1.0263885259628296,
"learning_rate": 1.4538857975663567e-05,
"loss": 0.3153,
"step": 10860
},
{
"epoch": 2.278441231929604,
"grad_norm": 0.8072161078453064,
"learning_rate": 1.4419465490288508e-05,
"loss": 0.2481,
"step": 10875
},
{
"epoch": 2.281583909490886,
"grad_norm": 0.8211586475372314,
"learning_rate": 1.430048259775611e-05,
"loss": 0.2738,
"step": 10890
},
{
"epoch": 2.2847265870521687,
"grad_norm": 1.0490375757217407,
"learning_rate": 1.418191066776311e-05,
"loss": 0.3005,
"step": 10905
},
{
"epoch": 2.287869264613451,
"grad_norm": 0.9059322476387024,
"learning_rate": 1.4063751065275315e-05,
"loss": 0.2578,
"step": 10920
},
{
"epoch": 2.291011942174733,
"grad_norm": 0.9448453187942505,
"learning_rate": 1.3946005150511948e-05,
"loss": 0.3033,
"step": 10935
},
{
"epoch": 2.294154619736015,
"grad_norm": 0.9595757126808167,
"learning_rate": 1.3828674278930009e-05,
"loss": 0.3092,
"step": 10950
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.6836899518966675,
"learning_rate": 1.371175980120864e-05,
"loss": 0.2354,
"step": 10965
},
{
"epoch": 2.3004399748585795,
"grad_norm": 1.1870014667510986,
"learning_rate": 1.3595263063233538e-05,
"loss": 0.339,
"step": 10980
},
{
"epoch": 2.3035826524198617,
"grad_norm": 0.9335547685623169,
"learning_rate": 1.3479185406081519e-05,
"loss": 0.2667,
"step": 10995
},
{
"epoch": 2.306725329981144,
"grad_norm": 1.0864135026931763,
"learning_rate": 1.3363528166005068e-05,
"loss": 0.2993,
"step": 11010
},
{
"epoch": 2.309868007542426,
"grad_norm": 1.3026399612426758,
"learning_rate": 1.3248292674416968e-05,
"loss": 0.2838,
"step": 11025
},
{
"epoch": 2.313010685103708,
"grad_norm": 0.7582332491874695,
"learning_rate": 1.3133480257874902e-05,
"loss": 0.2746,
"step": 11040
},
{
"epoch": 2.3161533626649904,
"grad_norm": 1.0766429901123047,
"learning_rate": 1.3019092238066304e-05,
"loss": 0.2915,
"step": 11055
},
{
"epoch": 2.3192960402262726,
"grad_norm": 0.7966647148132324,
"learning_rate": 1.2905129931793009e-05,
"loss": 0.2586,
"step": 11070
},
{
"epoch": 2.322438717787555,
"grad_norm": 1.0455411672592163,
"learning_rate": 1.2791594650956212e-05,
"loss": 0.2867,
"step": 11085
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.9847836494445801,
"learning_rate": 1.267848770254127e-05,
"loss": 0.3219,
"step": 11100
},
{
"epoch": 2.3287240729101195,
"grad_norm": 0.9694182276725769,
"learning_rate": 1.256581038860275e-05,
"loss": 0.2558,
"step": 11115
},
{
"epoch": 2.3318667504714017,
"grad_norm": 1.4064688682556152,
"learning_rate": 1.2453564006249352e-05,
"loss": 0.2609,
"step": 11130
},
{
"epoch": 2.335009428032684,
"grad_norm": 0.8352707028388977,
"learning_rate": 1.2341749847628997e-05,
"loss": 0.2985,
"step": 11145
},
{
"epoch": 2.338152105593966,
"grad_norm": 1.016571044921875,
"learning_rate": 1.2230369199914066e-05,
"loss": 0.2673,
"step": 11160
},
{
"epoch": 2.341294783155248,
"grad_norm": 0.9296002984046936,
"learning_rate": 1.211942334528639e-05,
"loss": 0.2685,
"step": 11175
},
{
"epoch": 2.3444374607165304,
"grad_norm": 1.4591748714447021,
"learning_rate": 1.200891356092263e-05,
"loss": 0.2773,
"step": 11190
},
{
"epoch": 2.3475801382778125,
"grad_norm": 0.9775596261024475,
"learning_rate": 1.1898841118979504e-05,
"loss": 0.2976,
"step": 11205
},
{
"epoch": 2.350722815839095,
"grad_norm": 1.2126258611679077,
"learning_rate": 1.1789207286579201e-05,
"loss": 0.3298,
"step": 11220
},
{
"epoch": 2.3538654934003773,
"grad_norm": 1.3125213384628296,
"learning_rate": 1.1680013325794776e-05,
"loss": 0.2639,
"step": 11235
},
{
"epoch": 2.3570081709616595,
"grad_norm": 1.0396140813827515,
"learning_rate": 1.1571260493635561e-05,
"loss": 0.292,
"step": 11250
},
{
"epoch": 2.3601508485229417,
"grad_norm": 0.9269897937774658,
"learning_rate": 1.1462950042032767e-05,
"loss": 0.3426,
"step": 11265
},
{
"epoch": 2.363293526084224,
"grad_norm": 1.1665176153182983,
"learning_rate": 1.1355083217825052e-05,
"loss": 0.2794,
"step": 11280
},
{
"epoch": 2.366436203645506,
"grad_norm": 1.0097540616989136,
"learning_rate": 1.1247661262744175e-05,
"loss": 0.2986,
"step": 11295
},
{
"epoch": 2.369578881206788,
"grad_norm": 1.1132863759994507,
"learning_rate": 1.1140685413400648e-05,
"loss": 0.3229,
"step": 11310
},
{
"epoch": 2.3727215587680703,
"grad_norm": 1.2184104919433594,
"learning_rate": 1.1034156901269598e-05,
"loss": 0.2708,
"step": 11325
},
{
"epoch": 2.3758642363293525,
"grad_norm": 1.0664645433425903,
"learning_rate": 1.0928076952676474e-05,
"loss": 0.2728,
"step": 11340
},
{
"epoch": 2.3790069138906347,
"grad_norm": 1.2971463203430176,
"learning_rate": 1.0822446788783058e-05,
"loss": 0.3048,
"step": 11355
},
{
"epoch": 2.382149591451917,
"grad_norm": 0.9727672338485718,
"learning_rate": 1.0717267625573279e-05,
"loss": 0.2918,
"step": 11370
},
{
"epoch": 2.385292269013199,
"grad_norm": 1.0206960439682007,
"learning_rate": 1.0612540673839322e-05,
"loss": 0.2885,
"step": 11385
},
{
"epoch": 2.3884349465744816,
"grad_norm": 1.1079341173171997,
"learning_rate": 1.0508267139167615e-05,
"loss": 0.309,
"step": 11400
},
{
"epoch": 2.391577624135764,
"grad_norm": 1.1144444942474365,
"learning_rate": 1.0404448221924961e-05,
"loss": 0.2268,
"step": 11415
},
{
"epoch": 2.394720301697046,
"grad_norm": 1.1846858263015747,
"learning_rate": 1.030108511724483e-05,
"loss": 0.2822,
"step": 11430
},
{
"epoch": 2.397862979258328,
"grad_norm": 1.063310146331787,
"learning_rate": 1.019817901501341e-05,
"loss": 0.2883,
"step": 11445
},
{
"epoch": 2.4010056568196103,
"grad_norm": 1.1355246305465698,
"learning_rate": 1.0095731099856049e-05,
"loss": 0.2975,
"step": 11460
},
{
"epoch": 2.4041483343808925,
"grad_norm": 1.017663836479187,
"learning_rate": 9.993742551123558e-06,
"loss": 0.2883,
"step": 11475
},
{
"epoch": 2.4072910119421747,
"grad_norm": 1.3695423603057861,
"learning_rate": 9.892214542878686e-06,
"loss": 0.343,
"step": 11490
},
{
"epoch": 2.410433689503457,
"grad_norm": 1.0663484334945679,
"learning_rate": 9.79114824388257e-06,
"loss": 0.26,
"step": 11505
},
{
"epoch": 2.413576367064739,
"grad_norm": 1.0500160455703735,
"learning_rate": 9.690544817581243e-06,
"loss": 0.2877,
"step": 11520
},
{
"epoch": 2.4167190446260216,
"grad_norm": 1.0720367431640625,
"learning_rate": 9.590405422092336e-06,
"loss": 0.2561,
"step": 11535
},
{
"epoch": 2.4198617221873038,
"grad_norm": 0.9935043454170227,
"learning_rate": 9.49073121019164e-06,
"loss": 0.2764,
"step": 11550
},
{
"epoch": 2.423004399748586,
"grad_norm": 1.2285892963409424,
"learning_rate": 9.391523329299928e-06,
"loss": 0.303,
"step": 11565
},
{
"epoch": 2.426147077309868,
"grad_norm": 1.2495083808898926,
"learning_rate": 9.292782921469673e-06,
"loss": 0.3252,
"step": 11580
},
{
"epoch": 2.4292897548711503,
"grad_norm": 1.0354247093200684,
"learning_rate": 9.194511123371963e-06,
"loss": 0.2692,
"step": 11595
},
{
"epoch": 2.4324324324324325,
"grad_norm": 1.0744938850402832,
"learning_rate": 9.096709066283354e-06,
"loss": 0.2793,
"step": 11610
},
{
"epoch": 2.4355751099937146,
"grad_norm": 1.145193338394165,
"learning_rate": 8.9993778760729e-06,
"loss": 0.3108,
"step": 11625
},
{
"epoch": 2.438717787554997,
"grad_norm": 0.7168245911598206,
"learning_rate": 8.902518673189192e-06,
"loss": 0.3088,
"step": 11640
},
{
"epoch": 2.441860465116279,
"grad_norm": 0.9759941697120667,
"learning_rate": 8.806132572647386e-06,
"loss": 0.2771,
"step": 11655
},
{
"epoch": 2.445003142677561,
"grad_norm": 0.9443902373313904,
"learning_rate": 8.710220684016462e-06,
"loss": 0.2593,
"step": 11670
},
{
"epoch": 2.4481458202388433,
"grad_norm": 0.9628651142120361,
"learning_rate": 8.614784111406365e-06,
"loss": 0.267,
"step": 11685
},
{
"epoch": 2.4512884978001255,
"grad_norm": 1.0149531364440918,
"learning_rate": 8.519823953455424e-06,
"loss": 0.2929,
"step": 11700
},
{
"epoch": 2.454431175361408,
"grad_norm": 0.9107941389083862,
"learning_rate": 8.425341303317536e-06,
"loss": 0.2911,
"step": 11715
},
{
"epoch": 2.4575738529226903,
"grad_norm": 1.1681251525878906,
"learning_rate": 8.33133724864969e-06,
"loss": 0.2939,
"step": 11730
},
{
"epoch": 2.4607165304839724,
"grad_norm": 0.8774799704551697,
"learning_rate": 8.237812871599448e-06,
"loss": 0.2612,
"step": 11745
},
{
"epoch": 2.4638592080452546,
"grad_norm": 0.8654860854148865,
"learning_rate": 8.144769248792417e-06,
"loss": 0.2924,
"step": 11760
},
{
"epoch": 2.4670018856065368,
"grad_norm": 1.062782645225525,
"learning_rate": 8.052207451319954e-06,
"loss": 0.2466,
"step": 11775
},
{
"epoch": 2.470144563167819,
"grad_norm": 0.8732921481132507,
"learning_rate": 7.960128544726724e-06,
"loss": 0.2318,
"step": 11790
},
{
"epoch": 2.473287240729101,
"grad_norm": 1.191798210144043,
"learning_rate": 7.86853358899855e-06,
"loss": 0.3097,
"step": 11805
},
{
"epoch": 2.4764299182903833,
"grad_norm": 0.9445894360542297,
"learning_rate": 7.777423638550096e-06,
"loss": 0.2935,
"step": 11820
},
{
"epoch": 2.4795725958516655,
"grad_norm": 0.9677672386169434,
"learning_rate": 7.68679974221282e-06,
"loss": 0.2949,
"step": 11835
},
{
"epoch": 2.482715273412948,
"grad_norm": 0.756100058555603,
"learning_rate": 7.596662943222877e-06,
"loss": 0.2685,
"step": 11850
},
{
"epoch": 2.4858579509742302,
"grad_norm": 1.2218337059020996,
"learning_rate": 7.507014279209057e-06,
"loss": 0.3395,
"step": 11865
},
{
"epoch": 2.4890006285355124,
"grad_norm": 1.1206847429275513,
"learning_rate": 7.417854782180894e-06,
"loss": 0.2641,
"step": 11880
},
{
"epoch": 2.4921433060967946,
"grad_norm": 1.095615029335022,
"learning_rate": 7.329185478516798e-06,
"loss": 0.3021,
"step": 11895
},
{
"epoch": 2.4952859836580767,
"grad_norm": 0.9641756415367126,
"learning_rate": 7.241007388952209e-06,
"loss": 0.2847,
"step": 11910
},
{
"epoch": 2.498428661219359,
"grad_norm": 0.9637003540992737,
"learning_rate": 7.153321528567819e-06,
"loss": 0.2775,
"step": 11925
},
{
"epoch": 2.501571338780641,
"grad_norm": 0.8976852297782898,
"learning_rate": 7.066128906777941e-06,
"loss": 0.2636,
"step": 11940
},
{
"epoch": 2.5047140163419233,
"grad_norm": 1.006549596786499,
"learning_rate": 6.97943052731887e-06,
"loss": 0.2616,
"step": 11955
},
{
"epoch": 2.5078566939032054,
"grad_norm": 1.004257321357727,
"learning_rate": 6.893227388237345e-06,
"loss": 0.2579,
"step": 11970
},
{
"epoch": 2.5109993714644876,
"grad_norm": 0.8972447514533997,
"learning_rate": 6.807520481879004e-06,
"loss": 0.2469,
"step": 11985
},
{
"epoch": 2.5141420490257698,
"grad_norm": 0.8245068192481995,
"learning_rate": 6.722310794877002e-06,
"loss": 0.3258,
"step": 12000
},
{
"epoch": 2.517284726587052,
"grad_norm": 1.2819231748580933,
"learning_rate": 6.637599308140685e-06,
"loss": 0.2503,
"step": 12015
},
{
"epoch": 2.520427404148334,
"grad_norm": 0.9961299896240234,
"learning_rate": 6.553386996844208e-06,
"loss": 0.2766,
"step": 12030
},
{
"epoch": 2.5235700817096167,
"grad_norm": 0.7203584909439087,
"learning_rate": 6.469674830415412e-06,
"loss": 0.3168,
"step": 12045
},
{
"epoch": 2.526712759270899,
"grad_norm": 0.8977159261703491,
"learning_rate": 6.386463772524576e-06,
"loss": 0.2573,
"step": 12060
},
{
"epoch": 2.529855436832181,
"grad_norm": 1.2124725580215454,
"learning_rate": 6.303754781073395e-06,
"loss": 0.3008,
"step": 12075
},
{
"epoch": 2.5329981143934632,
"grad_norm": 0.7577414512634277,
"learning_rate": 6.2215488081838854e-06,
"loss": 0.2492,
"step": 12090
},
{
"epoch": 2.5361407919547454,
"grad_norm": 1.308779001235962,
"learning_rate": 6.139846800187493e-06,
"loss": 0.3002,
"step": 12105
},
{
"epoch": 2.5392834695160276,
"grad_norm": 1.0538486242294312,
"learning_rate": 6.058649697614149e-06,
"loss": 0.3068,
"step": 12120
},
{
"epoch": 2.5424261470773097,
"grad_norm": 1.1852937936782837,
"learning_rate": 5.9779584351814636e-06,
"loss": 0.308,
"step": 12135
},
{
"epoch": 2.5455688246385924,
"grad_norm": 0.9339080452919006,
"learning_rate": 5.897773941783935e-06,
"loss": 0.297,
"step": 12150
},
{
"epoch": 2.5487115021998745,
"grad_norm": 0.8344528079032898,
"learning_rate": 5.8180971404823205e-06,
"loss": 0.2789,
"step": 12165
},
{
"epoch": 2.5518541797611567,
"grad_norm": 1.3588929176330566,
"learning_rate": 5.738928948492966e-06,
"loss": 0.296,
"step": 12180
},
{
"epoch": 2.554996857322439,
"grad_norm": 1.0490657091140747,
"learning_rate": 5.660270277177243e-06,
"loss": 0.2864,
"step": 12195
},
{
"epoch": 2.558139534883721,
"grad_norm": 1.2904434204101562,
"learning_rate": 5.582122032031051e-06,
"loss": 0.2966,
"step": 12210
},
{
"epoch": 2.561282212445003,
"grad_norm": 0.7123144268989563,
"learning_rate": 5.5044851126744404e-06,
"loss": 0.2733,
"step": 12225
},
{
"epoch": 2.5644248900062854,
"grad_norm": 1.2593188285827637,
"learning_rate": 5.4273604128412315e-06,
"loss": 0.2873,
"step": 12240
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.9681785106658936,
"learning_rate": 5.35074882036869e-06,
"loss": 0.2596,
"step": 12255
},
{
"epoch": 2.5707102451288497,
"grad_norm": 0.944814145565033,
"learning_rate": 5.2746512171873485e-06,
"loss": 0.2871,
"step": 12270
},
{
"epoch": 2.573852922690132,
"grad_norm": 1.0654292106628418,
"learning_rate": 5.199068479310865e-06,
"loss": 0.2856,
"step": 12285
},
{
"epoch": 2.576995600251414,
"grad_norm": 1.4697771072387695,
"learning_rate": 5.12400147682589e-06,
"loss": 0.3125,
"step": 12300
},
{
"epoch": 2.5801382778126962,
"grad_norm": 1.1471614837646484,
"learning_rate": 5.0494510738820836e-06,
"loss": 0.2712,
"step": 12315
},
{
"epoch": 2.5832809553739784,
"grad_norm": 1.2926499843597412,
"learning_rate": 4.9754181286821855e-06,
"loss": 0.2721,
"step": 12330
},
{
"epoch": 2.586423632935261,
"grad_norm": 1.1065871715545654,
"learning_rate": 4.901903493472071e-06,
"loss": 0.3443,
"step": 12345
},
{
"epoch": 2.589566310496543,
"grad_norm": 1.0714068412780762,
"learning_rate": 4.8289080145309974e-06,
"loss": 0.2963,
"step": 12360
},
{
"epoch": 2.5927089880578253,
"grad_norm": 0.8245282769203186,
"learning_rate": 4.756432532161858e-06,
"loss": 0.2564,
"step": 12375
},
{
"epoch": 2.5958516656191075,
"grad_norm": 1.266921043395996,
"learning_rate": 4.684477880681492e-06,
"loss": 0.2712,
"step": 12390
},
{
"epoch": 2.5989943431803897,
"grad_norm": 1.2646595239639282,
"learning_rate": 4.613044888411067e-06,
"loss": 0.2845,
"step": 12405
},
{
"epoch": 2.602137020741672,
"grad_norm": 1.0433062314987183,
"learning_rate": 4.542134377666562e-06,
"loss": 0.309,
"step": 12420
},
{
"epoch": 2.605279698302954,
"grad_norm": 0.9236804246902466,
"learning_rate": 4.471747164749318e-06,
"loss": 0.2576,
"step": 12435
},
{
"epoch": 2.608422375864236,
"grad_norm": 0.8656274676322937,
"learning_rate": 4.401884059936618e-06,
"loss": 0.2695,
"step": 12450
},
{
"epoch": 2.611565053425519,
"grad_norm": 1.226678729057312,
"learning_rate": 4.332545867472354e-06,
"loss": 0.2993,
"step": 12465
},
{
"epoch": 2.614707730986801,
"grad_norm": 1.1997127532958984,
"learning_rate": 4.263733385557767e-06,
"loss": 0.2832,
"step": 12480
},
{
"epoch": 2.617850408548083,
"grad_norm": 1.113054871559143,
"learning_rate": 4.195447406342301e-06,
"loss": 0.2429,
"step": 12495
},
{
"epoch": 2.6209930861093653,
"grad_norm": 1.1524410247802734,
"learning_rate": 4.127688715914446e-06,
"loss": 0.3216,
"step": 12510
},
{
"epoch": 2.6241357636706475,
"grad_norm": 1.1508104801177979,
"learning_rate": 4.060458094292663e-06,
"loss": 0.2685,
"step": 12525
},
{
"epoch": 2.6272784412319297,
"grad_norm": 1.1233001947402954,
"learning_rate": 3.993756315416486e-06,
"loss": 0.2525,
"step": 12540
},
{
"epoch": 2.630421118793212,
"grad_norm": 1.041908621788025,
"learning_rate": 3.927584147137514e-06,
"loss": 0.2833,
"step": 12555
},
{
"epoch": 2.633563796354494,
"grad_norm": 1.2598505020141602,
"learning_rate": 3.8619423512106734e-06,
"loss": 0.2895,
"step": 12570
},
{
"epoch": 2.636706473915776,
"grad_norm": 1.137080430984497,
"learning_rate": 3.7968316832853456e-06,
"loss": 0.29,
"step": 12585
},
{
"epoch": 2.6398491514770583,
"grad_norm": 1.0239893198013306,
"learning_rate": 3.7322528928967703e-06,
"loss": 0.2548,
"step": 12600
},
{
"epoch": 2.6429918290383405,
"grad_norm": 0.9820106625556946,
"learning_rate": 3.668206723457329e-06,
"loss": 0.3135,
"step": 12615
},
{
"epoch": 2.6461345065996227,
"grad_norm": 0.8583505153656006,
"learning_rate": 3.604693912248025e-06,
"loss": 0.2581,
"step": 12630
},
{
"epoch": 2.649277184160905,
"grad_norm": 1.1391513347625732,
"learning_rate": 3.541715190410022e-06,
"loss": 0.2878,
"step": 12645
},
{
"epoch": 2.6524198617221875,
"grad_norm": 1.0786199569702148,
"learning_rate": 3.4792712829361917e-06,
"loss": 0.2667,
"step": 12660
},
{
"epoch": 2.6555625392834696,
"grad_norm": 0.9973167777061462,
"learning_rate": 3.4173629086627633e-06,
"loss": 0.2455,
"step": 12675
},
{
"epoch": 2.658705216844752,
"grad_norm": 0.8622914552688599,
"learning_rate": 3.355990780261059e-06,
"loss": 0.2264,
"step": 12690
},
{
"epoch": 2.661847894406034,
"grad_norm": 0.9155644774436951,
"learning_rate": 3.295155604229322e-06,
"loss": 0.3147,
"step": 12705
},
{
"epoch": 2.664990571967316,
"grad_norm": 1.313897728919983,
"learning_rate": 3.234858080884545e-06,
"loss": 0.2793,
"step": 12720
},
{
"epoch": 2.6681332495285983,
"grad_norm": 1.0417330265045166,
"learning_rate": 3.1750989043543843e-06,
"loss": 0.3048,
"step": 12735
},
{
"epoch": 2.6712759270898805,
"grad_norm": 1.175787091255188,
"learning_rate": 3.1158787625692632e-06,
"loss": 0.2897,
"step": 12750
},
{
"epoch": 2.6744186046511627,
"grad_norm": 1.1047790050506592,
"learning_rate": 3.05719833725433e-06,
"loss": 0.3,
"step": 12765
},
{
"epoch": 2.6775612822124453,
"grad_norm": 0.8376184701919556,
"learning_rate": 2.9990583039217203e-06,
"loss": 0.2654,
"step": 12780
},
{
"epoch": 2.6807039597737274,
"grad_norm": 0.6929535269737244,
"learning_rate": 2.941459331862706e-06,
"loss": 0.3012,
"step": 12795
},
{
"epoch": 2.6838466373350096,
"grad_norm": 0.832949161529541,
"learning_rate": 2.8844020841400364e-06,
"loss": 0.2765,
"step": 12810
},
{
"epoch": 2.686989314896292,
"grad_norm": 0.9470664858818054,
"learning_rate": 2.827887217580266e-06,
"loss": 0.2729,
"step": 12825
},
{
"epoch": 2.690131992457574,
"grad_norm": 0.7952046394348145,
"learning_rate": 2.771915382766238e-06,
"loss": 0.2464,
"step": 12840
},
{
"epoch": 2.693274670018856,
"grad_norm": 1.0609912872314453,
"learning_rate": 2.7164872240295458e-06,
"loss": 0.3087,
"step": 12855
},
{
"epoch": 2.6964173475801383,
"grad_norm": 0.9275609850883484,
"learning_rate": 2.6616033794431614e-06,
"loss": 0.2575,
"step": 12870
},
{
"epoch": 2.6995600251414205,
"grad_norm": 1.464107871055603,
"learning_rate": 2.607264480814059e-06,
"loss": 0.2919,
"step": 12885
},
{
"epoch": 2.7027027027027026,
"grad_norm": 1.1258777379989624,
"learning_rate": 2.5534711536759404e-06,
"loss": 0.265,
"step": 12900
},
{
"epoch": 2.705845380263985,
"grad_norm": 1.169700264930725,
"learning_rate": 2.5002240172820823e-06,
"loss": 0.2849,
"step": 12915
},
{
"epoch": 2.708988057825267,
"grad_norm": 1.3186782598495483,
"learning_rate": 2.4475236845981465e-06,
"loss": 0.2806,
"step": 12930
},
{
"epoch": 2.712130735386549,
"grad_norm": 1.4104660749435425,
"learning_rate": 2.395370762295135e-06,
"loss": 0.3004,
"step": 12945
},
{
"epoch": 2.7152734129478313,
"grad_norm": 1.2798209190368652,
"learning_rate": 2.343765850742441e-06,
"loss": 0.2887,
"step": 12960
},
{
"epoch": 2.718416090509114,
"grad_norm": 1.0648716688156128,
"learning_rate": 2.2927095440009093e-06,
"loss": 0.2842,
"step": 12975
},
{
"epoch": 2.721558768070396,
"grad_norm": 1.0158684253692627,
"learning_rate": 2.2422024298160147e-06,
"loss": 0.2977,
"step": 12990
},
{
"epoch": 2.7247014456316783,
"grad_norm": 0.6185563802719116,
"learning_rate": 2.1922450896110614e-06,
"loss": 0.2967,
"step": 13005
},
{
"epoch": 2.7278441231929604,
"grad_norm": 1.0942654609680176,
"learning_rate": 2.142838098480543e-06,
"loss": 0.277,
"step": 13020
},
{
"epoch": 2.7309868007542426,
"grad_norm": 1.0424152612686157,
"learning_rate": 2.0939820251834717e-06,
"loss": 0.2908,
"step": 13035
},
{
"epoch": 2.7341294783155248,
"grad_norm": 1.048524022102356,
"learning_rate": 2.0456774321368666e-06,
"loss": 0.3442,
"step": 13050
},
{
"epoch": 2.737272155876807,
"grad_norm": 0.8081900477409363,
"learning_rate": 1.9979248754092517e-06,
"loss": 0.2707,
"step": 13065
},
{
"epoch": 2.740414833438089,
"grad_norm": 1.3440662622451782,
"learning_rate": 1.950724904714285e-06,
"loss": 0.3337,
"step": 13080
},
{
"epoch": 2.7435575109993717,
"grad_norm": 0.9911431670188904,
"learning_rate": 1.904078063404391e-06,
"loss": 0.2852,
"step": 13095
},
{
"epoch": 2.746700188560654,
"grad_norm": 1.150423526763916,
"learning_rate": 1.8579848884645534e-06,
"loss": 0.2571,
"step": 13110
},
{
"epoch": 2.749842866121936,
"grad_norm": 1.1156803369522095,
"learning_rate": 1.8124459105060942e-06,
"loss": 0.2896,
"step": 13125
},
{
"epoch": 2.7529855436832182,
"grad_norm": 1.040390133857727,
"learning_rate": 1.767461653760588e-06,
"loss": 0.278,
"step": 13140
},
{
"epoch": 2.7561282212445004,
"grad_norm": 1.0304458141326904,
"learning_rate": 1.723032636073807e-06,
"loss": 0.2613,
"step": 13155
},
{
"epoch": 2.7592708988057826,
"grad_norm": 1.1717437505722046,
"learning_rate": 1.679159368899763e-06,
"loss": 0.3064,
"step": 13170
},
{
"epoch": 2.7624135763670647,
"grad_norm": 0.9141078591346741,
"learning_rate": 1.63584235729487e-06,
"loss": 0.2837,
"step": 13185
},
{
"epoch": 2.765556253928347,
"grad_norm": 1.1188409328460693,
"learning_rate": 1.593082099912052e-06,
"loss": 0.2932,
"step": 13200
},
{
"epoch": 2.768698931489629,
"grad_norm": 1.0684481859207153,
"learning_rate": 1.5508790889950441e-06,
"loss": 0.267,
"step": 13215
},
{
"epoch": 2.7718416090509113,
"grad_norm": 0.976677417755127,
"learning_rate": 1.5092338103727344e-06,
"loss": 0.2897,
"step": 13230
},
{
"epoch": 2.7749842866121934,
"grad_norm": 1.081978678703308,
"learning_rate": 1.4681467434535356e-06,
"loss": 0.2592,
"step": 13245
},
{
"epoch": 2.7781269641734756,
"grad_norm": 1.090117335319519,
"learning_rate": 1.4276183612199178e-06,
"loss": 0.2923,
"step": 13260
},
{
"epoch": 2.7812696417347578,
"grad_norm": 1.1117249727249146,
"learning_rate": 1.3876491302229011e-06,
"loss": 0.2701,
"step": 13275
},
{
"epoch": 2.7844123192960404,
"grad_norm": 1.4228675365447998,
"learning_rate": 1.3482395105767543e-06,
"loss": 0.3066,
"step": 13290
},
{
"epoch": 2.7875549968573226,
"grad_norm": 0.9276790618896484,
"learning_rate": 1.3093899559536272e-06,
"loss": 0.2437,
"step": 13305
},
{
"epoch": 2.7906976744186047,
"grad_norm": 1.1724159717559814,
"learning_rate": 1.2711009135783825e-06,
"loss": 0.3051,
"step": 13320
},
{
"epoch": 2.793840351979887,
"grad_norm": 0.9188593029975891,
"learning_rate": 1.2333728242234333e-06,
"loss": 0.3214,
"step": 13335
},
{
"epoch": 2.796983029541169,
"grad_norm": 1.084934949874878,
"learning_rate": 1.196206122203647e-06,
"loss": 0.2653,
"step": 13350
},
{
"epoch": 2.8001257071024512,
"grad_norm": 1.041142225265503,
"learning_rate": 1.1596012353713604e-06,
"loss": 0.2879,
"step": 13365
},
{
"epoch": 2.8032683846637334,
"grad_norm": 1.026824951171875,
"learning_rate": 1.1235585851114726e-06,
"loss": 0.3006,
"step": 13380
},
{
"epoch": 2.8064110622250156,
"grad_norm": 1.143835425376892,
"learning_rate": 1.0880785863365718e-06,
"loss": 0.305,
"step": 13395
},
{
"epoch": 2.809553739786298,
"grad_norm": 0.5169873833656311,
"learning_rate": 1.0531616474821649e-06,
"loss": 0.2878,
"step": 13410
},
{
"epoch": 2.8126964173475804,
"grad_norm": 1.1536767482757568,
"learning_rate": 1.0188081705019558e-06,
"loss": 0.2877,
"step": 13425
},
{
"epoch": 2.8158390949088625,
"grad_norm": 0.9985389113426208,
"learning_rate": 9.850185508632704e-07,
"loss": 0.3113,
"step": 13440
},
{
"epoch": 2.8189817724701447,
"grad_norm": 0.9148264527320862,
"learning_rate": 9.517931775424593e-07,
"loss": 0.3117,
"step": 13455
},
{
"epoch": 2.822124450031427,
"grad_norm": 1.1424579620361328,
"learning_rate": 9.191324330204199e-07,
"loss": 0.2721,
"step": 13470
},
{
"epoch": 2.825267127592709,
"grad_norm": 1.054230809211731,
"learning_rate": 8.870366932782093e-07,
"loss": 0.303,
"step": 13485
},
{
"epoch": 2.828409805153991,
"grad_norm": 1.211416482925415,
"learning_rate": 8.555063277927378e-07,
"loss": 0.2932,
"step": 13500
},
{
"epoch": 2.8315524827152734,
"grad_norm": 1.4953478574752808,
"learning_rate": 8.24541699532455e-07,
"loss": 0.3246,
"step": 13515
},
{
"epoch": 2.8346951602765555,
"grad_norm": 0.773501455783844,
"learning_rate": 7.94143164953226e-07,
"loss": 0.2777,
"step": 13530
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.6173717379570007,
"learning_rate": 7.643110739942172e-07,
"loss": 0.3181,
"step": 13545
},
{
"epoch": 2.84098051539912,
"grad_norm": 1.1255333423614502,
"learning_rate": 7.350457700738389e-07,
"loss": 0.2954,
"step": 13560
},
{
"epoch": 2.844123192960402,
"grad_norm": 1.1932814121246338,
"learning_rate": 7.063475900858263e-07,
"loss": 0.314,
"step": 13575
},
{
"epoch": 2.8472658705216842,
"grad_norm": 1.5271681547164917,
"learning_rate": 6.782168643953312e-07,
"loss": 0.3197,
"step": 13590
},
{
"epoch": 2.850408548082967,
"grad_norm": 0.9488076567649841,
"learning_rate": 6.506539168351699e-07,
"loss": 0.2993,
"step": 13605
},
{
"epoch": 2.853551225644249,
"grad_norm": 1.015404462814331,
"learning_rate": 6.236590647020202e-07,
"loss": 0.2831,
"step": 13620
},
{
"epoch": 2.856693903205531,
"grad_norm": 0.6510112881660461,
"learning_rate": 5.972326187528299e-07,
"loss": 0.2806,
"step": 13635
},
{
"epoch": 2.8598365807668134,
"grad_norm": 1.1119881868362427,
"learning_rate": 5.7137488320122e-07,
"loss": 0.2625,
"step": 13650
},
{
"epoch": 2.8629792583280955,
"grad_norm": 1.0891669988632202,
"learning_rate": 5.460861557139818e-07,
"loss": 0.2913,
"step": 13665
},
{
"epoch": 2.8661219358893777,
"grad_norm": 1.3575654029846191,
"learning_rate": 5.213667274076461e-07,
"loss": 0.3209,
"step": 13680
},
{
"epoch": 2.86926461345066,
"grad_norm": 0.7372342944145203,
"learning_rate": 4.972168828451251e-07,
"loss": 0.2798,
"step": 13695
},
{
"epoch": 2.872407291011942,
"grad_norm": 1.258745551109314,
"learning_rate": 4.736369000324703e-07,
"loss": 0.3125,
"step": 13710
},
{
"epoch": 2.8755499685732246,
"grad_norm": 0.7658424973487854,
"learning_rate": 4.506270504156307e-07,
"loss": 0.2501,
"step": 13725
},
{
"epoch": 2.878692646134507,
"grad_norm": 1.225644826889038,
"learning_rate": 4.281875988773554e-07,
"loss": 0.2975,
"step": 13740
},
{
"epoch": 2.881835323695789,
"grad_norm": 1.0335606336593628,
"learning_rate": 4.063188037341348e-07,
"loss": 0.2852,
"step": 13755
},
{
"epoch": 2.884978001257071,
"grad_norm": 0.8567134737968445,
"learning_rate": 3.8502091673322526e-07,
"loss": 0.2584,
"step": 13770
},
{
"epoch": 2.8881206788183533,
"grad_norm": 0.8661710023880005,
"learning_rate": 3.642941830497515e-07,
"loss": 0.3128,
"step": 13785
},
{
"epoch": 2.8912633563796355,
"grad_norm": 1.1629458665847778,
"learning_rate": 3.441388412838864e-07,
"loss": 0.2919,
"step": 13800
},
{
"epoch": 2.8944060339409177,
"grad_norm": 0.9116327166557312,
"learning_rate": 3.2455512345811457e-07,
"loss": 0.2464,
"step": 13815
},
{
"epoch": 2.8975487115022,
"grad_norm": 0.8351930975914001,
"learning_rate": 3.055432550145398e-07,
"loss": 0.3138,
"step": 13830
},
{
"epoch": 2.900691389063482,
"grad_norm": 0.8611274361610413,
"learning_rate": 2.871034548122986e-07,
"loss": 0.2675,
"step": 13845
},
{
"epoch": 2.903834066624764,
"grad_norm": 1.021216630935669,
"learning_rate": 2.692359351250506e-07,
"loss": 0.2545,
"step": 13860
},
{
"epoch": 2.9069767441860463,
"grad_norm": 0.9064350128173828,
"learning_rate": 2.5194090163853103e-07,
"loss": 0.2813,
"step": 13875
},
{
"epoch": 2.9101194217473285,
"grad_norm": 0.7603162527084351,
"learning_rate": 2.3521855344816323e-07,
"loss": 0.2837,
"step": 13890
},
{
"epoch": 2.9132620993086107,
"grad_norm": 1.0929245948791504,
"learning_rate": 2.1906908305679986e-07,
"loss": 0.3017,
"step": 13905
},
{
"epoch": 2.9164047768698933,
"grad_norm": 1.078133225440979,
"learning_rate": 2.0349267637247982e-07,
"loss": 0.2812,
"step": 13920
},
{
"epoch": 2.9195474544311755,
"grad_norm": 0.6622474789619446,
"learning_rate": 1.8848951270630244e-07,
"loss": 0.2775,
"step": 13935
},
{
"epoch": 2.9226901319924576,
"grad_norm": 0.8766260147094727,
"learning_rate": 1.7405976477035124e-07,
"loss": 0.2694,
"step": 13950
},
{
"epoch": 2.92583280955374,
"grad_norm": 1.1658825874328613,
"learning_rate": 1.6020359867572333e-07,
"loss": 0.2946,
"step": 13965
},
{
"epoch": 2.928975487115022,
"grad_norm": 1.0801419019699097,
"learning_rate": 1.469211739306031e-07,
"loss": 0.3458,
"step": 13980
},
{
"epoch": 2.932118164676304,
"grad_norm": 1.0484652519226074,
"learning_rate": 1.3421264343843054e-07,
"loss": 0.3075,
"step": 13995
},
{
"epoch": 2.9352608422375863,
"grad_norm": 1.168779730796814,
"learning_rate": 1.2207815349614128e-07,
"loss": 0.2848,
"step": 14010
},
{
"epoch": 2.9384035197988685,
"grad_norm": 0.684332013130188,
"learning_rate": 1.105178437924792e-07,
"loss": 0.2766,
"step": 14025
},
{
"epoch": 2.941546197360151,
"grad_norm": 0.8341169953346252,
"learning_rate": 9.953184740639222e-08,
"loss": 0.2733,
"step": 14040
},
{
"epoch": 2.9446888749214333,
"grad_norm": 0.9254804849624634,
"learning_rate": 8.91202908055e-08,
"loss": 0.2501,
"step": 14055
},
{
"epoch": 2.9478315524827154,
"grad_norm": 1.1126680374145508,
"learning_rate": 7.928329384463418e-08,
"loss": 0.3106,
"step": 14070
},
{
"epoch": 2.9509742300439976,
"grad_norm": 0.7974324226379395,
"learning_rate": 7.002096976446715e-08,
"loss": 0.2767,
"step": 14085
},
{
"epoch": 2.95411690760528,
"grad_norm": 1.1926546096801758,
"learning_rate": 6.133342519020202e-08,
"loss": 0.3035,
"step": 14100
},
{
"epoch": 2.957259585166562,
"grad_norm": 1.0937379598617554,
"learning_rate": 5.322076013034027e-08,
"loss": 0.2464,
"step": 14115
},
{
"epoch": 2.960402262727844,
"grad_norm": 0.7003195285797119,
"learning_rate": 4.568306797554378e-08,
"loss": 0.2879,
"step": 14130
},
{
"epoch": 2.9635449402891263,
"grad_norm": 1.247180461883545,
"learning_rate": 3.872043549754678e-08,
"loss": 0.2745,
"step": 14145
},
{
"epoch": 2.9666876178504085,
"grad_norm": 1.021088719367981,
"learning_rate": 3.233294284816224e-08,
"loss": 0.2661,
"step": 14160
},
{
"epoch": 2.9698302954116906,
"grad_norm": 1.149630069732666,
"learning_rate": 2.652066355836591e-08,
"loss": 0.3032,
"step": 14175
},
{
"epoch": 2.972972972972973,
"grad_norm": 1.1792501211166382,
"learning_rate": 2.128366453743591e-08,
"loss": 0.2652,
"step": 14190
},
{
"epoch": 2.976115650534255,
"grad_norm": 1.1864453554153442,
"learning_rate": 1.662200607219777e-08,
"loss": 0.2712,
"step": 14205
},
{
"epoch": 2.979258328095537,
"grad_norm": 1.6070250272750854,
"learning_rate": 1.2535741826313897e-08,
"loss": 0.2848,
"step": 14220
},
{
"epoch": 2.9824010056568198,
"grad_norm": 0.9383937120437622,
"learning_rate": 9.024918839678486e-09,
"loss": 0.2689,
"step": 14235
},
{
"epoch": 2.985543683218102,
"grad_norm": 0.9039358496665955,
"learning_rate": 6.089577527873535e-09,
"loss": 0.2109,
"step": 14250
},
{
"epoch": 2.988686360779384,
"grad_norm": 0.8809177279472351,
"learning_rate": 3.729751681702531e-09,
"loss": 0.2992,
"step": 14265
},
{
"epoch": 2.9918290383406663,
"grad_norm": 1.0034148693084717,
"learning_rate": 1.94546846679633e-09,
"loss": 0.2721,
"step": 14280
},
{
"epoch": 2.9949717159019484,
"grad_norm": 0.8567355871200562,
"learning_rate": 7.367484233133937e-10,
"loss": 0.2449,
"step": 14295
},
{
"epoch": 2.9981143934632306,
"grad_norm": 1.462417721748352,
"learning_rate": 1.0360546568444207e-10,
"loss": 0.2634,
"step": 14310
},
{
"epoch": 3.0,
"step": 14319,
"total_flos": 2.7323814530514944e+18,
"train_loss": 0.39260489724686265,
"train_runtime": 12699.4551,
"train_samples_per_second": 4.51,
"train_steps_per_second": 1.128
}
],
"logging_steps": 15,
"max_steps": 14319,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7323814530514944e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}