ft_video_phy-20k / trainer_state.json
seungkukim's picture
Add files using upload-large-folder tool
c3d471b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8504847763225039,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00042524238816125194,
"grad_norm": 3.2010223865509033,
"learning_rate": 9e-08,
"loss": 1.3874,
"step": 10
},
{
"epoch": 0.0008504847763225039,
"grad_norm": 1.5439364910125732,
"learning_rate": 1.9e-07,
"loss": 1.3901,
"step": 20
},
{
"epoch": 0.0012757271644837558,
"grad_norm": 1.5690912008285522,
"learning_rate": 2.9000000000000003e-07,
"loss": 1.3805,
"step": 30
},
{
"epoch": 0.0017009695526450078,
"grad_norm": 3.358100175857544,
"learning_rate": 3.8999999999999997e-07,
"loss": 1.3746,
"step": 40
},
{
"epoch": 0.0021262119408062595,
"grad_norm": 1.5554291009902954,
"learning_rate": 4.9e-07,
"loss": 1.3712,
"step": 50
},
{
"epoch": 0.0025514543289675115,
"grad_norm": 1.6682578325271606,
"learning_rate": 5.9e-07,
"loss": 1.3693,
"step": 60
},
{
"epoch": 0.0029766967171287635,
"grad_norm": 2.5594966411590576,
"learning_rate": 6.9e-07,
"loss": 1.3535,
"step": 70
},
{
"epoch": 0.0034019391052900155,
"grad_norm": 1.2641184329986572,
"learning_rate": 7.900000000000001e-07,
"loss": 1.3506,
"step": 80
},
{
"epoch": 0.003827181493451267,
"grad_norm": 1.0074024200439453,
"learning_rate": 8.900000000000001e-07,
"loss": 1.3411,
"step": 90
},
{
"epoch": 0.004252423881612519,
"grad_norm": 2.079498291015625,
"learning_rate": 9.9e-07,
"loss": 1.3332,
"step": 100
},
{
"epoch": 0.004677666269773771,
"grad_norm": 1.5527578592300415,
"learning_rate": 1.0900000000000002e-06,
"loss": 1.3341,
"step": 110
},
{
"epoch": 0.005102908657935023,
"grad_norm": 0.8216768503189087,
"learning_rate": 1.19e-06,
"loss": 1.3179,
"step": 120
},
{
"epoch": 0.005528151046096275,
"grad_norm": 0.7347335815429688,
"learning_rate": 1.29e-06,
"loss": 1.3076,
"step": 130
},
{
"epoch": 0.005953393434257527,
"grad_norm": 0.6095930933952332,
"learning_rate": 1.39e-06,
"loss": 1.3021,
"step": 140
},
{
"epoch": 0.006378635822418779,
"grad_norm": 0.7857323288917542,
"learning_rate": 1.49e-06,
"loss": 1.3051,
"step": 150
},
{
"epoch": 0.006803878210580031,
"grad_norm": 0.728410542011261,
"learning_rate": 1.59e-06,
"loss": 1.3004,
"step": 160
},
{
"epoch": 0.007229120598741283,
"grad_norm": 1.132466197013855,
"learning_rate": 1.69e-06,
"loss": 1.2892,
"step": 170
},
{
"epoch": 0.007654362986902534,
"grad_norm": 1.3443944454193115,
"learning_rate": 1.79e-06,
"loss": 1.2835,
"step": 180
},
{
"epoch": 0.008079605375063786,
"grad_norm": 0.6804354786872864,
"learning_rate": 1.8900000000000001e-06,
"loss": 1.2874,
"step": 190
},
{
"epoch": 0.008504847763225038,
"grad_norm": 0.4348722994327545,
"learning_rate": 1.99e-06,
"loss": 1.2857,
"step": 200
},
{
"epoch": 0.00893009015138629,
"grad_norm": 0.6353126168251038,
"learning_rate": 2.09e-06,
"loss": 1.2806,
"step": 210
},
{
"epoch": 0.009355332539547541,
"grad_norm": 0.5617169737815857,
"learning_rate": 2.1899999999999998e-06,
"loss": 1.279,
"step": 220
},
{
"epoch": 0.009780574927708794,
"grad_norm": 0.665167510509491,
"learning_rate": 2.29e-06,
"loss": 1.2731,
"step": 230
},
{
"epoch": 0.010205817315870046,
"grad_norm": 0.4070955514907837,
"learning_rate": 2.39e-06,
"loss": 1.271,
"step": 240
},
{
"epoch": 0.010631059704031299,
"grad_norm": 0.6760728359222412,
"learning_rate": 2.4900000000000003e-06,
"loss": 1.265,
"step": 250
},
{
"epoch": 0.01105630209219255,
"grad_norm": 0.4963316023349762,
"learning_rate": 2.59e-06,
"loss": 1.2628,
"step": 260
},
{
"epoch": 0.011481544480353802,
"grad_norm": 0.42656829953193665,
"learning_rate": 2.69e-06,
"loss": 1.2602,
"step": 270
},
{
"epoch": 0.011906786868515054,
"grad_norm": 0.3536563813686371,
"learning_rate": 2.79e-06,
"loss": 1.2481,
"step": 280
},
{
"epoch": 0.012332029256676305,
"grad_norm": 0.4079868793487549,
"learning_rate": 2.8900000000000003e-06,
"loss": 1.2361,
"step": 290
},
{
"epoch": 0.012757271644837557,
"grad_norm": 0.33841463923454285,
"learning_rate": 2.99e-06,
"loss": 1.2329,
"step": 300
},
{
"epoch": 0.01318251403299881,
"grad_norm": 0.3516484797000885,
"learning_rate": 3.09e-06,
"loss": 1.2188,
"step": 310
},
{
"epoch": 0.013607756421160062,
"grad_norm": 0.296055406332016,
"learning_rate": 3.19e-06,
"loss": 1.2077,
"step": 320
},
{
"epoch": 0.014032998809321313,
"grad_norm": 0.2963598966598511,
"learning_rate": 3.29e-06,
"loss": 1.1939,
"step": 330
},
{
"epoch": 0.014458241197482565,
"grad_norm": 0.3225858211517334,
"learning_rate": 3.39e-06,
"loss": 1.181,
"step": 340
},
{
"epoch": 0.014883483585643818,
"grad_norm": 0.2994067072868347,
"learning_rate": 3.49e-06,
"loss": 1.1655,
"step": 350
},
{
"epoch": 0.015308725973805068,
"grad_norm": 0.3013548254966736,
"learning_rate": 3.5900000000000004e-06,
"loss": 1.1578,
"step": 360
},
{
"epoch": 0.015733968361966322,
"grad_norm": 0.4306448698043823,
"learning_rate": 3.6900000000000002e-06,
"loss": 1.1496,
"step": 370
},
{
"epoch": 0.01615921075012757,
"grad_norm": 0.33329278230667114,
"learning_rate": 3.7899999999999997e-06,
"loss": 1.1427,
"step": 380
},
{
"epoch": 0.016584453138288824,
"grad_norm": 0.30759066343307495,
"learning_rate": 3.890000000000001e-06,
"loss": 1.1413,
"step": 390
},
{
"epoch": 0.017009695526450076,
"grad_norm": 0.28163671493530273,
"learning_rate": 3.99e-06,
"loss": 1.1257,
"step": 400
},
{
"epoch": 0.01743493791461133,
"grad_norm": 0.3048485517501831,
"learning_rate": 4.09e-06,
"loss": 1.1293,
"step": 410
},
{
"epoch": 0.01786018030277258,
"grad_norm": 0.3524611294269562,
"learning_rate": 4.19e-06,
"loss": 1.1192,
"step": 420
},
{
"epoch": 0.018285422690933834,
"grad_norm": 0.33914950489997864,
"learning_rate": 4.29e-06,
"loss": 1.1145,
"step": 430
},
{
"epoch": 0.018710665079095083,
"grad_norm": 0.33718347549438477,
"learning_rate": 4.39e-06,
"loss": 1.1122,
"step": 440
},
{
"epoch": 0.019135907467256335,
"grad_norm": 0.3634999096393585,
"learning_rate": 4.49e-06,
"loss": 1.1153,
"step": 450
},
{
"epoch": 0.019561149855417587,
"grad_norm": 0.43056294322013855,
"learning_rate": 4.59e-06,
"loss": 1.1115,
"step": 460
},
{
"epoch": 0.01998639224357884,
"grad_norm": 0.3170914351940155,
"learning_rate": 4.69e-06,
"loss": 1.1015,
"step": 470
},
{
"epoch": 0.020411634631740092,
"grad_norm": 0.4195287823677063,
"learning_rate": 4.790000000000001e-06,
"loss": 1.0997,
"step": 480
},
{
"epoch": 0.020836877019901345,
"grad_norm": 0.3294726610183716,
"learning_rate": 4.890000000000001e-06,
"loss": 1.0998,
"step": 490
},
{
"epoch": 0.021262119408062597,
"grad_norm": 0.312850683927536,
"learning_rate": 4.9900000000000005e-06,
"loss": 1.094,
"step": 500
},
{
"epoch": 0.021687361796223846,
"grad_norm": 0.3543089032173157,
"learning_rate": 5.0899999999999995e-06,
"loss": 1.0897,
"step": 510
},
{
"epoch": 0.0221126041843851,
"grad_norm": 0.39667844772338867,
"learning_rate": 5.1899999999999994e-06,
"loss": 1.0863,
"step": 520
},
{
"epoch": 0.02253784657254635,
"grad_norm": 0.44174888730049133,
"learning_rate": 5.29e-06,
"loss": 1.0799,
"step": 530
},
{
"epoch": 0.022963088960707603,
"grad_norm": 0.37127602100372314,
"learning_rate": 5.39e-06,
"loss": 1.0814,
"step": 540
},
{
"epoch": 0.023388331348868856,
"grad_norm": 0.35916680097579956,
"learning_rate": 5.49e-06,
"loss": 1.079,
"step": 550
},
{
"epoch": 0.023813573737030108,
"grad_norm": 0.41336843371391296,
"learning_rate": 5.59e-06,
"loss": 1.0728,
"step": 560
},
{
"epoch": 0.02423881612519136,
"grad_norm": 0.5130095481872559,
"learning_rate": 5.690000000000001e-06,
"loss": 1.0743,
"step": 570
},
{
"epoch": 0.02466405851335261,
"grad_norm": 0.2982211410999298,
"learning_rate": 5.7900000000000005e-06,
"loss": 1.0703,
"step": 580
},
{
"epoch": 0.025089300901513862,
"grad_norm": 0.3798081874847412,
"learning_rate": 5.89e-06,
"loss": 1.0737,
"step": 590
},
{
"epoch": 0.025514543289675114,
"grad_norm": 0.4531615972518921,
"learning_rate": 5.99e-06,
"loss": 1.0643,
"step": 600
},
{
"epoch": 0.025939785677836367,
"grad_norm": 0.37526369094848633,
"learning_rate": 6.090000000000001e-06,
"loss": 1.0645,
"step": 610
},
{
"epoch": 0.02636502806599762,
"grad_norm": 0.4011104106903076,
"learning_rate": 6.19e-06,
"loss": 1.0591,
"step": 620
},
{
"epoch": 0.02679027045415887,
"grad_norm": 0.41057097911834717,
"learning_rate": 6.29e-06,
"loss": 1.0564,
"step": 630
},
{
"epoch": 0.027215512842320124,
"grad_norm": 0.5642093420028687,
"learning_rate": 6.39e-06,
"loss": 1.0522,
"step": 640
},
{
"epoch": 0.027640755230481373,
"grad_norm": 0.6502516269683838,
"learning_rate": 6.49e-06,
"loss": 1.0454,
"step": 650
},
{
"epoch": 0.028065997618642625,
"grad_norm": 0.4717367887496948,
"learning_rate": 6.5900000000000004e-06,
"loss": 1.0386,
"step": 660
},
{
"epoch": 0.028491240006803878,
"grad_norm": 0.6143516898155212,
"learning_rate": 6.69e-06,
"loss": 1.0228,
"step": 670
},
{
"epoch": 0.02891648239496513,
"grad_norm": 0.46155494451522827,
"learning_rate": 6.79e-06,
"loss": 1.0089,
"step": 680
},
{
"epoch": 0.029341724783126383,
"grad_norm": 0.7181910276412964,
"learning_rate": 6.89e-06,
"loss": 0.9919,
"step": 690
},
{
"epoch": 0.029766967171287635,
"grad_norm": 0.49455365538597107,
"learning_rate": 6.990000000000001e-06,
"loss": 0.9628,
"step": 700
},
{
"epoch": 0.030192209559448884,
"grad_norm": 0.5362220406532288,
"learning_rate": 7.090000000000001e-06,
"loss": 0.9337,
"step": 710
},
{
"epoch": 0.030617451947610137,
"grad_norm": 0.5086848139762878,
"learning_rate": 7.19e-06,
"loss": 0.9057,
"step": 720
},
{
"epoch": 0.03104269433577139,
"grad_norm": 0.5631227493286133,
"learning_rate": 7.29e-06,
"loss": 0.8713,
"step": 730
},
{
"epoch": 0.031467936723932645,
"grad_norm": 0.6062225699424744,
"learning_rate": 7.3899999999999995e-06,
"loss": 0.838,
"step": 740
},
{
"epoch": 0.03189317911209389,
"grad_norm": 0.7424901127815247,
"learning_rate": 7.49e-06,
"loss": 0.7954,
"step": 750
},
{
"epoch": 0.03231842150025514,
"grad_norm": 0.8033110499382019,
"learning_rate": 7.59e-06,
"loss": 0.754,
"step": 760
},
{
"epoch": 0.032743663888416395,
"grad_norm": 0.9407315850257874,
"learning_rate": 7.690000000000001e-06,
"loss": 0.7209,
"step": 770
},
{
"epoch": 0.03316890627657765,
"grad_norm": 0.8432111144065857,
"learning_rate": 7.79e-06,
"loss": 0.6719,
"step": 780
},
{
"epoch": 0.0335941486647389,
"grad_norm": 0.8872693181037903,
"learning_rate": 7.89e-06,
"loss": 0.6367,
"step": 790
},
{
"epoch": 0.03401939105290015,
"grad_norm": 0.8927829265594482,
"learning_rate": 7.99e-06,
"loss": 0.6017,
"step": 800
},
{
"epoch": 0.034444633441061405,
"grad_norm": 1.2115992307662964,
"learning_rate": 8.09e-06,
"loss": 0.5578,
"step": 810
},
{
"epoch": 0.03486987582922266,
"grad_norm": 1.2473925352096558,
"learning_rate": 8.190000000000001e-06,
"loss": 0.5364,
"step": 820
},
{
"epoch": 0.03529511821738391,
"grad_norm": 1.0221428871154785,
"learning_rate": 8.29e-06,
"loss": 0.5031,
"step": 830
},
{
"epoch": 0.03572036060554516,
"grad_norm": 1.24955415725708,
"learning_rate": 8.390000000000001e-06,
"loss": 0.4787,
"step": 840
},
{
"epoch": 0.036145602993706415,
"grad_norm": 1.1550233364105225,
"learning_rate": 8.49e-06,
"loss": 0.4508,
"step": 850
},
{
"epoch": 0.03657084538186767,
"grad_norm": 1.9099974632263184,
"learning_rate": 8.59e-06,
"loss": 0.4327,
"step": 860
},
{
"epoch": 0.03699608777002892,
"grad_norm": 1.241186499595642,
"learning_rate": 8.690000000000002e-06,
"loss": 0.411,
"step": 870
},
{
"epoch": 0.037421330158190165,
"grad_norm": 1.373657464981079,
"learning_rate": 8.79e-06,
"loss": 0.3953,
"step": 880
},
{
"epoch": 0.03784657254635142,
"grad_norm": 1.6695680618286133,
"learning_rate": 8.89e-06,
"loss": 0.374,
"step": 890
},
{
"epoch": 0.03827181493451267,
"grad_norm": 1.3347766399383545,
"learning_rate": 8.99e-06,
"loss": 0.3563,
"step": 900
},
{
"epoch": 0.03869705732267392,
"grad_norm": 1.2124155759811401,
"learning_rate": 9.09e-06,
"loss": 0.3237,
"step": 910
},
{
"epoch": 0.039122299710835175,
"grad_norm": 1.1073696613311768,
"learning_rate": 9.19e-06,
"loss": 0.2997,
"step": 920
},
{
"epoch": 0.03954754209899643,
"grad_norm": 1.322092056274414,
"learning_rate": 9.289999999999999e-06,
"loss": 0.2878,
"step": 930
},
{
"epoch": 0.03997278448715768,
"grad_norm": 1.1476775407791138,
"learning_rate": 9.39e-06,
"loss": 0.263,
"step": 940
},
{
"epoch": 0.04039802687531893,
"grad_norm": 1.3284542560577393,
"learning_rate": 9.49e-06,
"loss": 0.2456,
"step": 950
},
{
"epoch": 0.040823269263480184,
"grad_norm": 1.3168091773986816,
"learning_rate": 9.59e-06,
"loss": 0.2359,
"step": 960
},
{
"epoch": 0.04124851165164144,
"grad_norm": 1.368938684463501,
"learning_rate": 9.69e-06,
"loss": 0.2278,
"step": 970
},
{
"epoch": 0.04167375403980269,
"grad_norm": 1.097208023071289,
"learning_rate": 9.79e-06,
"loss": 0.2119,
"step": 980
},
{
"epoch": 0.04209899642796394,
"grad_norm": 1.1645355224609375,
"learning_rate": 9.89e-06,
"loss": 0.2169,
"step": 990
},
{
"epoch": 0.042524238816125194,
"grad_norm": 1.1648592948913574,
"learning_rate": 9.990000000000001e-06,
"loss": 0.1849,
"step": 1000
},
{
"epoch": 0.04294948120428645,
"grad_norm": 1.443503975868225,
"learning_rate": 1.009e-05,
"loss": 0.1788,
"step": 1010
},
{
"epoch": 0.04337472359244769,
"grad_norm": 1.0813487768173218,
"learning_rate": 1.019e-05,
"loss": 0.1847,
"step": 1020
},
{
"epoch": 0.043799965980608944,
"grad_norm": 0.81873619556427,
"learning_rate": 1.0290000000000001e-05,
"loss": 0.1687,
"step": 1030
},
{
"epoch": 0.0442252083687702,
"grad_norm": 1.162904143333435,
"learning_rate": 1.039e-05,
"loss": 0.1576,
"step": 1040
},
{
"epoch": 0.04465045075693145,
"grad_norm": 1.271335244178772,
"learning_rate": 1.0490000000000001e-05,
"loss": 0.1553,
"step": 1050
},
{
"epoch": 0.0450756931450927,
"grad_norm": 1.254536509513855,
"learning_rate": 1.059e-05,
"loss": 0.15,
"step": 1060
},
{
"epoch": 0.045500935533253954,
"grad_norm": 1.403584599494934,
"learning_rate": 1.0690000000000001e-05,
"loss": 0.1537,
"step": 1070
},
{
"epoch": 0.04592617792141521,
"grad_norm": 1.571937918663025,
"learning_rate": 1.0790000000000002e-05,
"loss": 0.152,
"step": 1080
},
{
"epoch": 0.04635142030957646,
"grad_norm": 1.3833575248718262,
"learning_rate": 1.089e-05,
"loss": 0.144,
"step": 1090
},
{
"epoch": 0.04677666269773771,
"grad_norm": 1.6977688074111938,
"learning_rate": 1.099e-05,
"loss": 0.1393,
"step": 1100
},
{
"epoch": 0.047201905085898964,
"grad_norm": 1.1160330772399902,
"learning_rate": 1.1089999999999999e-05,
"loss": 0.1333,
"step": 1110
},
{
"epoch": 0.047627147474060216,
"grad_norm": 1.1706668138504028,
"learning_rate": 1.119e-05,
"loss": 0.136,
"step": 1120
},
{
"epoch": 0.04805238986222147,
"grad_norm": 1.2051717042922974,
"learning_rate": 1.129e-05,
"loss": 0.1402,
"step": 1130
},
{
"epoch": 0.04847763225038272,
"grad_norm": 1.4838896989822388,
"learning_rate": 1.139e-05,
"loss": 0.134,
"step": 1140
},
{
"epoch": 0.04890287463854397,
"grad_norm": 1.3389332294464111,
"learning_rate": 1.149e-05,
"loss": 0.1264,
"step": 1150
},
{
"epoch": 0.04932811702670522,
"grad_norm": 1.3594571352005005,
"learning_rate": 1.1589999999999999e-05,
"loss": 0.1326,
"step": 1160
},
{
"epoch": 0.04975335941486647,
"grad_norm": 1.2801322937011719,
"learning_rate": 1.169e-05,
"loss": 0.1312,
"step": 1170
},
{
"epoch": 0.050178601803027724,
"grad_norm": 1.1950969696044922,
"learning_rate": 1.179e-05,
"loss": 0.1254,
"step": 1180
},
{
"epoch": 0.050603844191188976,
"grad_norm": 1.2749476432800293,
"learning_rate": 1.189e-05,
"loss": 0.1225,
"step": 1190
},
{
"epoch": 0.05102908657935023,
"grad_norm": 1.4519540071487427,
"learning_rate": 1.199e-05,
"loss": 0.1312,
"step": 1200
},
{
"epoch": 0.05145432896751148,
"grad_norm": 1.6330214738845825,
"learning_rate": 1.2090000000000001e-05,
"loss": 0.1213,
"step": 1210
},
{
"epoch": 0.051879571355672734,
"grad_norm": 1.9503471851348877,
"learning_rate": 1.219e-05,
"loss": 0.1228,
"step": 1220
},
{
"epoch": 0.052304813743833986,
"grad_norm": 1.6446688175201416,
"learning_rate": 1.2290000000000001e-05,
"loss": 0.1251,
"step": 1230
},
{
"epoch": 0.05273005613199524,
"grad_norm": 1.630383014678955,
"learning_rate": 1.239e-05,
"loss": 0.1186,
"step": 1240
},
{
"epoch": 0.05315529852015649,
"grad_norm": 1.3865199089050293,
"learning_rate": 1.249e-05,
"loss": 0.1105,
"step": 1250
},
{
"epoch": 0.05358054090831774,
"grad_norm": 1.2496237754821777,
"learning_rate": 1.2590000000000001e-05,
"loss": 0.114,
"step": 1260
},
{
"epoch": 0.054005783296478996,
"grad_norm": 1.639675259590149,
"learning_rate": 1.269e-05,
"loss": 0.1152,
"step": 1270
},
{
"epoch": 0.05443102568464025,
"grad_norm": 1.650262475013733,
"learning_rate": 1.2790000000000001e-05,
"loss": 0.1226,
"step": 1280
},
{
"epoch": 0.054856268072801494,
"grad_norm": 1.3705108165740967,
"learning_rate": 1.289e-05,
"loss": 0.1128,
"step": 1290
},
{
"epoch": 0.055281510460962746,
"grad_norm": 1.304731845855713,
"learning_rate": 1.2990000000000001e-05,
"loss": 0.1096,
"step": 1300
},
{
"epoch": 0.055706752849124,
"grad_norm": 1.6657530069351196,
"learning_rate": 1.309e-05,
"loss": 0.1122,
"step": 1310
},
{
"epoch": 0.05613199523728525,
"grad_norm": 2.000190019607544,
"learning_rate": 1.3189999999999999e-05,
"loss": 0.1082,
"step": 1320
},
{
"epoch": 0.0565572376254465,
"grad_norm": 1.6935395002365112,
"learning_rate": 1.329e-05,
"loss": 0.119,
"step": 1330
},
{
"epoch": 0.056982480013607756,
"grad_norm": 1.2586040496826172,
"learning_rate": 1.339e-05,
"loss": 0.1117,
"step": 1340
},
{
"epoch": 0.05740772240176901,
"grad_norm": 1.3810162544250488,
"learning_rate": 1.349e-05,
"loss": 0.1085,
"step": 1350
},
{
"epoch": 0.05783296478993026,
"grad_norm": 1.6154093742370605,
"learning_rate": 1.359e-05,
"loss": 0.1098,
"step": 1360
},
{
"epoch": 0.05825820717809151,
"grad_norm": 1.2975713014602661,
"learning_rate": 1.369e-05,
"loss": 0.1055,
"step": 1370
},
{
"epoch": 0.058683449566252766,
"grad_norm": 1.3397157192230225,
"learning_rate": 1.379e-05,
"loss": 0.0988,
"step": 1380
},
{
"epoch": 0.05910869195441402,
"grad_norm": 1.884732961654663,
"learning_rate": 1.389e-05,
"loss": 0.106,
"step": 1390
},
{
"epoch": 0.05953393434257527,
"grad_norm": 1.21388840675354,
"learning_rate": 1.399e-05,
"loss": 0.0998,
"step": 1400
},
{
"epoch": 0.05995917673073652,
"grad_norm": 1.3441375494003296,
"learning_rate": 1.409e-05,
"loss": 0.0981,
"step": 1410
},
{
"epoch": 0.06038441911889777,
"grad_norm": 1.3364049196243286,
"learning_rate": 1.419e-05,
"loss": 0.0983,
"step": 1420
},
{
"epoch": 0.06080966150705902,
"grad_norm": 1.7252370119094849,
"learning_rate": 1.429e-05,
"loss": 0.1027,
"step": 1430
},
{
"epoch": 0.06123490389522027,
"grad_norm": 1.3132603168487549,
"learning_rate": 1.4390000000000001e-05,
"loss": 0.0971,
"step": 1440
},
{
"epoch": 0.061660146283381526,
"grad_norm": 1.9096121788024902,
"learning_rate": 1.449e-05,
"loss": 0.0992,
"step": 1450
},
{
"epoch": 0.06208538867154278,
"grad_norm": 1.864293098449707,
"learning_rate": 1.4590000000000001e-05,
"loss": 0.1079,
"step": 1460
},
{
"epoch": 0.06251063105970403,
"grad_norm": 1.2758187055587769,
"learning_rate": 1.469e-05,
"loss": 0.0952,
"step": 1470
},
{
"epoch": 0.06293587344786529,
"grad_norm": 1.5732557773590088,
"learning_rate": 1.479e-05,
"loss": 0.0985,
"step": 1480
},
{
"epoch": 0.06336111583602654,
"grad_norm": 1.8538833856582642,
"learning_rate": 1.4890000000000001e-05,
"loss": 0.0957,
"step": 1490
},
{
"epoch": 0.06378635822418778,
"grad_norm": 1.4448537826538086,
"learning_rate": 1.499e-05,
"loss": 0.0958,
"step": 1500
},
{
"epoch": 0.06421160061234904,
"grad_norm": 1.7205644845962524,
"learning_rate": 1.5090000000000001e-05,
"loss": 0.0997,
"step": 1510
},
{
"epoch": 0.06463684300051029,
"grad_norm": 1.3191324472427368,
"learning_rate": 1.519e-05,
"loss": 0.0895,
"step": 1520
},
{
"epoch": 0.06506208538867155,
"grad_norm": 1.219373345375061,
"learning_rate": 1.529e-05,
"loss": 0.0955,
"step": 1530
},
{
"epoch": 0.06548732777683279,
"grad_norm": 1.2836029529571533,
"learning_rate": 1.539e-05,
"loss": 0.0948,
"step": 1540
},
{
"epoch": 0.06591257016499405,
"grad_norm": 1.2800109386444092,
"learning_rate": 1.549e-05,
"loss": 0.0956,
"step": 1550
},
{
"epoch": 0.0663378125531553,
"grad_norm": 1.4099206924438477,
"learning_rate": 1.559e-05,
"loss": 0.0919,
"step": 1560
},
{
"epoch": 0.06676305494131655,
"grad_norm": 1.320432186126709,
"learning_rate": 1.569e-05,
"loss": 0.0947,
"step": 1570
},
{
"epoch": 0.0671882973294778,
"grad_norm": 1.7595242261886597,
"learning_rate": 1.579e-05,
"loss": 0.0877,
"step": 1580
},
{
"epoch": 0.06761353971763906,
"grad_norm": 1.6487762928009033,
"learning_rate": 1.589e-05,
"loss": 0.093,
"step": 1590
},
{
"epoch": 0.0680387821058003,
"grad_norm": 1.58949613571167,
"learning_rate": 1.599e-05,
"loss": 0.0949,
"step": 1600
},
{
"epoch": 0.06846402449396156,
"grad_norm": 1.098441243171692,
"learning_rate": 1.609e-05,
"loss": 0.0898,
"step": 1610
},
{
"epoch": 0.06888926688212281,
"grad_norm": 1.2733993530273438,
"learning_rate": 1.619e-05,
"loss": 0.0841,
"step": 1620
},
{
"epoch": 0.06931450927028406,
"grad_norm": 1.5060628652572632,
"learning_rate": 1.629e-05,
"loss": 0.0919,
"step": 1630
},
{
"epoch": 0.06973975165844531,
"grad_norm": 1.5991514921188354,
"learning_rate": 1.639e-05,
"loss": 0.0863,
"step": 1640
},
{
"epoch": 0.07016499404660656,
"grad_norm": 1.4756519794464111,
"learning_rate": 1.649e-05,
"loss": 0.0848,
"step": 1650
},
{
"epoch": 0.07059023643476782,
"grad_norm": 1.5085145235061646,
"learning_rate": 1.6590000000000002e-05,
"loss": 0.0834,
"step": 1660
},
{
"epoch": 0.07101547882292907,
"grad_norm": 1.3816215991973877,
"learning_rate": 1.669e-05,
"loss": 0.0861,
"step": 1670
},
{
"epoch": 0.07144072121109032,
"grad_norm": 1.3119925260543823,
"learning_rate": 1.679e-05,
"loss": 0.088,
"step": 1680
},
{
"epoch": 0.07186596359925157,
"grad_norm": 1.1059609651565552,
"learning_rate": 1.689e-05,
"loss": 0.0941,
"step": 1690
},
{
"epoch": 0.07229120598741283,
"grad_norm": 0.9836457371711731,
"learning_rate": 1.699e-05,
"loss": 0.0889,
"step": 1700
},
{
"epoch": 0.07271644837557407,
"grad_norm": 1.3157384395599365,
"learning_rate": 1.709e-05,
"loss": 0.0867,
"step": 1710
},
{
"epoch": 0.07314169076373533,
"grad_norm": 1.0749443769454956,
"learning_rate": 1.719e-05,
"loss": 0.0809,
"step": 1720
},
{
"epoch": 0.07356693315189658,
"grad_norm": 1.4055633544921875,
"learning_rate": 1.7290000000000002e-05,
"loss": 0.0825,
"step": 1730
},
{
"epoch": 0.07399217554005784,
"grad_norm": 1.156111240386963,
"learning_rate": 1.739e-05,
"loss": 0.0833,
"step": 1740
},
{
"epoch": 0.07441741792821908,
"grad_norm": 1.625030517578125,
"learning_rate": 1.749e-05,
"loss": 0.0866,
"step": 1750
},
{
"epoch": 0.07484266031638033,
"grad_norm": 1.5402555465698242,
"learning_rate": 1.7590000000000003e-05,
"loss": 0.0838,
"step": 1760
},
{
"epoch": 0.07526790270454159,
"grad_norm": 1.3374762535095215,
"learning_rate": 1.7690000000000002e-05,
"loss": 0.0889,
"step": 1770
},
{
"epoch": 0.07569314509270283,
"grad_norm": 1.5017690658569336,
"learning_rate": 1.779e-05,
"loss": 0.0863,
"step": 1780
},
{
"epoch": 0.0761183874808641,
"grad_norm": 1.2897818088531494,
"learning_rate": 1.7890000000000003e-05,
"loss": 0.0848,
"step": 1790
},
{
"epoch": 0.07654362986902534,
"grad_norm": 1.434987187385559,
"learning_rate": 1.7990000000000002e-05,
"loss": 0.0816,
"step": 1800
},
{
"epoch": 0.0769688722571866,
"grad_norm": 1.3573745489120483,
"learning_rate": 1.809e-05,
"loss": 0.0796,
"step": 1810
},
{
"epoch": 0.07739411464534784,
"grad_norm": 1.3428010940551758,
"learning_rate": 1.819e-05,
"loss": 0.0822,
"step": 1820
},
{
"epoch": 0.0778193570335091,
"grad_norm": 1.129830241203308,
"learning_rate": 1.8290000000000003e-05,
"loss": 0.0802,
"step": 1830
},
{
"epoch": 0.07824459942167035,
"grad_norm": 1.1881945133209229,
"learning_rate": 1.8390000000000002e-05,
"loss": 0.0851,
"step": 1840
},
{
"epoch": 0.07866984180983161,
"grad_norm": 1.3428897857666016,
"learning_rate": 1.8489999999999997e-05,
"loss": 0.0839,
"step": 1850
},
{
"epoch": 0.07909508419799285,
"grad_norm": 0.9840798377990723,
"learning_rate": 1.859e-05,
"loss": 0.0837,
"step": 1860
},
{
"epoch": 0.07952032658615411,
"grad_norm": 1.179263710975647,
"learning_rate": 1.869e-05,
"loss": 0.0824,
"step": 1870
},
{
"epoch": 0.07994556897431536,
"grad_norm": 1.192193865776062,
"learning_rate": 1.8789999999999998e-05,
"loss": 0.0794,
"step": 1880
},
{
"epoch": 0.08037081136247662,
"grad_norm": 1.5266687870025635,
"learning_rate": 1.889e-05,
"loss": 0.0857,
"step": 1890
},
{
"epoch": 0.08079605375063786,
"grad_norm": 1.2958351373672485,
"learning_rate": 1.899e-05,
"loss": 0.0787,
"step": 1900
},
{
"epoch": 0.08122129613879911,
"grad_norm": 1.245995044708252,
"learning_rate": 1.909e-05,
"loss": 0.0786,
"step": 1910
},
{
"epoch": 0.08164653852696037,
"grad_norm": 1.16083562374115,
"learning_rate": 1.919e-05,
"loss": 0.0789,
"step": 1920
},
{
"epoch": 0.08207178091512161,
"grad_norm": 1.0717693567276,
"learning_rate": 1.929e-05,
"loss": 0.0786,
"step": 1930
},
{
"epoch": 0.08249702330328287,
"grad_norm": 1.2903943061828613,
"learning_rate": 1.939e-05,
"loss": 0.0796,
"step": 1940
},
{
"epoch": 0.08292226569144412,
"grad_norm": 1.5063890218734741,
"learning_rate": 1.9489999999999998e-05,
"loss": 0.0792,
"step": 1950
},
{
"epoch": 0.08334750807960538,
"grad_norm": 1.065131664276123,
"learning_rate": 1.959e-05,
"loss": 0.0757,
"step": 1960
},
{
"epoch": 0.08377275046776662,
"grad_norm": 1.508479118347168,
"learning_rate": 1.969e-05,
"loss": 0.0801,
"step": 1970
},
{
"epoch": 0.08419799285592788,
"grad_norm": 1.4832401275634766,
"learning_rate": 1.979e-05,
"loss": 0.0781,
"step": 1980
},
{
"epoch": 0.08462323524408913,
"grad_norm": 1.1999212503433228,
"learning_rate": 1.989e-05,
"loss": 0.0772,
"step": 1990
},
{
"epoch": 0.08504847763225039,
"grad_norm": 1.1872233152389526,
"learning_rate": 1.999e-05,
"loss": 0.0753,
"step": 2000
},
{
"epoch": 0.08547372002041163,
"grad_norm": 1.603405237197876,
"learning_rate": 2.009e-05,
"loss": 0.0737,
"step": 2010
},
{
"epoch": 0.0858989624085729,
"grad_norm": 1.4420855045318604,
"learning_rate": 2.019e-05,
"loss": 0.0795,
"step": 2020
},
{
"epoch": 0.08632420479673414,
"grad_norm": 1.2586225271224976,
"learning_rate": 2.029e-05,
"loss": 0.0762,
"step": 2030
},
{
"epoch": 0.08674944718489538,
"grad_norm": 1.6337774991989136,
"learning_rate": 2.039e-05,
"loss": 0.0751,
"step": 2040
},
{
"epoch": 0.08717468957305664,
"grad_norm": 1.450543999671936,
"learning_rate": 2.0490000000000002e-05,
"loss": 0.0776,
"step": 2050
},
{
"epoch": 0.08759993196121789,
"grad_norm": 1.1475777626037598,
"learning_rate": 2.059e-05,
"loss": 0.0757,
"step": 2060
},
{
"epoch": 0.08802517434937915,
"grad_norm": 1.5073463916778564,
"learning_rate": 2.069e-05,
"loss": 0.0756,
"step": 2070
},
{
"epoch": 0.0884504167375404,
"grad_norm": 1.4093585014343262,
"learning_rate": 2.079e-05,
"loss": 0.0697,
"step": 2080
},
{
"epoch": 0.08887565912570165,
"grad_norm": 1.230241298675537,
"learning_rate": 2.089e-05,
"loss": 0.0782,
"step": 2090
},
{
"epoch": 0.0893009015138629,
"grad_norm": 1.3263798952102661,
"learning_rate": 2.099e-05,
"loss": 0.0736,
"step": 2100
},
{
"epoch": 0.08972614390202416,
"grad_norm": 1.136014461517334,
"learning_rate": 2.109e-05,
"loss": 0.0676,
"step": 2110
},
{
"epoch": 0.0901513862901854,
"grad_norm": 1.4707313776016235,
"learning_rate": 2.1190000000000002e-05,
"loss": 0.0701,
"step": 2120
},
{
"epoch": 0.09057662867834666,
"grad_norm": 1.4252678155899048,
"learning_rate": 2.129e-05,
"loss": 0.0767,
"step": 2130
},
{
"epoch": 0.09100187106650791,
"grad_norm": 1.2199596166610718,
"learning_rate": 2.139e-05,
"loss": 0.0693,
"step": 2140
},
{
"epoch": 0.09142711345466917,
"grad_norm": 1.658355474472046,
"learning_rate": 2.1490000000000003e-05,
"loss": 0.0709,
"step": 2150
},
{
"epoch": 0.09185235584283041,
"grad_norm": 1.2282954454421997,
"learning_rate": 2.159e-05,
"loss": 0.0688,
"step": 2160
},
{
"epoch": 0.09227759823099166,
"grad_norm": 1.3242515325546265,
"learning_rate": 2.169e-05,
"loss": 0.071,
"step": 2170
},
{
"epoch": 0.09270284061915292,
"grad_norm": 1.1989598274230957,
"learning_rate": 2.1790000000000003e-05,
"loss": 0.0683,
"step": 2180
},
{
"epoch": 0.09312808300731416,
"grad_norm": 1.3082703351974487,
"learning_rate": 2.1890000000000002e-05,
"loss": 0.0712,
"step": 2190
},
{
"epoch": 0.09355332539547542,
"grad_norm": 1.3433862924575806,
"learning_rate": 2.199e-05,
"loss": 0.0768,
"step": 2200
},
{
"epoch": 0.09397856778363667,
"grad_norm": 1.1946460008621216,
"learning_rate": 2.209e-05,
"loss": 0.0754,
"step": 2210
},
{
"epoch": 0.09440381017179793,
"grad_norm": 1.3874248266220093,
"learning_rate": 2.2190000000000003e-05,
"loss": 0.0784,
"step": 2220
},
{
"epoch": 0.09482905255995917,
"grad_norm": 1.2676492929458618,
"learning_rate": 2.2290000000000002e-05,
"loss": 0.0694,
"step": 2230
},
{
"epoch": 0.09525429494812043,
"grad_norm": 1.0633152723312378,
"learning_rate": 2.239e-05,
"loss": 0.0667,
"step": 2240
},
{
"epoch": 0.09567953733628168,
"grad_norm": 1.0461442470550537,
"learning_rate": 2.2490000000000003e-05,
"loss": 0.0707,
"step": 2250
},
{
"epoch": 0.09610477972444294,
"grad_norm": 1.5528920888900757,
"learning_rate": 2.2590000000000002e-05,
"loss": 0.0728,
"step": 2260
},
{
"epoch": 0.09653002211260418,
"grad_norm": 1.3866478204727173,
"learning_rate": 2.269e-05,
"loss": 0.0683,
"step": 2270
},
{
"epoch": 0.09695526450076544,
"grad_norm": 1.2420086860656738,
"learning_rate": 2.279e-05,
"loss": 0.0653,
"step": 2280
},
{
"epoch": 0.09738050688892669,
"grad_norm": 1.163353681564331,
"learning_rate": 2.289e-05,
"loss": 0.0687,
"step": 2290
},
{
"epoch": 0.09780574927708793,
"grad_norm": 1.2111841440200806,
"learning_rate": 2.299e-05,
"loss": 0.0687,
"step": 2300
},
{
"epoch": 0.09823099166524919,
"grad_norm": 1.3638079166412354,
"learning_rate": 2.309e-05,
"loss": 0.0721,
"step": 2310
},
{
"epoch": 0.09865623405341044,
"grad_norm": 1.3157620429992676,
"learning_rate": 2.319e-05,
"loss": 0.071,
"step": 2320
},
{
"epoch": 0.0990814764415717,
"grad_norm": 1.142834186553955,
"learning_rate": 2.329e-05,
"loss": 0.0736,
"step": 2330
},
{
"epoch": 0.09950671882973294,
"grad_norm": 1.247509241104126,
"learning_rate": 2.3389999999999998e-05,
"loss": 0.0648,
"step": 2340
},
{
"epoch": 0.0999319612178942,
"grad_norm": 1.134833574295044,
"learning_rate": 2.349e-05,
"loss": 0.0748,
"step": 2350
},
{
"epoch": 0.10035720360605545,
"grad_norm": 1.1281261444091797,
"learning_rate": 2.359e-05,
"loss": 0.0678,
"step": 2360
},
{
"epoch": 0.10078244599421671,
"grad_norm": 1.2668074369430542,
"learning_rate": 2.369e-05,
"loss": 0.0666,
"step": 2370
},
{
"epoch": 0.10120768838237795,
"grad_norm": 1.2871060371398926,
"learning_rate": 2.379e-05,
"loss": 0.0695,
"step": 2380
},
{
"epoch": 0.10163293077053921,
"grad_norm": 1.4086240530014038,
"learning_rate": 2.389e-05,
"loss": 0.0678,
"step": 2390
},
{
"epoch": 0.10205817315870046,
"grad_norm": 1.2249305248260498,
"learning_rate": 2.399e-05,
"loss": 0.0676,
"step": 2400
},
{
"epoch": 0.10248341554686172,
"grad_norm": 1.1750974655151367,
"learning_rate": 2.409e-05,
"loss": 0.0627,
"step": 2410
},
{
"epoch": 0.10290865793502296,
"grad_norm": 1.1746395826339722,
"learning_rate": 2.419e-05,
"loss": 0.0732,
"step": 2420
},
{
"epoch": 0.10333390032318422,
"grad_norm": 1.2999951839447021,
"learning_rate": 2.429e-05,
"loss": 0.067,
"step": 2430
},
{
"epoch": 0.10375914271134547,
"grad_norm": 1.4515321254730225,
"learning_rate": 2.439e-05,
"loss": 0.0708,
"step": 2440
},
{
"epoch": 0.10418438509950671,
"grad_norm": 1.1880756616592407,
"learning_rate": 2.449e-05,
"loss": 0.0691,
"step": 2450
},
{
"epoch": 0.10460962748766797,
"grad_norm": 2.154705047607422,
"learning_rate": 2.459e-05,
"loss": 0.0691,
"step": 2460
},
{
"epoch": 0.10503486987582922,
"grad_norm": 1.569284439086914,
"learning_rate": 2.469e-05,
"loss": 0.0696,
"step": 2470
},
{
"epoch": 0.10546011226399048,
"grad_norm": 1.4350764751434326,
"learning_rate": 2.479e-05,
"loss": 0.0627,
"step": 2480
},
{
"epoch": 0.10588535465215172,
"grad_norm": 1.3060920238494873,
"learning_rate": 2.489e-05,
"loss": 0.0736,
"step": 2490
},
{
"epoch": 0.10631059704031298,
"grad_norm": 1.4141494035720825,
"learning_rate": 2.499e-05,
"loss": 0.0669,
"step": 2500
},
{
"epoch": 0.10673583942847423,
"grad_norm": 1.221895456314087,
"learning_rate": 2.5090000000000002e-05,
"loss": 0.0631,
"step": 2510
},
{
"epoch": 0.10716108181663549,
"grad_norm": 1.24517023563385,
"learning_rate": 2.519e-05,
"loss": 0.0646,
"step": 2520
},
{
"epoch": 0.10758632420479673,
"grad_norm": 0.9323004484176636,
"learning_rate": 2.529e-05,
"loss": 0.0669,
"step": 2530
},
{
"epoch": 0.10801156659295799,
"grad_norm": 1.0457737445831299,
"learning_rate": 2.5390000000000003e-05,
"loss": 0.0638,
"step": 2540
},
{
"epoch": 0.10843680898111924,
"grad_norm": 1.4137825965881348,
"learning_rate": 2.549e-05,
"loss": 0.0683,
"step": 2550
},
{
"epoch": 0.1088620513692805,
"grad_norm": 1.292286992073059,
"learning_rate": 2.559e-05,
"loss": 0.0717,
"step": 2560
},
{
"epoch": 0.10928729375744174,
"grad_norm": 1.1481221914291382,
"learning_rate": 2.569e-05,
"loss": 0.0633,
"step": 2570
},
{
"epoch": 0.10971253614560299,
"grad_norm": 1.4091408252716064,
"learning_rate": 2.5790000000000002e-05,
"loss": 0.0633,
"step": 2580
},
{
"epoch": 0.11013777853376425,
"grad_norm": 1.1633602380752563,
"learning_rate": 2.589e-05,
"loss": 0.0604,
"step": 2590
},
{
"epoch": 0.11056302092192549,
"grad_norm": 1.198021650314331,
"learning_rate": 2.599e-05,
"loss": 0.0705,
"step": 2600
},
{
"epoch": 0.11098826331008675,
"grad_norm": 1.1614141464233398,
"learning_rate": 2.6090000000000003e-05,
"loss": 0.0679,
"step": 2610
},
{
"epoch": 0.111413505698248,
"grad_norm": 0.9087436199188232,
"learning_rate": 2.619e-05,
"loss": 0.065,
"step": 2620
},
{
"epoch": 0.11183874808640926,
"grad_norm": 1.429312825202942,
"learning_rate": 2.629e-05,
"loss": 0.065,
"step": 2630
},
{
"epoch": 0.1122639904745705,
"grad_norm": 1.4051202535629272,
"learning_rate": 2.6390000000000003e-05,
"loss": 0.0656,
"step": 2640
},
{
"epoch": 0.11268923286273176,
"grad_norm": 1.294622540473938,
"learning_rate": 2.6490000000000002e-05,
"loss": 0.0669,
"step": 2650
},
{
"epoch": 0.113114475250893,
"grad_norm": 1.2461621761322021,
"learning_rate": 2.659e-05,
"loss": 0.0664,
"step": 2660
},
{
"epoch": 0.11353971763905427,
"grad_norm": 1.217442274093628,
"learning_rate": 2.6690000000000004e-05,
"loss": 0.0641,
"step": 2670
},
{
"epoch": 0.11396496002721551,
"grad_norm": 1.3828129768371582,
"learning_rate": 2.6790000000000003e-05,
"loss": 0.0651,
"step": 2680
},
{
"epoch": 0.11439020241537677,
"grad_norm": 1.1948891878128052,
"learning_rate": 2.6890000000000002e-05,
"loss": 0.0643,
"step": 2690
},
{
"epoch": 0.11481544480353802,
"grad_norm": 1.2406245470046997,
"learning_rate": 2.699e-05,
"loss": 0.0622,
"step": 2700
},
{
"epoch": 0.11524068719169926,
"grad_norm": 1.248665452003479,
"learning_rate": 2.709e-05,
"loss": 0.0599,
"step": 2710
},
{
"epoch": 0.11566592957986052,
"grad_norm": 1.2043137550354004,
"learning_rate": 2.719e-05,
"loss": 0.0604,
"step": 2720
},
{
"epoch": 0.11609117196802177,
"grad_norm": 1.176339030265808,
"learning_rate": 2.7289999999999998e-05,
"loss": 0.0651,
"step": 2730
},
{
"epoch": 0.11651641435618303,
"grad_norm": 1.115503191947937,
"learning_rate": 2.739e-05,
"loss": 0.0591,
"step": 2740
},
{
"epoch": 0.11694165674434427,
"grad_norm": 1.1059077978134155,
"learning_rate": 2.749e-05,
"loss": 0.0581,
"step": 2750
},
{
"epoch": 0.11736689913250553,
"grad_norm": 1.0317000150680542,
"learning_rate": 2.759e-05,
"loss": 0.0616,
"step": 2760
},
{
"epoch": 0.11779214152066678,
"grad_norm": 0.9671021103858948,
"learning_rate": 2.769e-05,
"loss": 0.0639,
"step": 2770
},
{
"epoch": 0.11821738390882804,
"grad_norm": 1.3735092878341675,
"learning_rate": 2.779e-05,
"loss": 0.0586,
"step": 2780
},
{
"epoch": 0.11864262629698928,
"grad_norm": 1.3800747394561768,
"learning_rate": 2.789e-05,
"loss": 0.0606,
"step": 2790
},
{
"epoch": 0.11906786868515054,
"grad_norm": 1.1293871402740479,
"learning_rate": 2.799e-05,
"loss": 0.0659,
"step": 2800
},
{
"epoch": 0.11949311107331179,
"grad_norm": 1.056138038635254,
"learning_rate": 2.809e-05,
"loss": 0.063,
"step": 2810
},
{
"epoch": 0.11991835346147305,
"grad_norm": 1.1736313104629517,
"learning_rate": 2.819e-05,
"loss": 0.0634,
"step": 2820
},
{
"epoch": 0.12034359584963429,
"grad_norm": 1.3959088325500488,
"learning_rate": 2.829e-05,
"loss": 0.0597,
"step": 2830
},
{
"epoch": 0.12076883823779554,
"grad_norm": 1.186424970626831,
"learning_rate": 2.839e-05,
"loss": 0.0604,
"step": 2840
},
{
"epoch": 0.1211940806259568,
"grad_norm": 1.1499032974243164,
"learning_rate": 2.849e-05,
"loss": 0.0614,
"step": 2850
},
{
"epoch": 0.12161932301411804,
"grad_norm": 1.2873589992523193,
"learning_rate": 2.859e-05,
"loss": 0.0647,
"step": 2860
},
{
"epoch": 0.1220445654022793,
"grad_norm": 1.3243647813796997,
"learning_rate": 2.869e-05,
"loss": 0.0628,
"step": 2870
},
{
"epoch": 0.12246980779044055,
"grad_norm": 1.2491867542266846,
"learning_rate": 2.879e-05,
"loss": 0.0602,
"step": 2880
},
{
"epoch": 0.1228950501786018,
"grad_norm": 1.207425832748413,
"learning_rate": 2.889e-05,
"loss": 0.0609,
"step": 2890
},
{
"epoch": 0.12332029256676305,
"grad_norm": 1.0992892980575562,
"learning_rate": 2.8990000000000002e-05,
"loss": 0.0578,
"step": 2900
},
{
"epoch": 0.12374553495492431,
"grad_norm": 1.2509560585021973,
"learning_rate": 2.909e-05,
"loss": 0.0613,
"step": 2910
},
{
"epoch": 0.12417077734308556,
"grad_norm": 1.1020233631134033,
"learning_rate": 2.919e-05,
"loss": 0.0629,
"step": 2920
},
{
"epoch": 0.12459601973124682,
"grad_norm": 1.1341887712478638,
"learning_rate": 2.9290000000000002e-05,
"loss": 0.0622,
"step": 2930
},
{
"epoch": 0.12502126211940806,
"grad_norm": 0.9719606041908264,
"learning_rate": 2.939e-05,
"loss": 0.0657,
"step": 2940
},
{
"epoch": 0.1254465045075693,
"grad_norm": 1.160712718963623,
"learning_rate": 2.949e-05,
"loss": 0.0596,
"step": 2950
},
{
"epoch": 0.12587174689573058,
"grad_norm": 1.1883903741836548,
"learning_rate": 2.959e-05,
"loss": 0.0613,
"step": 2960
},
{
"epoch": 0.12629698928389183,
"grad_norm": 1.1179677248001099,
"learning_rate": 2.9690000000000002e-05,
"loss": 0.0608,
"step": 2970
},
{
"epoch": 0.12672223167205307,
"grad_norm": 0.936011791229248,
"learning_rate": 2.979e-05,
"loss": 0.0576,
"step": 2980
},
{
"epoch": 0.12714747406021432,
"grad_norm": 1.0857445001602173,
"learning_rate": 2.989e-05,
"loss": 0.0578,
"step": 2990
},
{
"epoch": 0.12757271644837556,
"grad_norm": 1.2470076084136963,
"learning_rate": 2.9990000000000003e-05,
"loss": 0.0591,
"step": 3000
},
{
"epoch": 0.12799795883653683,
"grad_norm": 1.279581069946289,
"learning_rate": 2.9999998154575393e-05,
"loss": 0.0609,
"step": 3010
},
{
"epoch": 0.12842320122469808,
"grad_norm": 1.257899522781372,
"learning_rate": 2.999999177533042e-05,
"loss": 0.0589,
"step": 3020
},
{
"epoch": 0.12884844361285933,
"grad_norm": 0.9201774001121521,
"learning_rate": 2.9999980839483992e-05,
"loss": 0.0583,
"step": 3030
},
{
"epoch": 0.12927368600102057,
"grad_norm": 0.919161319732666,
"learning_rate": 2.999996534703944e-05,
"loss": 0.0597,
"step": 3040
},
{
"epoch": 0.12969892838918184,
"grad_norm": 0.9568930268287659,
"learning_rate": 2.9999945298001468e-05,
"loss": 0.0563,
"step": 3050
},
{
"epoch": 0.1301241707773431,
"grad_norm": 0.7360648512840271,
"learning_rate": 2.9999920692376165e-05,
"loss": 0.0569,
"step": 3060
},
{
"epoch": 0.13054941316550434,
"grad_norm": 0.9083346128463745,
"learning_rate": 2.9999891530171002e-05,
"loss": 0.0576,
"step": 3070
},
{
"epoch": 0.13097465555366558,
"grad_norm": 1.020730972290039,
"learning_rate": 2.9999857811394845e-05,
"loss": 0.059,
"step": 3080
},
{
"epoch": 0.13139989794182685,
"grad_norm": 1.0232067108154297,
"learning_rate": 2.9999819536057935e-05,
"loss": 0.0596,
"step": 3090
},
{
"epoch": 0.1318251403299881,
"grad_norm": 1.2200008630752563,
"learning_rate": 2.9999776704171894e-05,
"loss": 0.0611,
"step": 3100
},
{
"epoch": 0.13225038271814935,
"grad_norm": 1.1772595643997192,
"learning_rate": 2.999972931574973e-05,
"loss": 0.06,
"step": 3110
},
{
"epoch": 0.1326756251063106,
"grad_norm": 1.0882384777069092,
"learning_rate": 2.9999677370805858e-05,
"loss": 0.0598,
"step": 3120
},
{
"epoch": 0.13310086749447184,
"grad_norm": 0.9506834745407104,
"learning_rate": 2.9999620869356034e-05,
"loss": 0.0577,
"step": 3130
},
{
"epoch": 0.1335261098826331,
"grad_norm": 0.8521194458007812,
"learning_rate": 2.9999559811417436e-05,
"loss": 0.0577,
"step": 3140
},
{
"epoch": 0.13395135227079435,
"grad_norm": 0.9100087285041809,
"learning_rate": 2.999949419700861e-05,
"loss": 0.0564,
"step": 3150
},
{
"epoch": 0.1343765946589556,
"grad_norm": 0.8859062194824219,
"learning_rate": 2.9999424026149484e-05,
"loss": 0.054,
"step": 3160
},
{
"epoch": 0.13480183704711685,
"grad_norm": 1.4700942039489746,
"learning_rate": 2.9999349298861376e-05,
"loss": 0.0568,
"step": 3170
},
{
"epoch": 0.13522707943527812,
"grad_norm": 0.989130437374115,
"learning_rate": 2.9999270015166983e-05,
"loss": 0.0547,
"step": 3180
},
{
"epoch": 0.13565232182343936,
"grad_norm": 1.364755392074585,
"learning_rate": 2.999918617509039e-05,
"loss": 0.058,
"step": 3190
},
{
"epoch": 0.1360775642116006,
"grad_norm": 1.0077078342437744,
"learning_rate": 2.9999097778657074e-05,
"loss": 0.0589,
"step": 3200
},
{
"epoch": 0.13650280659976186,
"grad_norm": 1.1901323795318604,
"learning_rate": 2.9999004825893878e-05,
"loss": 0.058,
"step": 3210
},
{
"epoch": 0.13692804898792313,
"grad_norm": 1.0837482213974,
"learning_rate": 2.9998907316829043e-05,
"loss": 0.0528,
"step": 3220
},
{
"epoch": 0.13735329137608437,
"grad_norm": 1.2312610149383545,
"learning_rate": 2.999880525149219e-05,
"loss": 0.0551,
"step": 3230
},
{
"epoch": 0.13777853376424562,
"grad_norm": 1.3543277978897095,
"learning_rate": 2.999869862991432e-05,
"loss": 0.0584,
"step": 3240
},
{
"epoch": 0.13820377615240687,
"grad_norm": 1.1511555910110474,
"learning_rate": 2.9998587452127822e-05,
"loss": 0.0531,
"step": 3250
},
{
"epoch": 0.1386290185405681,
"grad_norm": 1.1416348218917847,
"learning_rate": 2.999847171816647e-05,
"loss": 0.0587,
"step": 3260
},
{
"epoch": 0.13905426092872938,
"grad_norm": 1.1889604330062866,
"learning_rate": 2.999835142806543e-05,
"loss": 0.0578,
"step": 3270
},
{
"epoch": 0.13947950331689063,
"grad_norm": 0.9729725122451782,
"learning_rate": 2.9998226581861227e-05,
"loss": 0.0573,
"step": 3280
},
{
"epoch": 0.13990474570505188,
"grad_norm": 1.0449501276016235,
"learning_rate": 2.9998097179591793e-05,
"loss": 0.0658,
"step": 3290
},
{
"epoch": 0.14032998809321312,
"grad_norm": 1.0162328481674194,
"learning_rate": 2.9997963221296443e-05,
"loss": 0.0556,
"step": 3300
},
{
"epoch": 0.1407552304813744,
"grad_norm": 1.19861900806427,
"learning_rate": 2.9997824707015858e-05,
"loss": 0.0559,
"step": 3310
},
{
"epoch": 0.14118047286953564,
"grad_norm": 1.102773666381836,
"learning_rate": 2.9997681636792126e-05,
"loss": 0.0584,
"step": 3320
},
{
"epoch": 0.14160571525769688,
"grad_norm": 1.011214256286621,
"learning_rate": 2.9997534010668707e-05,
"loss": 0.0523,
"step": 3330
},
{
"epoch": 0.14203095764585813,
"grad_norm": 0.9323782324790955,
"learning_rate": 2.9997381828690435e-05,
"loss": 0.0559,
"step": 3340
},
{
"epoch": 0.1424562000340194,
"grad_norm": 0.9286249279975891,
"learning_rate": 2.9997225090903552e-05,
"loss": 0.0547,
"step": 3350
},
{
"epoch": 0.14288144242218065,
"grad_norm": 0.7139819860458374,
"learning_rate": 2.9997063797355664e-05,
"loss": 0.0553,
"step": 3360
},
{
"epoch": 0.1433066848103419,
"grad_norm": 0.8565438985824585,
"learning_rate": 2.9996897948095768e-05,
"loss": 0.0585,
"step": 3370
},
{
"epoch": 0.14373192719850314,
"grad_norm": 1.053774118423462,
"learning_rate": 2.9996727543174244e-05,
"loss": 0.056,
"step": 3380
},
{
"epoch": 0.14415716958666439,
"grad_norm": 1.0462929010391235,
"learning_rate": 2.9996552582642857e-05,
"loss": 0.0535,
"step": 3390
},
{
"epoch": 0.14458241197482566,
"grad_norm": 0.90887451171875,
"learning_rate": 2.9996373066554763e-05,
"loss": 0.0581,
"step": 3400
},
{
"epoch": 0.1450076543629869,
"grad_norm": 0.8000062704086304,
"learning_rate": 2.999618899496448e-05,
"loss": 0.0526,
"step": 3410
},
{
"epoch": 0.14543289675114815,
"grad_norm": 0.88075852394104,
"learning_rate": 2.9996000367927936e-05,
"loss": 0.053,
"step": 3420
},
{
"epoch": 0.1458581391393094,
"grad_norm": 0.8267508745193481,
"learning_rate": 2.9995807185502425e-05,
"loss": 0.054,
"step": 3430
},
{
"epoch": 0.14628338152747067,
"grad_norm": 1.1968902349472046,
"learning_rate": 2.9995609447746636e-05,
"loss": 0.0615,
"step": 3440
},
{
"epoch": 0.1467086239156319,
"grad_norm": 1.008314609527588,
"learning_rate": 2.999540715472063e-05,
"loss": 0.0553,
"step": 3450
},
{
"epoch": 0.14713386630379316,
"grad_norm": 1.1219818592071533,
"learning_rate": 2.999520030648586e-05,
"loss": 0.0592,
"step": 3460
},
{
"epoch": 0.1475591086919544,
"grad_norm": 1.2746617794036865,
"learning_rate": 2.9994988903105163e-05,
"loss": 0.0508,
"step": 3470
},
{
"epoch": 0.14798435108011568,
"grad_norm": 0.914639413356781,
"learning_rate": 2.999477294464276e-05,
"loss": 0.0569,
"step": 3480
},
{
"epoch": 0.14840959346827692,
"grad_norm": 1.115281581878662,
"learning_rate": 2.9994552431164247e-05,
"loss": 0.0524,
"step": 3490
},
{
"epoch": 0.14883483585643817,
"grad_norm": 0.9016542434692383,
"learning_rate": 2.9994327362736617e-05,
"loss": 0.0545,
"step": 3500
},
{
"epoch": 0.14926007824459941,
"grad_norm": 0.7672004699707031,
"learning_rate": 2.9994097739428233e-05,
"loss": 0.0507,
"step": 3510
},
{
"epoch": 0.14968532063276066,
"grad_norm": 1.0223740339279175,
"learning_rate": 2.9993863561308856e-05,
"loss": 0.0558,
"step": 3520
},
{
"epoch": 0.15011056302092193,
"grad_norm": 1.0547072887420654,
"learning_rate": 2.9993624828449616e-05,
"loss": 0.0543,
"step": 3530
},
{
"epoch": 0.15053580540908318,
"grad_norm": 0.882698655128479,
"learning_rate": 2.9993381540923037e-05,
"loss": 0.0525,
"step": 3540
},
{
"epoch": 0.15096104779724442,
"grad_norm": 0.8458632230758667,
"learning_rate": 2.9993133698803025e-05,
"loss": 0.0512,
"step": 3550
},
{
"epoch": 0.15138629018540567,
"grad_norm": 0.872350811958313,
"learning_rate": 2.9992881302164862e-05,
"loss": 0.0529,
"step": 3560
},
{
"epoch": 0.15181153257356694,
"grad_norm": 1.0274560451507568,
"learning_rate": 2.9992624351085226e-05,
"loss": 0.0533,
"step": 3570
},
{
"epoch": 0.1522367749617282,
"grad_norm": 1.1185356378555298,
"learning_rate": 2.9992362845642167e-05,
"loss": 0.057,
"step": 3580
},
{
"epoch": 0.15266201734988943,
"grad_norm": 0.9312054514884949,
"learning_rate": 2.9992096785915132e-05,
"loss": 0.0524,
"step": 3590
},
{
"epoch": 0.15308725973805068,
"grad_norm": 1.1811115741729736,
"learning_rate": 2.999182617198493e-05,
"loss": 0.0564,
"step": 3600
},
{
"epoch": 0.15351250212621195,
"grad_norm": 1.1230878829956055,
"learning_rate": 2.999155100393378e-05,
"loss": 0.0526,
"step": 3610
},
{
"epoch": 0.1539377445143732,
"grad_norm": 0.9790253043174744,
"learning_rate": 2.9991271281845256e-05,
"loss": 0.0515,
"step": 3620
},
{
"epoch": 0.15436298690253444,
"grad_norm": 0.8891139626502991,
"learning_rate": 2.9990987005804347e-05,
"loss": 0.0527,
"step": 3630
},
{
"epoch": 0.1547882292906957,
"grad_norm": 0.9285902976989746,
"learning_rate": 2.999069817589739e-05,
"loss": 0.0526,
"step": 3640
},
{
"epoch": 0.15521347167885693,
"grad_norm": 1.1737614870071411,
"learning_rate": 2.9990404792212142e-05,
"loss": 0.0551,
"step": 3650
},
{
"epoch": 0.1556387140670182,
"grad_norm": 1.1494216918945312,
"learning_rate": 2.9990106854837715e-05,
"loss": 0.0537,
"step": 3660
},
{
"epoch": 0.15606395645517945,
"grad_norm": 1.0738581418991089,
"learning_rate": 2.9989804363864615e-05,
"loss": 0.0562,
"step": 3670
},
{
"epoch": 0.1564891988433407,
"grad_norm": 0.920110821723938,
"learning_rate": 2.998949731938473e-05,
"loss": 0.0493,
"step": 3680
},
{
"epoch": 0.15691444123150194,
"grad_norm": 0.9473956227302551,
"learning_rate": 2.9989185721491338e-05,
"loss": 0.0501,
"step": 3690
},
{
"epoch": 0.15733968361966322,
"grad_norm": 1.0324159860610962,
"learning_rate": 2.998886957027909e-05,
"loss": 0.0523,
"step": 3700
},
{
"epoch": 0.15776492600782446,
"grad_norm": 0.9430623054504395,
"learning_rate": 2.998854886584402e-05,
"loss": 0.0528,
"step": 3710
},
{
"epoch": 0.1581901683959857,
"grad_norm": 0.9284719228744507,
"learning_rate": 2.9988223608283557e-05,
"loss": 0.0519,
"step": 3720
},
{
"epoch": 0.15861541078414695,
"grad_norm": 1.019679307937622,
"learning_rate": 2.9987893797696502e-05,
"loss": 0.0522,
"step": 3730
},
{
"epoch": 0.15904065317230823,
"grad_norm": 1.0338765382766724,
"learning_rate": 2.9987559434183046e-05,
"loss": 0.0501,
"step": 3740
},
{
"epoch": 0.15946589556046947,
"grad_norm": 1.1244757175445557,
"learning_rate": 2.998722051784476e-05,
"loss": 0.0555,
"step": 3750
},
{
"epoch": 0.15989113794863072,
"grad_norm": 1.0248645544052124,
"learning_rate": 2.9986877048784594e-05,
"loss": 0.0565,
"step": 3760
},
{
"epoch": 0.16031638033679196,
"grad_norm": 0.8432033061981201,
"learning_rate": 2.9986529027106885e-05,
"loss": 0.0542,
"step": 3770
},
{
"epoch": 0.16074162272495324,
"grad_norm": 0.8507984280586243,
"learning_rate": 2.998617645291735e-05,
"loss": 0.0483,
"step": 3780
},
{
"epoch": 0.16116686511311448,
"grad_norm": 0.8564496040344238,
"learning_rate": 2.9985819326323097e-05,
"loss": 0.0539,
"step": 3790
},
{
"epoch": 0.16159210750127573,
"grad_norm": 0.8069973587989807,
"learning_rate": 2.9985457647432613e-05,
"loss": 0.0529,
"step": 3800
},
{
"epoch": 0.16201734988943697,
"grad_norm": 0.7802913188934326,
"learning_rate": 2.998509141635576e-05,
"loss": 0.0482,
"step": 3810
},
{
"epoch": 0.16244259227759822,
"grad_norm": 0.8876798152923584,
"learning_rate": 2.99847206332038e-05,
"loss": 0.0506,
"step": 3820
},
{
"epoch": 0.1628678346657595,
"grad_norm": 1.1593446731567383,
"learning_rate": 2.9984345298089356e-05,
"loss": 0.0531,
"step": 3830
},
{
"epoch": 0.16329307705392074,
"grad_norm": 0.9372552037239075,
"learning_rate": 2.998396541112645e-05,
"loss": 0.0514,
"step": 3840
},
{
"epoch": 0.16371831944208198,
"grad_norm": 0.8006210327148438,
"learning_rate": 2.998358097243048e-05,
"loss": 0.051,
"step": 3850
},
{
"epoch": 0.16414356183024323,
"grad_norm": 0.8869373798370361,
"learning_rate": 2.998319198211823e-05,
"loss": 0.0502,
"step": 3860
},
{
"epoch": 0.1645688042184045,
"grad_norm": 1.0628048181533813,
"learning_rate": 2.998279844030786e-05,
"loss": 0.0521,
"step": 3870
},
{
"epoch": 0.16499404660656575,
"grad_norm": 1.1185886859893799,
"learning_rate": 2.9982400347118926e-05,
"loss": 0.0501,
"step": 3880
},
{
"epoch": 0.165419288994727,
"grad_norm": 1.170861005783081,
"learning_rate": 2.9981997702672353e-05,
"loss": 0.0492,
"step": 3890
},
{
"epoch": 0.16584453138288824,
"grad_norm": 0.9327141642570496,
"learning_rate": 2.9981590507090454e-05,
"loss": 0.0493,
"step": 3900
},
{
"epoch": 0.1662697737710495,
"grad_norm": 0.9895486831665039,
"learning_rate": 2.9981178760496927e-05,
"loss": 0.0552,
"step": 3910
},
{
"epoch": 0.16669501615921076,
"grad_norm": 0.8877111673355103,
"learning_rate": 2.9980762463016848e-05,
"loss": 0.0543,
"step": 3920
},
{
"epoch": 0.167120258547372,
"grad_norm": 0.7400316596031189,
"learning_rate": 2.9980341614776673e-05,
"loss": 0.0479,
"step": 3930
},
{
"epoch": 0.16754550093553325,
"grad_norm": 0.7988236546516418,
"learning_rate": 2.9979916215904247e-05,
"loss": 0.0499,
"step": 3940
},
{
"epoch": 0.1679707433236945,
"grad_norm": 0.8732298016548157,
"learning_rate": 2.99794862665288e-05,
"loss": 0.0477,
"step": 3950
},
{
"epoch": 0.16839598571185577,
"grad_norm": 1.0647722482681274,
"learning_rate": 2.9979051766780938e-05,
"loss": 0.0517,
"step": 3960
},
{
"epoch": 0.168821228100017,
"grad_norm": 0.8953602313995361,
"learning_rate": 2.9978612716792647e-05,
"loss": 0.0509,
"step": 3970
},
{
"epoch": 0.16924647048817826,
"grad_norm": 0.9406638145446777,
"learning_rate": 2.99781691166973e-05,
"loss": 0.0574,
"step": 3980
},
{
"epoch": 0.1696717128763395,
"grad_norm": 1.0174944400787354,
"learning_rate": 2.9977720966629646e-05,
"loss": 0.0515,
"step": 3990
},
{
"epoch": 0.17009695526450078,
"grad_norm": 0.9914764761924744,
"learning_rate": 2.997726826672583e-05,
"loss": 0.0507,
"step": 4000
},
{
"epoch": 0.17052219765266202,
"grad_norm": 0.9415300488471985,
"learning_rate": 2.9976811017123368e-05,
"loss": 0.0519,
"step": 4010
},
{
"epoch": 0.17094744004082327,
"grad_norm": 0.8257099986076355,
"learning_rate": 2.9976349217961158e-05,
"loss": 0.0501,
"step": 4020
},
{
"epoch": 0.1713726824289845,
"grad_norm": 1.0278548002243042,
"learning_rate": 2.9975882869379482e-05,
"loss": 0.0505,
"step": 4030
},
{
"epoch": 0.1717979248171458,
"grad_norm": 0.9726608991622925,
"learning_rate": 2.997541197152001e-05,
"loss": 0.0506,
"step": 4040
},
{
"epoch": 0.17222316720530703,
"grad_norm": 0.9818130731582642,
"learning_rate": 2.9974936524525774e-05,
"loss": 0.0464,
"step": 4050
},
{
"epoch": 0.17264840959346828,
"grad_norm": 0.796649694442749,
"learning_rate": 2.9974456528541222e-05,
"loss": 0.0503,
"step": 4060
},
{
"epoch": 0.17307365198162952,
"grad_norm": 0.9704950451850891,
"learning_rate": 2.9973971983712147e-05,
"loss": 0.0505,
"step": 4070
},
{
"epoch": 0.17349889436979077,
"grad_norm": 1.0106730461120605,
"learning_rate": 2.9973482890185753e-05,
"loss": 0.0491,
"step": 4080
},
{
"epoch": 0.17392413675795204,
"grad_norm": 1.1311920881271362,
"learning_rate": 2.9972989248110606e-05,
"loss": 0.0523,
"step": 4090
},
{
"epoch": 0.1743493791461133,
"grad_norm": 1.1980477571487427,
"learning_rate": 2.9972491057636665e-05,
"loss": 0.0498,
"step": 4100
},
{
"epoch": 0.17477462153427453,
"grad_norm": 0.9751189351081848,
"learning_rate": 2.9971988318915268e-05,
"loss": 0.0498,
"step": 4110
},
{
"epoch": 0.17519986392243578,
"grad_norm": 1.084840178489685,
"learning_rate": 2.997148103209913e-05,
"loss": 0.0486,
"step": 4120
},
{
"epoch": 0.17562510631059705,
"grad_norm": 0.8263355493545532,
"learning_rate": 2.9970969197342356e-05,
"loss": 0.0466,
"step": 4130
},
{
"epoch": 0.1760503486987583,
"grad_norm": 0.8387073874473572,
"learning_rate": 2.9970452814800422e-05,
"loss": 0.054,
"step": 4140
},
{
"epoch": 0.17647559108691954,
"grad_norm": 1.1868751049041748,
"learning_rate": 2.99699318846302e-05,
"loss": 0.0487,
"step": 4150
},
{
"epoch": 0.1769008334750808,
"grad_norm": 0.7838359475135803,
"learning_rate": 2.9969406406989927e-05,
"loss": 0.0507,
"step": 4160
},
{
"epoch": 0.17732607586324206,
"grad_norm": 1.2756154537200928,
"learning_rate": 2.996887638203923e-05,
"loss": 0.0484,
"step": 4170
},
{
"epoch": 0.1777513182514033,
"grad_norm": 0.7980268597602844,
"learning_rate": 2.9968341809939116e-05,
"loss": 0.0493,
"step": 4180
},
{
"epoch": 0.17817656063956455,
"grad_norm": 0.8188315629959106,
"learning_rate": 2.9967802690851985e-05,
"loss": 0.0457,
"step": 4190
},
{
"epoch": 0.1786018030277258,
"grad_norm": 0.8356110453605652,
"learning_rate": 2.9967259024941595e-05,
"loss": 0.0483,
"step": 4200
},
{
"epoch": 0.17902704541588704,
"grad_norm": 0.6958544254302979,
"learning_rate": 2.9966710812373097e-05,
"loss": 0.0473,
"step": 4210
},
{
"epoch": 0.17945228780404832,
"grad_norm": 0.8168766498565674,
"learning_rate": 2.996615805331303e-05,
"loss": 0.0466,
"step": 4220
},
{
"epoch": 0.17987753019220956,
"grad_norm": 0.8009299039840698,
"learning_rate": 2.99656007479293e-05,
"loss": 0.0453,
"step": 4230
},
{
"epoch": 0.1803027725803708,
"grad_norm": 0.9034857153892517,
"learning_rate": 2.9965038896391213e-05,
"loss": 0.0482,
"step": 4240
},
{
"epoch": 0.18072801496853205,
"grad_norm": 1.084303379058838,
"learning_rate": 2.9964472498869434e-05,
"loss": 0.0486,
"step": 4250
},
{
"epoch": 0.18115325735669333,
"grad_norm": 0.9920953512191772,
"learning_rate": 2.996390155553603e-05,
"loss": 0.0474,
"step": 4260
},
{
"epoch": 0.18157849974485457,
"grad_norm": 0.8993090391159058,
"learning_rate": 2.9963326066564426e-05,
"loss": 0.0465,
"step": 4270
},
{
"epoch": 0.18200374213301582,
"grad_norm": 0.8214529752731323,
"learning_rate": 2.996274603212945e-05,
"loss": 0.046,
"step": 4280
},
{
"epoch": 0.18242898452117706,
"grad_norm": 0.847908079624176,
"learning_rate": 2.9962161452407296e-05,
"loss": 0.0467,
"step": 4290
},
{
"epoch": 0.18285422690933834,
"grad_norm": 0.8442016839981079,
"learning_rate": 2.996157232757555e-05,
"loss": 0.0489,
"step": 4300
},
{
"epoch": 0.18327946929749958,
"grad_norm": 0.9253048300743103,
"learning_rate": 2.996097865781316e-05,
"loss": 0.0522,
"step": 4310
},
{
"epoch": 0.18370471168566083,
"grad_norm": 0.7988717555999756,
"learning_rate": 2.996038044330048e-05,
"loss": 0.0467,
"step": 4320
},
{
"epoch": 0.18412995407382207,
"grad_norm": 0.866479754447937,
"learning_rate": 2.9959777684219233e-05,
"loss": 0.0484,
"step": 4330
},
{
"epoch": 0.18455519646198332,
"grad_norm": 0.9718467593193054,
"learning_rate": 2.9959170380752508e-05,
"loss": 0.048,
"step": 4340
},
{
"epoch": 0.1849804388501446,
"grad_norm": 0.8483349084854126,
"learning_rate": 2.99585585330848e-05,
"loss": 0.0469,
"step": 4350
},
{
"epoch": 0.18540568123830584,
"grad_norm": 0.7021053433418274,
"learning_rate": 2.9957942141401967e-05,
"loss": 0.0451,
"step": 4360
},
{
"epoch": 0.18583092362646708,
"grad_norm": 0.8285357356071472,
"learning_rate": 2.995732120589125e-05,
"loss": 0.0483,
"step": 4370
},
{
"epoch": 0.18625616601462833,
"grad_norm": 1.0425705909729004,
"learning_rate": 2.9956695726741276e-05,
"loss": 0.0488,
"step": 4380
},
{
"epoch": 0.1866814084027896,
"grad_norm": 0.9305728673934937,
"learning_rate": 2.995606570414205e-05,
"loss": 0.0473,
"step": 4390
},
{
"epoch": 0.18710665079095085,
"grad_norm": 0.8360809683799744,
"learning_rate": 2.9955431138284955e-05,
"loss": 0.0491,
"step": 4400
},
{
"epoch": 0.1875318931791121,
"grad_norm": 0.9870768785476685,
"learning_rate": 2.9954792029362754e-05,
"loss": 0.0508,
"step": 4410
},
{
"epoch": 0.18795713556727334,
"grad_norm": 0.7025906443595886,
"learning_rate": 2.9954148377569598e-05,
"loss": 0.0447,
"step": 4420
},
{
"epoch": 0.1883823779554346,
"grad_norm": 0.8500348329544067,
"learning_rate": 2.9953500183101002e-05,
"loss": 0.048,
"step": 4430
},
{
"epoch": 0.18880762034359586,
"grad_norm": 0.8822534084320068,
"learning_rate": 2.9952847446153877e-05,
"loss": 0.047,
"step": 4440
},
{
"epoch": 0.1892328627317571,
"grad_norm": 0.9271929264068604,
"learning_rate": 2.9952190166926498e-05,
"loss": 0.049,
"step": 4450
},
{
"epoch": 0.18965810511991835,
"grad_norm": 0.8434593677520752,
"learning_rate": 2.9951528345618546e-05,
"loss": 0.046,
"step": 4460
},
{
"epoch": 0.1900833475080796,
"grad_norm": 0.7526440024375916,
"learning_rate": 2.995086198243105e-05,
"loss": 0.046,
"step": 4470
},
{
"epoch": 0.19050858989624087,
"grad_norm": 0.8926873803138733,
"learning_rate": 2.9950191077566434e-05,
"loss": 0.0487,
"step": 4480
},
{
"epoch": 0.1909338322844021,
"grad_norm": 0.8471497297286987,
"learning_rate": 2.9949515631228515e-05,
"loss": 0.0448,
"step": 4490
},
{
"epoch": 0.19135907467256336,
"grad_norm": 0.7868749499320984,
"learning_rate": 2.994883564362246e-05,
"loss": 0.0488,
"step": 4500
},
{
"epoch": 0.1917843170607246,
"grad_norm": 0.6812358498573303,
"learning_rate": 2.994815111495484e-05,
"loss": 0.0442,
"step": 4510
},
{
"epoch": 0.19220955944888587,
"grad_norm": 0.727590024471283,
"learning_rate": 2.9947462045433594e-05,
"loss": 0.0479,
"step": 4520
},
{
"epoch": 0.19263480183704712,
"grad_norm": 0.6835356950759888,
"learning_rate": 2.9946768435268045e-05,
"loss": 0.0474,
"step": 4530
},
{
"epoch": 0.19306004422520837,
"grad_norm": 0.7723318934440613,
"learning_rate": 2.9946070284668884e-05,
"loss": 0.0451,
"step": 4540
},
{
"epoch": 0.1934852866133696,
"grad_norm": 0.8470324277877808,
"learning_rate": 2.994536759384821e-05,
"loss": 0.0475,
"step": 4550
},
{
"epoch": 0.19391052900153088,
"grad_norm": 0.7473281025886536,
"learning_rate": 2.994466036301946e-05,
"loss": 0.0484,
"step": 4560
},
{
"epoch": 0.19433577138969213,
"grad_norm": 0.9247629046440125,
"learning_rate": 2.9943948592397494e-05,
"loss": 0.0446,
"step": 4570
},
{
"epoch": 0.19476101377785338,
"grad_norm": 0.8116459250450134,
"learning_rate": 2.994323228219851e-05,
"loss": 0.0501,
"step": 4580
},
{
"epoch": 0.19518625616601462,
"grad_norm": 0.8559114933013916,
"learning_rate": 2.9942511432640113e-05,
"loss": 0.0495,
"step": 4590
},
{
"epoch": 0.19561149855417587,
"grad_norm": 1.0107638835906982,
"learning_rate": 2.9941786043941278e-05,
"loss": 0.0466,
"step": 4600
},
{
"epoch": 0.19603674094233714,
"grad_norm": 0.9044751524925232,
"learning_rate": 2.9941056116322357e-05,
"loss": 0.0476,
"step": 4610
},
{
"epoch": 0.19646198333049839,
"grad_norm": 0.8107633590698242,
"learning_rate": 2.994032165000508e-05,
"loss": 0.047,
"step": 4620
},
{
"epoch": 0.19688722571865963,
"grad_norm": 0.9792898297309875,
"learning_rate": 2.9939582645212566e-05,
"loss": 0.0483,
"step": 4630
},
{
"epoch": 0.19731246810682088,
"grad_norm": 0.7129027247428894,
"learning_rate": 2.9938839102169303e-05,
"loss": 0.0437,
"step": 4640
},
{
"epoch": 0.19773771049498215,
"grad_norm": 0.6798853278160095,
"learning_rate": 2.993809102110116e-05,
"loss": 0.045,
"step": 4650
},
{
"epoch": 0.1981629528831434,
"grad_norm": 0.8705613613128662,
"learning_rate": 2.9937338402235373e-05,
"loss": 0.0459,
"step": 4660
},
{
"epoch": 0.19858819527130464,
"grad_norm": 0.9700260758399963,
"learning_rate": 2.993658124580058e-05,
"loss": 0.0472,
"step": 4670
},
{
"epoch": 0.19901343765946589,
"grad_norm": 1.0166648626327515,
"learning_rate": 2.9935819552026785e-05,
"loss": 0.0454,
"step": 4680
},
{
"epoch": 0.19943868004762716,
"grad_norm": 1.0023534297943115,
"learning_rate": 2.9935053321145368e-05,
"loss": 0.0501,
"step": 4690
},
{
"epoch": 0.1998639224357884,
"grad_norm": 0.820027768611908,
"learning_rate": 2.9934282553389088e-05,
"loss": 0.0458,
"step": 4700
},
{
"epoch": 0.20028916482394965,
"grad_norm": 0.881223201751709,
"learning_rate": 2.9933507248992084e-05,
"loss": 0.046,
"step": 4710
},
{
"epoch": 0.2007144072121109,
"grad_norm": 0.838016927242279,
"learning_rate": 2.9932727408189876e-05,
"loss": 0.0471,
"step": 4720
},
{
"epoch": 0.20113964960027217,
"grad_norm": 0.9668949246406555,
"learning_rate": 2.9931943031219356e-05,
"loss": 0.0446,
"step": 4730
},
{
"epoch": 0.20156489198843341,
"grad_norm": 0.762941300868988,
"learning_rate": 2.9931154118318803e-05,
"loss": 0.0439,
"step": 4740
},
{
"epoch": 0.20199013437659466,
"grad_norm": 0.6991223692893982,
"learning_rate": 2.9930360669727858e-05,
"loss": 0.0476,
"step": 4750
},
{
"epoch": 0.2024153767647559,
"grad_norm": 0.744959831237793,
"learning_rate": 2.992956268568755e-05,
"loss": 0.0473,
"step": 4760
},
{
"epoch": 0.20284061915291715,
"grad_norm": 0.8024082183837891,
"learning_rate": 2.99287601664403e-05,
"loss": 0.0487,
"step": 4770
},
{
"epoch": 0.20326586154107842,
"grad_norm": 0.9071053266525269,
"learning_rate": 2.992795311222988e-05,
"loss": 0.0484,
"step": 4780
},
{
"epoch": 0.20369110392923967,
"grad_norm": 0.9340217113494873,
"learning_rate": 2.9927141523301453e-05,
"loss": 0.0473,
"step": 4790
},
{
"epoch": 0.20411634631740092,
"grad_norm": 0.9845976829528809,
"learning_rate": 2.992632539990156e-05,
"loss": 0.0447,
"step": 4800
},
{
"epoch": 0.20454158870556216,
"grad_norm": 0.8842020034790039,
"learning_rate": 2.9925504742278117e-05,
"loss": 0.0492,
"step": 4810
},
{
"epoch": 0.20496683109372343,
"grad_norm": 0.9457699060440063,
"learning_rate": 2.9924679550680416e-05,
"loss": 0.0463,
"step": 4820
},
{
"epoch": 0.20539207348188468,
"grad_norm": 0.8074204921722412,
"learning_rate": 2.9923849825359133e-05,
"loss": 0.0467,
"step": 4830
},
{
"epoch": 0.20581731587004592,
"grad_norm": 0.7934054136276245,
"learning_rate": 2.9923015566566317e-05,
"loss": 0.0462,
"step": 4840
},
{
"epoch": 0.20624255825820717,
"grad_norm": 0.8365235924720764,
"learning_rate": 2.992217677455539e-05,
"loss": 0.0447,
"step": 4850
},
{
"epoch": 0.20666780064636844,
"grad_norm": 0.8392214775085449,
"learning_rate": 2.9921333449581153e-05,
"loss": 0.0481,
"step": 4860
},
{
"epoch": 0.2070930430345297,
"grad_norm": 0.9338740110397339,
"learning_rate": 2.9920485591899795e-05,
"loss": 0.0459,
"step": 4870
},
{
"epoch": 0.20751828542269093,
"grad_norm": 0.7019702196121216,
"learning_rate": 2.991963320176886e-05,
"loss": 0.0438,
"step": 4880
},
{
"epoch": 0.20794352781085218,
"grad_norm": 0.7933806777000427,
"learning_rate": 2.991877627944729e-05,
"loss": 0.0432,
"step": 4890
},
{
"epoch": 0.20836877019901343,
"grad_norm": 0.8409459590911865,
"learning_rate": 2.9917914825195393e-05,
"loss": 0.0464,
"step": 4900
},
{
"epoch": 0.2087940125871747,
"grad_norm": 0.7470780611038208,
"learning_rate": 2.991704883927486e-05,
"loss": 0.0427,
"step": 4910
},
{
"epoch": 0.20921925497533594,
"grad_norm": 0.7226709127426147,
"learning_rate": 2.991617832194875e-05,
"loss": 0.0491,
"step": 4920
},
{
"epoch": 0.2096444973634972,
"grad_norm": 0.7474321126937866,
"learning_rate": 2.9915303273481504e-05,
"loss": 0.0484,
"step": 4930
},
{
"epoch": 0.21006973975165844,
"grad_norm": 0.8669159412384033,
"learning_rate": 2.9914423694138942e-05,
"loss": 0.044,
"step": 4940
},
{
"epoch": 0.2104949821398197,
"grad_norm": 0.9365962147712708,
"learning_rate": 2.9913539584188253e-05,
"loss": 0.0487,
"step": 4950
},
{
"epoch": 0.21092022452798095,
"grad_norm": 0.7574573755264282,
"learning_rate": 2.9912650943898008e-05,
"loss": 0.0482,
"step": 4960
},
{
"epoch": 0.2113454669161422,
"grad_norm": 0.6261996626853943,
"learning_rate": 2.9911757773538148e-05,
"loss": 0.0443,
"step": 4970
},
{
"epoch": 0.21177070930430344,
"grad_norm": 0.9774627089500427,
"learning_rate": 2.9910860073380005e-05,
"loss": 0.046,
"step": 4980
},
{
"epoch": 0.21219595169246472,
"grad_norm": 0.8433800935745239,
"learning_rate": 2.9909957843696267e-05,
"loss": 0.0458,
"step": 4990
},
{
"epoch": 0.21262119408062596,
"grad_norm": 0.6524537801742554,
"learning_rate": 2.990905108476101e-05,
"loss": 0.0488,
"step": 5000
},
{
"epoch": 0.2130464364687872,
"grad_norm": 0.6434529423713684,
"learning_rate": 2.9908139796849683e-05,
"loss": 0.0455,
"step": 5010
},
{
"epoch": 0.21347167885694845,
"grad_norm": 0.7861361503601074,
"learning_rate": 2.9907223980239114e-05,
"loss": 0.0461,
"step": 5020
},
{
"epoch": 0.2138969212451097,
"grad_norm": 0.5959680676460266,
"learning_rate": 2.99063036352075e-05,
"loss": 0.0442,
"step": 5030
},
{
"epoch": 0.21432216363327097,
"grad_norm": 0.6599105000495911,
"learning_rate": 2.9905378762034424e-05,
"loss": 0.0445,
"step": 5040
},
{
"epoch": 0.21474740602143222,
"grad_norm": 0.7693561315536499,
"learning_rate": 2.9904449361000833e-05,
"loss": 0.0441,
"step": 5050
},
{
"epoch": 0.21517264840959346,
"grad_norm": 0.8092666864395142,
"learning_rate": 2.9903515432389056e-05,
"loss": 0.0431,
"step": 5060
},
{
"epoch": 0.2155978907977547,
"grad_norm": 0.7702406644821167,
"learning_rate": 2.990257697648279e-05,
"loss": 0.0433,
"step": 5070
},
{
"epoch": 0.21602313318591598,
"grad_norm": 0.8427804708480835,
"learning_rate": 2.9901633993567125e-05,
"loss": 0.0417,
"step": 5080
},
{
"epoch": 0.21644837557407723,
"grad_norm": 0.7076109051704407,
"learning_rate": 2.99006864839285e-05,
"loss": 0.0489,
"step": 5090
},
{
"epoch": 0.21687361796223847,
"grad_norm": 0.6427155137062073,
"learning_rate": 2.989973444785476e-05,
"loss": 0.0482,
"step": 5100
},
{
"epoch": 0.21729886035039972,
"grad_norm": 0.7994742393493652,
"learning_rate": 2.989877788563509e-05,
"loss": 0.0471,
"step": 5110
},
{
"epoch": 0.217724102738561,
"grad_norm": 0.7038072943687439,
"learning_rate": 2.9897816797560085e-05,
"loss": 0.0429,
"step": 5120
},
{
"epoch": 0.21814934512672224,
"grad_norm": 0.8199881315231323,
"learning_rate": 2.989685118392169e-05,
"loss": 0.044,
"step": 5130
},
{
"epoch": 0.21857458751488348,
"grad_norm": 0.9012776017189026,
"learning_rate": 2.9895881045013232e-05,
"loss": 0.0482,
"step": 5140
},
{
"epoch": 0.21899982990304473,
"grad_norm": 0.9288299679756165,
"learning_rate": 2.9894906381129414e-05,
"loss": 0.0457,
"step": 5150
},
{
"epoch": 0.21942507229120597,
"grad_norm": 0.8141002058982849,
"learning_rate": 2.9893927192566316e-05,
"loss": 0.0465,
"step": 5160
},
{
"epoch": 0.21985031467936725,
"grad_norm": 0.8564704060554504,
"learning_rate": 2.989294347962139e-05,
"loss": 0.0433,
"step": 5170
},
{
"epoch": 0.2202755570675285,
"grad_norm": 0.7727344036102295,
"learning_rate": 2.989195524259346e-05,
"loss": 0.0444,
"step": 5180
},
{
"epoch": 0.22070079945568974,
"grad_norm": 0.9129493236541748,
"learning_rate": 2.9890962481782723e-05,
"loss": 0.0455,
"step": 5190
},
{
"epoch": 0.22112604184385098,
"grad_norm": 0.6133219003677368,
"learning_rate": 2.9889965197490757e-05,
"loss": 0.044,
"step": 5200
},
{
"epoch": 0.22155128423201226,
"grad_norm": 0.5847681760787964,
"learning_rate": 2.988896339002052e-05,
"loss": 0.0466,
"step": 5210
},
{
"epoch": 0.2219765266201735,
"grad_norm": 0.7327444553375244,
"learning_rate": 2.9887957059676315e-05,
"loss": 0.0444,
"step": 5220
},
{
"epoch": 0.22240176900833475,
"grad_norm": 0.6856175065040588,
"learning_rate": 2.9886946206763855e-05,
"loss": 0.0453,
"step": 5230
},
{
"epoch": 0.222827011396496,
"grad_norm": 0.8159507513046265,
"learning_rate": 2.9885930831590202e-05,
"loss": 0.0446,
"step": 5240
},
{
"epoch": 0.22325225378465727,
"grad_norm": 0.7875511646270752,
"learning_rate": 2.98849109344638e-05,
"loss": 0.044,
"step": 5250
},
{
"epoch": 0.2236774961728185,
"grad_norm": 0.7698711156845093,
"learning_rate": 2.9883886515694474e-05,
"loss": 0.0451,
"step": 5260
},
{
"epoch": 0.22410273856097976,
"grad_norm": 0.958143413066864,
"learning_rate": 2.988285757559341e-05,
"loss": 0.0452,
"step": 5270
},
{
"epoch": 0.224527980949141,
"grad_norm": 1.0015833377838135,
"learning_rate": 2.988182411447317e-05,
"loss": 0.045,
"step": 5280
},
{
"epoch": 0.22495322333730225,
"grad_norm": 0.7416315674781799,
"learning_rate": 2.98807861326477e-05,
"loss": 0.0457,
"step": 5290
},
{
"epoch": 0.22537846572546352,
"grad_norm": 0.6568078994750977,
"learning_rate": 2.9879743630432307e-05,
"loss": 0.0444,
"step": 5300
},
{
"epoch": 0.22580370811362477,
"grad_norm": 0.7507728338241577,
"learning_rate": 2.9878696608143677e-05,
"loss": 0.0419,
"step": 5310
},
{
"epoch": 0.226228950501786,
"grad_norm": 0.7239499688148499,
"learning_rate": 2.9877645066099862e-05,
"loss": 0.0431,
"step": 5320
},
{
"epoch": 0.22665419288994726,
"grad_norm": 0.6904134154319763,
"learning_rate": 2.9876589004620298e-05,
"loss": 0.0437,
"step": 5330
},
{
"epoch": 0.22707943527810853,
"grad_norm": 0.7209636569023132,
"learning_rate": 2.987552842402579e-05,
"loss": 0.0472,
"step": 5340
},
{
"epoch": 0.22750467766626978,
"grad_norm": 0.7524945735931396,
"learning_rate": 2.9874463324638515e-05,
"loss": 0.0423,
"step": 5350
},
{
"epoch": 0.22792992005443102,
"grad_norm": 0.8387914299964905,
"learning_rate": 2.9873393706782016e-05,
"loss": 0.0439,
"step": 5360
},
{
"epoch": 0.22835516244259227,
"grad_norm": 0.7968807220458984,
"learning_rate": 2.987231957078122e-05,
"loss": 0.0424,
"step": 5370
},
{
"epoch": 0.22878040483075354,
"grad_norm": 0.8148554563522339,
"learning_rate": 2.9871240916962414e-05,
"loss": 0.0416,
"step": 5380
},
{
"epoch": 0.2292056472189148,
"grad_norm": 0.6972390413284302,
"learning_rate": 2.9870157745653274e-05,
"loss": 0.0423,
"step": 5390
},
{
"epoch": 0.22963088960707603,
"grad_norm": 0.9805029630661011,
"learning_rate": 2.9869070057182838e-05,
"loss": 0.042,
"step": 5400
},
{
"epoch": 0.23005613199523728,
"grad_norm": 0.9213829636573792,
"learning_rate": 2.986797785188151e-05,
"loss": 0.045,
"step": 5410
},
{
"epoch": 0.23048137438339852,
"grad_norm": 0.8021851778030396,
"learning_rate": 2.9866881130081074e-05,
"loss": 0.0431,
"step": 5420
},
{
"epoch": 0.2309066167715598,
"grad_norm": 0.7615094184875488,
"learning_rate": 2.986577989211469e-05,
"loss": 0.0442,
"step": 5430
},
{
"epoch": 0.23133185915972104,
"grad_norm": 0.6527719497680664,
"learning_rate": 2.9864674138316887e-05,
"loss": 0.0434,
"step": 5440
},
{
"epoch": 0.2317571015478823,
"grad_norm": 0.7374159097671509,
"learning_rate": 2.9863563869023554e-05,
"loss": 0.0412,
"step": 5450
},
{
"epoch": 0.23218234393604353,
"grad_norm": 0.7521957159042358,
"learning_rate": 2.9862449084571972e-05,
"loss": 0.0418,
"step": 5460
},
{
"epoch": 0.2326075863242048,
"grad_norm": 0.7196958065032959,
"learning_rate": 2.9861329785300774e-05,
"loss": 0.0434,
"step": 5470
},
{
"epoch": 0.23303282871236605,
"grad_norm": 0.8486290574073792,
"learning_rate": 2.9860205971549978e-05,
"loss": 0.0423,
"step": 5480
},
{
"epoch": 0.2334580711005273,
"grad_norm": 0.8910712003707886,
"learning_rate": 2.9859077643660974e-05,
"loss": 0.0425,
"step": 5490
},
{
"epoch": 0.23388331348868854,
"grad_norm": 0.8362525701522827,
"learning_rate": 2.985794480197651e-05,
"loss": 0.0492,
"step": 5500
},
{
"epoch": 0.23430855587684982,
"grad_norm": 0.817209780216217,
"learning_rate": 2.9856807446840716e-05,
"loss": 0.0396,
"step": 5510
},
{
"epoch": 0.23473379826501106,
"grad_norm": 0.7105398178100586,
"learning_rate": 2.9855665578599093e-05,
"loss": 0.0427,
"step": 5520
},
{
"epoch": 0.2351590406531723,
"grad_norm": 0.8205658197402954,
"learning_rate": 2.9854519197598504e-05,
"loss": 0.0409,
"step": 5530
},
{
"epoch": 0.23558428304133355,
"grad_norm": 0.5791633725166321,
"learning_rate": 2.9853368304187197e-05,
"loss": 0.0423,
"step": 5540
},
{
"epoch": 0.2360095254294948,
"grad_norm": 0.6344013810157776,
"learning_rate": 2.9852212898714783e-05,
"loss": 0.0436,
"step": 5550
},
{
"epoch": 0.23643476781765607,
"grad_norm": 0.7818456888198853,
"learning_rate": 2.9851052981532233e-05,
"loss": 0.0437,
"step": 5560
},
{
"epoch": 0.23686001020581732,
"grad_norm": 0.6619543433189392,
"learning_rate": 2.9849888552991908e-05,
"loss": 0.0441,
"step": 5570
},
{
"epoch": 0.23728525259397856,
"grad_norm": 0.6021639704704285,
"learning_rate": 2.9848719613447535e-05,
"loss": 0.0496,
"step": 5580
},
{
"epoch": 0.2377104949821398,
"grad_norm": 0.6315738558769226,
"learning_rate": 2.9847546163254194e-05,
"loss": 0.0417,
"step": 5590
},
{
"epoch": 0.23813573737030108,
"grad_norm": 0.7588102221488953,
"learning_rate": 2.984636820276836e-05,
"loss": 0.0454,
"step": 5600
},
{
"epoch": 0.23856097975846233,
"grad_norm": 0.7127803564071655,
"learning_rate": 2.984518573234786e-05,
"loss": 0.0441,
"step": 5610
},
{
"epoch": 0.23898622214662357,
"grad_norm": 0.8134715557098389,
"learning_rate": 2.98439987523519e-05,
"loss": 0.0426,
"step": 5620
},
{
"epoch": 0.23941146453478482,
"grad_norm": 0.6682436466217041,
"learning_rate": 2.9842807263141052e-05,
"loss": 0.0429,
"step": 5630
},
{
"epoch": 0.2398367069229461,
"grad_norm": 0.7612809538841248,
"learning_rate": 2.9841611265077256e-05,
"loss": 0.0404,
"step": 5640
},
{
"epoch": 0.24026194931110734,
"grad_norm": 0.6678027510643005,
"learning_rate": 2.9840410758523832e-05,
"loss": 0.0426,
"step": 5650
},
{
"epoch": 0.24068719169926858,
"grad_norm": 0.5913005471229553,
"learning_rate": 2.9839205743845453e-05,
"loss": 0.0423,
"step": 5660
},
{
"epoch": 0.24111243408742983,
"grad_norm": 0.903803825378418,
"learning_rate": 2.983799622140818e-05,
"loss": 0.0451,
"step": 5670
},
{
"epoch": 0.24153767647559107,
"grad_norm": 0.6426815986633301,
"learning_rate": 2.9836782191579425e-05,
"loss": 0.0422,
"step": 5680
},
{
"epoch": 0.24196291886375235,
"grad_norm": 0.8089521527290344,
"learning_rate": 2.983556365472799e-05,
"loss": 0.0444,
"step": 5690
},
{
"epoch": 0.2423881612519136,
"grad_norm": 0.7508390545845032,
"learning_rate": 2.9834340611224022e-05,
"loss": 0.0403,
"step": 5700
},
{
"epoch": 0.24281340364007484,
"grad_norm": 0.6964208483695984,
"learning_rate": 2.9833113061439057e-05,
"loss": 0.042,
"step": 5710
},
{
"epoch": 0.24323864602823608,
"grad_norm": 0.8737379312515259,
"learning_rate": 2.9831881005745986e-05,
"loss": 0.0403,
"step": 5720
},
{
"epoch": 0.24366388841639736,
"grad_norm": 0.6892777681350708,
"learning_rate": 2.983064444451908e-05,
"loss": 0.0441,
"step": 5730
},
{
"epoch": 0.2440891308045586,
"grad_norm": 0.8702257871627808,
"learning_rate": 2.9829403378133974e-05,
"loss": 0.0443,
"step": 5740
},
{
"epoch": 0.24451437319271985,
"grad_norm": 0.7382299900054932,
"learning_rate": 2.9828157806967668e-05,
"loss": 0.0421,
"step": 5750
},
{
"epoch": 0.2449396155808811,
"grad_norm": 0.8885031938552856,
"learning_rate": 2.9826907731398533e-05,
"loss": 0.0416,
"step": 5760
},
{
"epoch": 0.24536485796904237,
"grad_norm": 0.5969104766845703,
"learning_rate": 2.9825653151806315e-05,
"loss": 0.0447,
"step": 5770
},
{
"epoch": 0.2457901003572036,
"grad_norm": 0.7052406668663025,
"learning_rate": 2.9824394068572114e-05,
"loss": 0.0427,
"step": 5780
},
{
"epoch": 0.24621534274536486,
"grad_norm": 0.5495277643203735,
"learning_rate": 2.9823130482078415e-05,
"loss": 0.0417,
"step": 5790
},
{
"epoch": 0.2466405851335261,
"grad_norm": 0.6981067657470703,
"learning_rate": 2.9821862392709054e-05,
"loss": 0.0431,
"step": 5800
},
{
"epoch": 0.24706582752168738,
"grad_norm": 0.9393956065177917,
"learning_rate": 2.982058980084925e-05,
"loss": 0.0441,
"step": 5810
},
{
"epoch": 0.24749106990984862,
"grad_norm": 0.6969560980796814,
"learning_rate": 2.9819312706885577e-05,
"loss": 0.0439,
"step": 5820
},
{
"epoch": 0.24791631229800987,
"grad_norm": 0.7812885642051697,
"learning_rate": 2.9818031111205986e-05,
"loss": 0.039,
"step": 5830
},
{
"epoch": 0.2483415546861711,
"grad_norm": 0.7769454121589661,
"learning_rate": 2.981674501419979e-05,
"loss": 0.0415,
"step": 5840
},
{
"epoch": 0.24876679707433236,
"grad_norm": 0.6432703137397766,
"learning_rate": 2.981545441625767e-05,
"loss": 0.0404,
"step": 5850
},
{
"epoch": 0.24919203946249363,
"grad_norm": 0.9023407697677612,
"learning_rate": 2.9814159317771682e-05,
"loss": 0.0432,
"step": 5860
},
{
"epoch": 0.24961728185065488,
"grad_norm": 0.9495925903320312,
"learning_rate": 2.9812859719135236e-05,
"loss": 0.0419,
"step": 5870
},
{
"epoch": 0.2500425242388161,
"grad_norm": 1.0032718181610107,
"learning_rate": 2.981155562074312e-05,
"loss": 0.0425,
"step": 5880
},
{
"epoch": 0.2504677666269774,
"grad_norm": 0.6281837224960327,
"learning_rate": 2.9810247022991483e-05,
"loss": 0.0387,
"step": 5890
},
{
"epoch": 0.2508930090151386,
"grad_norm": 0.7671382427215576,
"learning_rate": 2.9808933926277842e-05,
"loss": 0.0418,
"step": 5900
},
{
"epoch": 0.2513182514032999,
"grad_norm": 0.7822478413581848,
"learning_rate": 2.9807616331001078e-05,
"loss": 0.0418,
"step": 5910
},
{
"epoch": 0.25174349379146116,
"grad_norm": 0.7015408277511597,
"learning_rate": 2.9806294237561452e-05,
"loss": 0.0407,
"step": 5920
},
{
"epoch": 0.2521687361796224,
"grad_norm": 0.683319628238678,
"learning_rate": 2.980496764636057e-05,
"loss": 0.0439,
"step": 5930
},
{
"epoch": 0.25259397856778365,
"grad_norm": 0.6143163442611694,
"learning_rate": 2.9803636557801417e-05,
"loss": 0.042,
"step": 5940
},
{
"epoch": 0.25301922095594487,
"grad_norm": 0.8570902347564697,
"learning_rate": 2.980230097228835e-05,
"loss": 0.0438,
"step": 5950
},
{
"epoch": 0.25344446334410614,
"grad_norm": 0.6100302934646606,
"learning_rate": 2.9800960890227076e-05,
"loss": 0.0402,
"step": 5960
},
{
"epoch": 0.2538697057322674,
"grad_norm": 0.6212876439094543,
"learning_rate": 2.979961631202468e-05,
"loss": 0.0399,
"step": 5970
},
{
"epoch": 0.25429494812042863,
"grad_norm": 0.546836256980896,
"learning_rate": 2.9798267238089613e-05,
"loss": 0.0403,
"step": 5980
},
{
"epoch": 0.2547201905085899,
"grad_norm": 0.6100478172302246,
"learning_rate": 2.9796913668831684e-05,
"loss": 0.0404,
"step": 5990
},
{
"epoch": 0.2551454328967511,
"grad_norm": 0.5832479596138,
"learning_rate": 2.979555560466207e-05,
"loss": 0.0386,
"step": 6000
},
{
"epoch": 0.2555706752849124,
"grad_norm": 0.6781200766563416,
"learning_rate": 2.9794193045993317e-05,
"loss": 0.0425,
"step": 6010
},
{
"epoch": 0.25599591767307367,
"grad_norm": 0.5507116317749023,
"learning_rate": 2.9792825993239337e-05,
"loss": 0.0384,
"step": 6020
},
{
"epoch": 0.2564211600612349,
"grad_norm": 0.7260423898696899,
"learning_rate": 2.9791454446815404e-05,
"loss": 0.0438,
"step": 6030
},
{
"epoch": 0.25684640244939616,
"grad_norm": 0.6341103315353394,
"learning_rate": 2.9790078407138154e-05,
"loss": 0.0401,
"step": 6040
},
{
"epoch": 0.25727164483755743,
"grad_norm": 0.5935182571411133,
"learning_rate": 2.9788697874625588e-05,
"loss": 0.041,
"step": 6050
},
{
"epoch": 0.25769688722571865,
"grad_norm": 0.7649999260902405,
"learning_rate": 2.9787312849697088e-05,
"loss": 0.0416,
"step": 6060
},
{
"epoch": 0.2581221296138799,
"grad_norm": 0.7656498551368713,
"learning_rate": 2.978592333277338e-05,
"loss": 0.04,
"step": 6070
},
{
"epoch": 0.25854737200204114,
"grad_norm": 0.6227708458900452,
"learning_rate": 2.9784529324276557e-05,
"loss": 0.0441,
"step": 6080
},
{
"epoch": 0.2589726143902024,
"grad_norm": 0.6449046730995178,
"learning_rate": 2.9783130824630093e-05,
"loss": 0.0413,
"step": 6090
},
{
"epoch": 0.2593978567783637,
"grad_norm": 0.8230679035186768,
"learning_rate": 2.9781727834258806e-05,
"loss": 0.0409,
"step": 6100
},
{
"epoch": 0.2598230991665249,
"grad_norm": 0.810616672039032,
"learning_rate": 2.97803203535889e-05,
"loss": 0.0456,
"step": 6110
},
{
"epoch": 0.2602483415546862,
"grad_norm": 0.7623624205589294,
"learning_rate": 2.977890838304792e-05,
"loss": 0.0426,
"step": 6120
},
{
"epoch": 0.2606735839428474,
"grad_norm": 0.7329275012016296,
"learning_rate": 2.9777491923064782e-05,
"loss": 0.042,
"step": 6130
},
{
"epoch": 0.26109882633100867,
"grad_norm": 0.7706323266029358,
"learning_rate": 2.977607097406978e-05,
"loss": 0.0415,
"step": 6140
},
{
"epoch": 0.26152406871916994,
"grad_norm": 0.6100689172744751,
"learning_rate": 2.9774645536494552e-05,
"loss": 0.0384,
"step": 6150
},
{
"epoch": 0.26194931110733116,
"grad_norm": 0.516251802444458,
"learning_rate": 2.9773215610772117e-05,
"loss": 0.0401,
"step": 6160
},
{
"epoch": 0.26237455349549244,
"grad_norm": 0.7943555116653442,
"learning_rate": 2.9771781197336837e-05,
"loss": 0.0376,
"step": 6170
},
{
"epoch": 0.2627997958836537,
"grad_norm": 0.8178479671478271,
"learning_rate": 2.977034229662446e-05,
"loss": 0.0388,
"step": 6180
},
{
"epoch": 0.2632250382718149,
"grad_norm": 0.7358256578445435,
"learning_rate": 2.976889890907208e-05,
"loss": 0.0412,
"step": 6190
},
{
"epoch": 0.2636502806599762,
"grad_norm": 0.6371821761131287,
"learning_rate": 2.9767451035118164e-05,
"loss": 0.0398,
"step": 6200
},
{
"epoch": 0.2640755230481374,
"grad_norm": 0.549709141254425,
"learning_rate": 2.9765998675202536e-05,
"loss": 0.0416,
"step": 6210
},
{
"epoch": 0.2645007654362987,
"grad_norm": 0.6909655332565308,
"learning_rate": 2.9764541829766382e-05,
"loss": 0.0399,
"step": 6220
},
{
"epoch": 0.26492600782445996,
"grad_norm": 0.6212393641471863,
"learning_rate": 2.9763080499252253e-05,
"loss": 0.0377,
"step": 6230
},
{
"epoch": 0.2653512502126212,
"grad_norm": 0.5959421396255493,
"learning_rate": 2.9761614684104064e-05,
"loss": 0.0399,
"step": 6240
},
{
"epoch": 0.26577649260078245,
"grad_norm": 0.6034130454063416,
"learning_rate": 2.9760144384767095e-05,
"loss": 0.0393,
"step": 6250
},
{
"epoch": 0.2662017349889437,
"grad_norm": 0.6626703143119812,
"learning_rate": 2.9758669601687986e-05,
"loss": 0.0381,
"step": 6260
},
{
"epoch": 0.26662697737710495,
"grad_norm": 0.6859477758407593,
"learning_rate": 2.9757190335314722e-05,
"loss": 0.0406,
"step": 6270
},
{
"epoch": 0.2670522197652662,
"grad_norm": 0.7063243985176086,
"learning_rate": 2.975570658609668e-05,
"loss": 0.0391,
"step": 6280
},
{
"epoch": 0.26747746215342744,
"grad_norm": 0.5350667834281921,
"learning_rate": 2.9754218354484582e-05,
"loss": 0.0416,
"step": 6290
},
{
"epoch": 0.2679027045415887,
"grad_norm": 0.6627820134162903,
"learning_rate": 2.9752725640930504e-05,
"loss": 0.0386,
"step": 6300
},
{
"epoch": 0.26832794692975,
"grad_norm": 0.6602414846420288,
"learning_rate": 2.9751228445887905e-05,
"loss": 0.0391,
"step": 6310
},
{
"epoch": 0.2687531893179112,
"grad_norm": 0.6656312942504883,
"learning_rate": 2.9749726769811588e-05,
"loss": 0.0406,
"step": 6320
},
{
"epoch": 0.2691784317060725,
"grad_norm": 0.6035658121109009,
"learning_rate": 2.9748220613157717e-05,
"loss": 0.0396,
"step": 6330
},
{
"epoch": 0.2696036740942337,
"grad_norm": 0.589562177658081,
"learning_rate": 2.9746709976383832e-05,
"loss": 0.0377,
"step": 6340
},
{
"epoch": 0.27002891648239497,
"grad_norm": 0.5129275918006897,
"learning_rate": 2.974519485994882e-05,
"loss": 0.0451,
"step": 6350
},
{
"epoch": 0.27045415887055624,
"grad_norm": 0.6258460879325867,
"learning_rate": 2.9743675264312934e-05,
"loss": 0.0386,
"step": 6360
},
{
"epoch": 0.27087940125871746,
"grad_norm": 0.8174107670783997,
"learning_rate": 2.9742151189937784e-05,
"loss": 0.04,
"step": 6370
},
{
"epoch": 0.27130464364687873,
"grad_norm": 0.7886274456977844,
"learning_rate": 2.974062263728635e-05,
"loss": 0.0404,
"step": 6380
},
{
"epoch": 0.27172988603503995,
"grad_norm": 0.7022881507873535,
"learning_rate": 2.973908960682296e-05,
"loss": 0.0414,
"step": 6390
},
{
"epoch": 0.2721551284232012,
"grad_norm": 0.9441139698028564,
"learning_rate": 2.9737552099013313e-05,
"loss": 0.045,
"step": 6400
},
{
"epoch": 0.2725803708113625,
"grad_norm": 0.877943754196167,
"learning_rate": 2.9736010114324462e-05,
"loss": 0.042,
"step": 6410
},
{
"epoch": 0.2730056131995237,
"grad_norm": 0.851972758769989,
"learning_rate": 2.9734463653224813e-05,
"loss": 0.0423,
"step": 6420
},
{
"epoch": 0.273430855587685,
"grad_norm": 0.7306724190711975,
"learning_rate": 2.9732912716184155e-05,
"loss": 0.039,
"step": 6430
},
{
"epoch": 0.27385609797584626,
"grad_norm": 0.6330131888389587,
"learning_rate": 2.9731357303673607e-05,
"loss": 0.0411,
"step": 6440
},
{
"epoch": 0.2742813403640075,
"grad_norm": 0.8628656268119812,
"learning_rate": 2.9729797416165677e-05,
"loss": 0.0378,
"step": 6450
},
{
"epoch": 0.27470658275216875,
"grad_norm": 0.6638690233230591,
"learning_rate": 2.9728233054134202e-05,
"loss": 0.0425,
"step": 6460
},
{
"epoch": 0.27513182514032997,
"grad_norm": 0.607979953289032,
"learning_rate": 2.9726664218054405e-05,
"loss": 0.0359,
"step": 6470
},
{
"epoch": 0.27555706752849124,
"grad_norm": 0.6174076795578003,
"learning_rate": 2.9725090908402856e-05,
"loss": 0.0383,
"step": 6480
},
{
"epoch": 0.2759823099166525,
"grad_norm": 0.8638490438461304,
"learning_rate": 2.972351312565748e-05,
"loss": 0.0445,
"step": 6490
},
{
"epoch": 0.27640755230481373,
"grad_norm": 0.7757192850112915,
"learning_rate": 2.9721930870297567e-05,
"loss": 0.0422,
"step": 6500
},
{
"epoch": 0.276832794692975,
"grad_norm": 0.7784976363182068,
"learning_rate": 2.9720344142803766e-05,
"loss": 0.0427,
"step": 6510
},
{
"epoch": 0.2772580370811362,
"grad_norm": 0.7674238681793213,
"learning_rate": 2.9718752943658085e-05,
"loss": 0.0411,
"step": 6520
},
{
"epoch": 0.2776832794692975,
"grad_norm": 0.6946344971656799,
"learning_rate": 2.9717157273343885e-05,
"loss": 0.0434,
"step": 6530
},
{
"epoch": 0.27810852185745877,
"grad_norm": 0.8143453001976013,
"learning_rate": 2.971555713234589e-05,
"loss": 0.0381,
"step": 6540
},
{
"epoch": 0.27853376424562,
"grad_norm": 0.6152936220169067,
"learning_rate": 2.9713952521150176e-05,
"loss": 0.0413,
"step": 6550
},
{
"epoch": 0.27895900663378126,
"grad_norm": 0.5989219546318054,
"learning_rate": 2.971234344024419e-05,
"loss": 0.0399,
"step": 6560
},
{
"epoch": 0.27938424902194253,
"grad_norm": 0.6335272789001465,
"learning_rate": 2.971072989011672e-05,
"loss": 0.0403,
"step": 6570
},
{
"epoch": 0.27980949141010375,
"grad_norm": 0.5582631826400757,
"learning_rate": 2.9709111871257927e-05,
"loss": 0.0394,
"step": 6580
},
{
"epoch": 0.280234733798265,
"grad_norm": 0.5173953175544739,
"learning_rate": 2.9707489384159318e-05,
"loss": 0.0389,
"step": 6590
},
{
"epoch": 0.28065997618642624,
"grad_norm": 0.6251932382583618,
"learning_rate": 2.9705862429313763e-05,
"loss": 0.0412,
"step": 6600
},
{
"epoch": 0.2810852185745875,
"grad_norm": 0.5652943253517151,
"learning_rate": 2.9704231007215488e-05,
"loss": 0.0393,
"step": 6610
},
{
"epoch": 0.2815104609627488,
"grad_norm": 0.6986289620399475,
"learning_rate": 2.9702595118360077e-05,
"loss": 0.0424,
"step": 6620
},
{
"epoch": 0.28193570335091,
"grad_norm": 0.6257848739624023,
"learning_rate": 2.9700954763244465e-05,
"loss": 0.042,
"step": 6630
},
{
"epoch": 0.2823609457390713,
"grad_norm": 0.5983774662017822,
"learning_rate": 2.9699309942366953e-05,
"loss": 0.0409,
"step": 6640
},
{
"epoch": 0.2827861881272325,
"grad_norm": 0.6213322281837463,
"learning_rate": 2.969766065622719e-05,
"loss": 0.0416,
"step": 6650
},
{
"epoch": 0.28321143051539377,
"grad_norm": 0.6498160362243652,
"learning_rate": 2.96960069053262e-05,
"loss": 0.0384,
"step": 6660
},
{
"epoch": 0.28363667290355504,
"grad_norm": 0.6931298971176147,
"learning_rate": 2.9694348690166327e-05,
"loss": 0.04,
"step": 6670
},
{
"epoch": 0.28406191529171626,
"grad_norm": 0.5969164967536926,
"learning_rate": 2.969268601125131e-05,
"loss": 0.0375,
"step": 6680
},
{
"epoch": 0.28448715767987753,
"grad_norm": 0.7202589511871338,
"learning_rate": 2.969101886908622e-05,
"loss": 0.0388,
"step": 6690
},
{
"epoch": 0.2849124000680388,
"grad_norm": 0.709220826625824,
"learning_rate": 2.968934726417749e-05,
"loss": 0.0391,
"step": 6700
},
{
"epoch": 0.2853376424562,
"grad_norm": 0.7698434591293335,
"learning_rate": 2.968767119703291e-05,
"loss": 0.0385,
"step": 6710
},
{
"epoch": 0.2857628848443613,
"grad_norm": 0.6566237807273865,
"learning_rate": 2.9685990668161628e-05,
"loss": 0.0419,
"step": 6720
},
{
"epoch": 0.2861881272325225,
"grad_norm": 0.6004732847213745,
"learning_rate": 2.9684305678074137e-05,
"loss": 0.0388,
"step": 6730
},
{
"epoch": 0.2866133696206838,
"grad_norm": 0.736395537853241,
"learning_rate": 2.9682616227282304e-05,
"loss": 0.0405,
"step": 6740
},
{
"epoch": 0.28703861200884506,
"grad_norm": 0.6645581722259521,
"learning_rate": 2.968092231629933e-05,
"loss": 0.045,
"step": 6750
},
{
"epoch": 0.2874638543970063,
"grad_norm": 0.5871290564537048,
"learning_rate": 2.967922394563978e-05,
"loss": 0.0409,
"step": 6760
},
{
"epoch": 0.28788909678516755,
"grad_norm": 0.6334441304206848,
"learning_rate": 2.9677521115819585e-05,
"loss": 0.0416,
"step": 6770
},
{
"epoch": 0.28831433917332877,
"grad_norm": 0.5969275236129761,
"learning_rate": 2.967581382735601e-05,
"loss": 0.0387,
"step": 6780
},
{
"epoch": 0.28873958156149004,
"grad_norm": 0.7136366367340088,
"learning_rate": 2.9674102080767685e-05,
"loss": 0.0374,
"step": 6790
},
{
"epoch": 0.2891648239496513,
"grad_norm": 0.7302682399749756,
"learning_rate": 2.9672385876574597e-05,
"loss": 0.0393,
"step": 6800
},
{
"epoch": 0.28959006633781254,
"grad_norm": 0.665640652179718,
"learning_rate": 2.967066521529808e-05,
"loss": 0.0372,
"step": 6810
},
{
"epoch": 0.2900153087259738,
"grad_norm": 0.5665135383605957,
"learning_rate": 2.966894009746083e-05,
"loss": 0.0393,
"step": 6820
},
{
"epoch": 0.2904405511141351,
"grad_norm": 0.7281858921051025,
"learning_rate": 2.9667210523586888e-05,
"loss": 0.038,
"step": 6830
},
{
"epoch": 0.2908657935022963,
"grad_norm": 0.6732358336448669,
"learning_rate": 2.966547649420165e-05,
"loss": 0.0394,
"step": 6840
},
{
"epoch": 0.2912910358904576,
"grad_norm": 0.7290246486663818,
"learning_rate": 2.9663738009831877e-05,
"loss": 0.0377,
"step": 6850
},
{
"epoch": 0.2917162782786188,
"grad_norm": 0.6827811598777771,
"learning_rate": 2.966199507100567e-05,
"loss": 0.0398,
"step": 6860
},
{
"epoch": 0.29214152066678006,
"grad_norm": 0.46444040536880493,
"learning_rate": 2.9660247678252484e-05,
"loss": 0.0387,
"step": 6870
},
{
"epoch": 0.29256676305494134,
"grad_norm": 0.5850991606712341,
"learning_rate": 2.965849583210314e-05,
"loss": 0.0401,
"step": 6880
},
{
"epoch": 0.29299200544310255,
"grad_norm": 0.5107231140136719,
"learning_rate": 2.9656739533089794e-05,
"loss": 0.0372,
"step": 6890
},
{
"epoch": 0.2934172478312638,
"grad_norm": 0.6337826251983643,
"learning_rate": 2.9654978781745968e-05,
"loss": 0.0389,
"step": 6900
},
{
"epoch": 0.29384249021942505,
"grad_norm": 0.6117849349975586,
"learning_rate": 2.9653213578606534e-05,
"loss": 0.039,
"step": 6910
},
{
"epoch": 0.2942677326075863,
"grad_norm": 0.8882553577423096,
"learning_rate": 2.9651443924207704e-05,
"loss": 0.0398,
"step": 6920
},
{
"epoch": 0.2946929749957476,
"grad_norm": 0.7033731937408447,
"learning_rate": 2.964966981908706e-05,
"loss": 0.0391,
"step": 6930
},
{
"epoch": 0.2951182173839088,
"grad_norm": 0.6146150231361389,
"learning_rate": 2.964789126378353e-05,
"loss": 0.0377,
"step": 6940
},
{
"epoch": 0.2955434597720701,
"grad_norm": 0.7496012449264526,
"learning_rate": 2.9646108258837388e-05,
"loss": 0.0386,
"step": 6950
},
{
"epoch": 0.29596870216023136,
"grad_norm": 0.639735996723175,
"learning_rate": 2.9644320804790267e-05,
"loss": 0.0388,
"step": 6960
},
{
"epoch": 0.2963939445483926,
"grad_norm": 0.6214953064918518,
"learning_rate": 2.9642528902185142e-05,
"loss": 0.037,
"step": 6970
},
{
"epoch": 0.29681918693655385,
"grad_norm": 0.6856206059455872,
"learning_rate": 2.9640732551566354e-05,
"loss": 0.04,
"step": 6980
},
{
"epoch": 0.29724442932471506,
"grad_norm": 0.6917822957038879,
"learning_rate": 2.9638931753479578e-05,
"loss": 0.0362,
"step": 6990
},
{
"epoch": 0.29766967171287634,
"grad_norm": 0.752424955368042,
"learning_rate": 2.9637126508471858e-05,
"loss": 0.0382,
"step": 7000
},
{
"epoch": 0.2980949141010376,
"grad_norm": 0.6131302714347839,
"learning_rate": 2.9635316817091577e-05,
"loss": 0.0408,
"step": 7010
},
{
"epoch": 0.29852015648919883,
"grad_norm": 0.6820628046989441,
"learning_rate": 2.9633502679888467e-05,
"loss": 0.0367,
"step": 7020
},
{
"epoch": 0.2989453988773601,
"grad_norm": 0.5934157967567444,
"learning_rate": 2.963168409741362e-05,
"loss": 0.0365,
"step": 7030
},
{
"epoch": 0.2993706412655213,
"grad_norm": 0.6045647859573364,
"learning_rate": 2.9629861070219467e-05,
"loss": 0.0388,
"step": 7040
},
{
"epoch": 0.2997958836536826,
"grad_norm": 0.7339217662811279,
"learning_rate": 2.9628033598859807e-05,
"loss": 0.038,
"step": 7050
},
{
"epoch": 0.30022112604184387,
"grad_norm": 0.7751262784004211,
"learning_rate": 2.9626201683889764e-05,
"loss": 0.0351,
"step": 7060
},
{
"epoch": 0.3006463684300051,
"grad_norm": 0.710809051990509,
"learning_rate": 2.9624365325865835e-05,
"loss": 0.0419,
"step": 7070
},
{
"epoch": 0.30107161081816636,
"grad_norm": 0.7651996612548828,
"learning_rate": 2.962252452534585e-05,
"loss": 0.0381,
"step": 7080
},
{
"epoch": 0.30149685320632763,
"grad_norm": 0.8094403743743896,
"learning_rate": 2.9620679282889006e-05,
"loss": 0.0405,
"step": 7090
},
{
"epoch": 0.30192209559448885,
"grad_norm": 0.6429983973503113,
"learning_rate": 2.9618829599055833e-05,
"loss": 0.0386,
"step": 7100
},
{
"epoch": 0.3023473379826501,
"grad_norm": 0.5724627375602722,
"learning_rate": 2.961697547440821e-05,
"loss": 0.0394,
"step": 7110
},
{
"epoch": 0.30277258037081134,
"grad_norm": 0.5370784401893616,
"learning_rate": 2.9615116909509383e-05,
"loss": 0.0374,
"step": 7120
},
{
"epoch": 0.3031978227589726,
"grad_norm": 0.6105824708938599,
"learning_rate": 2.9613253904923924e-05,
"loss": 0.0392,
"step": 7130
},
{
"epoch": 0.3036230651471339,
"grad_norm": 0.6746649146080017,
"learning_rate": 2.9611386461217772e-05,
"loss": 0.0368,
"step": 7140
},
{
"epoch": 0.3040483075352951,
"grad_norm": 0.5744502544403076,
"learning_rate": 2.96095145789582e-05,
"loss": 0.0349,
"step": 7150
},
{
"epoch": 0.3044735499234564,
"grad_norm": 0.5895463228225708,
"learning_rate": 2.9607638258713845e-05,
"loss": 0.0395,
"step": 7160
},
{
"epoch": 0.3048987923116176,
"grad_norm": 0.7200367450714111,
"learning_rate": 2.960575750105468e-05,
"loss": 0.0393,
"step": 7170
},
{
"epoch": 0.30532403469977887,
"grad_norm": 0.8259462118148804,
"learning_rate": 2.9603872306552025e-05,
"loss": 0.0396,
"step": 7180
},
{
"epoch": 0.30574927708794014,
"grad_norm": 0.7393692135810852,
"learning_rate": 2.9601982675778562e-05,
"loss": 0.0409,
"step": 7190
},
{
"epoch": 0.30617451947610136,
"grad_norm": 0.7287253141403198,
"learning_rate": 2.9600088609308294e-05,
"loss": 0.0371,
"step": 7200
},
{
"epoch": 0.30659976186426263,
"grad_norm": 0.8086169958114624,
"learning_rate": 2.959819010771661e-05,
"loss": 0.042,
"step": 7210
},
{
"epoch": 0.3070250042524239,
"grad_norm": 0.8077754974365234,
"learning_rate": 2.959628717158021e-05,
"loss": 0.0371,
"step": 7220
},
{
"epoch": 0.3074502466405851,
"grad_norm": 0.6154179573059082,
"learning_rate": 2.959437980147716e-05,
"loss": 0.0387,
"step": 7230
},
{
"epoch": 0.3078754890287464,
"grad_norm": 0.6824501752853394,
"learning_rate": 2.959246799798687e-05,
"loss": 0.0408,
"step": 7240
},
{
"epoch": 0.3083007314169076,
"grad_norm": 0.6912359595298767,
"learning_rate": 2.9590551761690095e-05,
"loss": 0.0368,
"step": 7250
},
{
"epoch": 0.3087259738050689,
"grad_norm": 0.6031985282897949,
"learning_rate": 2.9588631093168936e-05,
"loss": 0.0361,
"step": 7260
},
{
"epoch": 0.30915121619323016,
"grad_norm": 0.7554166316986084,
"learning_rate": 2.9586705993006837e-05,
"loss": 0.0392,
"step": 7270
},
{
"epoch": 0.3095764585813914,
"grad_norm": 0.5785640478134155,
"learning_rate": 2.95847764617886e-05,
"loss": 0.0384,
"step": 7280
},
{
"epoch": 0.31000170096955265,
"grad_norm": 0.5476772785186768,
"learning_rate": 2.9582842500100364e-05,
"loss": 0.0386,
"step": 7290
},
{
"epoch": 0.31042694335771387,
"grad_norm": 0.5620119571685791,
"learning_rate": 2.958090410852961e-05,
"loss": 0.0361,
"step": 7300
},
{
"epoch": 0.31085218574587514,
"grad_norm": 0.5480359792709351,
"learning_rate": 2.9578961287665175e-05,
"loss": 0.0367,
"step": 7310
},
{
"epoch": 0.3112774281340364,
"grad_norm": 0.6803015470504761,
"learning_rate": 2.9577014038097238e-05,
"loss": 0.0405,
"step": 7320
},
{
"epoch": 0.31170267052219763,
"grad_norm": 0.6066681742668152,
"learning_rate": 2.9575062360417324e-05,
"loss": 0.036,
"step": 7330
},
{
"epoch": 0.3121279129103589,
"grad_norm": 0.7002130150794983,
"learning_rate": 2.957310625521829e-05,
"loss": 0.038,
"step": 7340
},
{
"epoch": 0.3125531552985202,
"grad_norm": 0.5165942907333374,
"learning_rate": 2.957114572309436e-05,
"loss": 0.036,
"step": 7350
},
{
"epoch": 0.3129783976866814,
"grad_norm": 0.6666249632835388,
"learning_rate": 2.956918076464109e-05,
"loss": 0.0464,
"step": 7360
},
{
"epoch": 0.31340364007484267,
"grad_norm": 0.6441757082939148,
"learning_rate": 2.9567211380455376e-05,
"loss": 0.0408,
"step": 7370
},
{
"epoch": 0.3138288824630039,
"grad_norm": 0.696550726890564,
"learning_rate": 2.9565237571135472e-05,
"loss": 0.0381,
"step": 7380
},
{
"epoch": 0.31425412485116516,
"grad_norm": 0.7473100423812866,
"learning_rate": 2.9563259337280967e-05,
"loss": 0.0402,
"step": 7390
},
{
"epoch": 0.31467936723932644,
"grad_norm": 0.7424477934837341,
"learning_rate": 2.9561276679492794e-05,
"loss": 0.0375,
"step": 7400
},
{
"epoch": 0.31510460962748765,
"grad_norm": 0.774580180644989,
"learning_rate": 2.9559289598373236e-05,
"loss": 0.0395,
"step": 7410
},
{
"epoch": 0.3155298520156489,
"grad_norm": 0.7219531536102295,
"learning_rate": 2.9557298094525913e-05,
"loss": 0.038,
"step": 7420
},
{
"epoch": 0.3159550944038102,
"grad_norm": 0.533015251159668,
"learning_rate": 2.955530216855579e-05,
"loss": 0.0364,
"step": 7430
},
{
"epoch": 0.3163803367919714,
"grad_norm": 0.5288422703742981,
"learning_rate": 2.955330182106918e-05,
"loss": 0.0372,
"step": 7440
},
{
"epoch": 0.3168055791801327,
"grad_norm": 0.5094618797302246,
"learning_rate": 2.9551297052673734e-05,
"loss": 0.0358,
"step": 7450
},
{
"epoch": 0.3172308215682939,
"grad_norm": 0.7423509359359741,
"learning_rate": 2.954928786397845e-05,
"loss": 0.0405,
"step": 7460
},
{
"epoch": 0.3176560639564552,
"grad_norm": 0.8391866087913513,
"learning_rate": 2.9547274255593665e-05,
"loss": 0.0359,
"step": 7470
},
{
"epoch": 0.31808130634461645,
"grad_norm": 0.7100056409835815,
"learning_rate": 2.9545256228131058e-05,
"loss": 0.0354,
"step": 7480
},
{
"epoch": 0.3185065487327777,
"grad_norm": 0.4895709156990051,
"learning_rate": 2.9543233782203653e-05,
"loss": 0.0357,
"step": 7490
},
{
"epoch": 0.31893179112093895,
"grad_norm": 0.7405874729156494,
"learning_rate": 2.954120691842582e-05,
"loss": 0.0385,
"step": 7500
},
{
"epoch": 0.31935703350910016,
"grad_norm": 1.0487643480300903,
"learning_rate": 2.9539175637413264e-05,
"loss": 0.0356,
"step": 7510
},
{
"epoch": 0.31978227589726144,
"grad_norm": 0.6350162029266357,
"learning_rate": 2.9537139939783033e-05,
"loss": 0.0392,
"step": 7520
},
{
"epoch": 0.3202075182854227,
"grad_norm": 0.4637593924999237,
"learning_rate": 2.9535099826153516e-05,
"loss": 0.0374,
"step": 7530
},
{
"epoch": 0.3206327606735839,
"grad_norm": 0.5955590009689331,
"learning_rate": 2.9533055297144454e-05,
"loss": 0.0349,
"step": 7540
},
{
"epoch": 0.3210580030617452,
"grad_norm": 0.599247932434082,
"learning_rate": 2.953100635337691e-05,
"loss": 0.0368,
"step": 7550
},
{
"epoch": 0.3214832454499065,
"grad_norm": 0.6445111036300659,
"learning_rate": 2.952895299547331e-05,
"loss": 0.0398,
"step": 7560
},
{
"epoch": 0.3219084878380677,
"grad_norm": 0.765993058681488,
"learning_rate": 2.9526895224057402e-05,
"loss": 0.0372,
"step": 7570
},
{
"epoch": 0.32233373022622896,
"grad_norm": 0.7807483673095703,
"learning_rate": 2.9524833039754284e-05,
"loss": 0.0365,
"step": 7580
},
{
"epoch": 0.3227589726143902,
"grad_norm": 0.4911043345928192,
"learning_rate": 2.9522766443190398e-05,
"loss": 0.0369,
"step": 7590
},
{
"epoch": 0.32318421500255146,
"grad_norm": 0.6260459423065186,
"learning_rate": 2.9520695434993516e-05,
"loss": 0.0379,
"step": 7600
},
{
"epoch": 0.32360945739071273,
"grad_norm": 0.4841105341911316,
"learning_rate": 2.9518620015792757e-05,
"loss": 0.0345,
"step": 7610
},
{
"epoch": 0.32403469977887395,
"grad_norm": 0.5489952564239502,
"learning_rate": 2.951654018621858e-05,
"loss": 0.0367,
"step": 7620
},
{
"epoch": 0.3244599421670352,
"grad_norm": 0.5816491842269897,
"learning_rate": 2.951445594690278e-05,
"loss": 0.0378,
"step": 7630
},
{
"epoch": 0.32488518455519644,
"grad_norm": 0.6203845739364624,
"learning_rate": 2.95123672984785e-05,
"loss": 0.0355,
"step": 7640
},
{
"epoch": 0.3253104269433577,
"grad_norm": 0.6808344721794128,
"learning_rate": 2.9510274241580207e-05,
"loss": 0.0379,
"step": 7650
},
{
"epoch": 0.325735669331519,
"grad_norm": 0.6007257103919983,
"learning_rate": 2.9508176776843726e-05,
"loss": 0.0398,
"step": 7660
},
{
"epoch": 0.3261609117196802,
"grad_norm": 0.6604802012443542,
"learning_rate": 2.95060749049062e-05,
"loss": 0.0364,
"step": 7670
},
{
"epoch": 0.3265861541078415,
"grad_norm": 0.7024301886558533,
"learning_rate": 2.9503968626406133e-05,
"loss": 0.0368,
"step": 7680
},
{
"epoch": 0.32701139649600275,
"grad_norm": 0.5574946999549866,
"learning_rate": 2.9501857941983354e-05,
"loss": 0.0361,
"step": 7690
},
{
"epoch": 0.32743663888416397,
"grad_norm": 0.6038932204246521,
"learning_rate": 2.9499742852279025e-05,
"loss": 0.0405,
"step": 7700
},
{
"epoch": 0.32786188127232524,
"grad_norm": 0.5846895575523376,
"learning_rate": 2.9497623357935666e-05,
"loss": 0.0385,
"step": 7710
},
{
"epoch": 0.32828712366048646,
"grad_norm": 0.5325697660446167,
"learning_rate": 2.9495499459597116e-05,
"loss": 0.0362,
"step": 7720
},
{
"epoch": 0.32871236604864773,
"grad_norm": 0.5643430948257446,
"learning_rate": 2.9493371157908563e-05,
"loss": 0.0359,
"step": 7730
},
{
"epoch": 0.329137608436809,
"grad_norm": 0.5376736521720886,
"learning_rate": 2.9491238453516524e-05,
"loss": 0.0358,
"step": 7740
},
{
"epoch": 0.3295628508249702,
"grad_norm": 0.5130335688591003,
"learning_rate": 2.9489101347068868e-05,
"loss": 0.0367,
"step": 7750
},
{
"epoch": 0.3299880932131315,
"grad_norm": 0.5577582120895386,
"learning_rate": 2.948695983921478e-05,
"loss": 0.0364,
"step": 7760
},
{
"epoch": 0.3304133356012927,
"grad_norm": 0.580887496471405,
"learning_rate": 2.9484813930604804e-05,
"loss": 0.0419,
"step": 7770
},
{
"epoch": 0.330838577989454,
"grad_norm": 0.5507401823997498,
"learning_rate": 2.9482663621890804e-05,
"loss": 0.0378,
"step": 7780
},
{
"epoch": 0.33126382037761526,
"grad_norm": 0.6311460137367249,
"learning_rate": 2.9480508913725986e-05,
"loss": 0.035,
"step": 7790
},
{
"epoch": 0.3316890627657765,
"grad_norm": 0.6687628626823425,
"learning_rate": 2.9478349806764895e-05,
"loss": 0.0363,
"step": 7800
},
{
"epoch": 0.33211430515393775,
"grad_norm": 0.5091450810432434,
"learning_rate": 2.9476186301663414e-05,
"loss": 0.0357,
"step": 7810
},
{
"epoch": 0.332539547542099,
"grad_norm": 0.6998031139373779,
"learning_rate": 2.9474018399078752e-05,
"loss": 0.0331,
"step": 7820
},
{
"epoch": 0.33296478993026024,
"grad_norm": 0.578231692314148,
"learning_rate": 2.947184609966947e-05,
"loss": 0.0378,
"step": 7830
},
{
"epoch": 0.3333900323184215,
"grad_norm": 0.7172752618789673,
"learning_rate": 2.9469669404095447e-05,
"loss": 0.0339,
"step": 7840
},
{
"epoch": 0.33381527470658273,
"grad_norm": 0.6786145567893982,
"learning_rate": 2.9467488313017908e-05,
"loss": 0.0349,
"step": 7850
},
{
"epoch": 0.334240517094744,
"grad_norm": 0.5483919382095337,
"learning_rate": 2.9465302827099412e-05,
"loss": 0.0354,
"step": 7860
},
{
"epoch": 0.3346657594829053,
"grad_norm": 0.4744562804698944,
"learning_rate": 2.9463112947003854e-05,
"loss": 0.0346,
"step": 7870
},
{
"epoch": 0.3350910018710665,
"grad_norm": 0.47334733605384827,
"learning_rate": 2.9460918673396455e-05,
"loss": 0.035,
"step": 7880
},
{
"epoch": 0.33551624425922777,
"grad_norm": 0.6834809184074402,
"learning_rate": 2.9458720006943784e-05,
"loss": 0.0376,
"step": 7890
},
{
"epoch": 0.335941486647389,
"grad_norm": 0.6203020215034485,
"learning_rate": 2.9456516948313736e-05,
"loss": 0.0367,
"step": 7900
},
{
"epoch": 0.33636672903555026,
"grad_norm": 0.6990686058998108,
"learning_rate": 2.9454309498175538e-05,
"loss": 0.0379,
"step": 7910
},
{
"epoch": 0.33679197142371153,
"grad_norm": 0.5946453213691711,
"learning_rate": 2.9452097657199762e-05,
"loss": 0.0362,
"step": 7920
},
{
"epoch": 0.33721721381187275,
"grad_norm": 0.698390007019043,
"learning_rate": 2.9449881426058303e-05,
"loss": 0.0371,
"step": 7930
},
{
"epoch": 0.337642456200034,
"grad_norm": 0.8408201932907104,
"learning_rate": 2.9447660805424388e-05,
"loss": 0.0387,
"step": 7940
},
{
"epoch": 0.3380676985881953,
"grad_norm": 0.7631927132606506,
"learning_rate": 2.94454357959726e-05,
"loss": 0.0382,
"step": 7950
},
{
"epoch": 0.3384929409763565,
"grad_norm": 0.7121431827545166,
"learning_rate": 2.9443206398378815e-05,
"loss": 0.0395,
"step": 7960
},
{
"epoch": 0.3389181833645178,
"grad_norm": 0.6551350951194763,
"learning_rate": 2.9440972613320282e-05,
"loss": 0.0401,
"step": 7970
},
{
"epoch": 0.339343425752679,
"grad_norm": 0.5990687608718872,
"learning_rate": 2.943873444147556e-05,
"loss": 0.0377,
"step": 7980
},
{
"epoch": 0.3397686681408403,
"grad_norm": 0.6591171622276306,
"learning_rate": 2.9436491883524547e-05,
"loss": 0.0369,
"step": 7990
},
{
"epoch": 0.34019391052900155,
"grad_norm": 0.6975916624069214,
"learning_rate": 2.9434244940148472e-05,
"loss": 0.0339,
"step": 8000
},
{
"epoch": 0.34061915291716277,
"grad_norm": 0.6451482176780701,
"learning_rate": 2.9431993612029903e-05,
"loss": 0.032,
"step": 8010
},
{
"epoch": 0.34104439530532404,
"grad_norm": 0.7803753018379211,
"learning_rate": 2.9429737899852728e-05,
"loss": 0.0349,
"step": 8020
},
{
"epoch": 0.34146963769348526,
"grad_norm": 0.8223358988761902,
"learning_rate": 2.9427477804302174e-05,
"loss": 0.0365,
"step": 8030
},
{
"epoch": 0.34189488008164653,
"grad_norm": 0.5982056856155396,
"learning_rate": 2.9425213326064797e-05,
"loss": 0.0364,
"step": 8040
},
{
"epoch": 0.3423201224698078,
"grad_norm": 0.6571147441864014,
"learning_rate": 2.942294446582849e-05,
"loss": 0.0352,
"step": 8050
},
{
"epoch": 0.342745364857969,
"grad_norm": 0.49468785524368286,
"learning_rate": 2.9420671224282468e-05,
"loss": 0.0362,
"step": 8060
},
{
"epoch": 0.3431706072461303,
"grad_norm": 0.5896933078765869,
"learning_rate": 2.9418393602117284e-05,
"loss": 0.0353,
"step": 8070
},
{
"epoch": 0.3435958496342916,
"grad_norm": 0.7391409873962402,
"learning_rate": 2.9416111600024816e-05,
"loss": 0.0383,
"step": 8080
},
{
"epoch": 0.3440210920224528,
"grad_norm": 0.7604628205299377,
"learning_rate": 2.9413825218698284e-05,
"loss": 0.0352,
"step": 8090
},
{
"epoch": 0.34444633441061406,
"grad_norm": 0.5802039504051208,
"learning_rate": 2.9411534458832222e-05,
"loss": 0.038,
"step": 8100
},
{
"epoch": 0.3448715767987753,
"grad_norm": 0.5897963047027588,
"learning_rate": 2.9409239321122513e-05,
"loss": 0.0334,
"step": 8110
},
{
"epoch": 0.34529681918693655,
"grad_norm": 0.6740944385528564,
"learning_rate": 2.940693980626634e-05,
"loss": 0.0361,
"step": 8120
},
{
"epoch": 0.3457220615750978,
"grad_norm": 0.6636224389076233,
"learning_rate": 2.9404635914962258e-05,
"loss": 0.035,
"step": 8130
},
{
"epoch": 0.34614730396325905,
"grad_norm": 0.5794883370399475,
"learning_rate": 2.9402327647910113e-05,
"loss": 0.0339,
"step": 8140
},
{
"epoch": 0.3465725463514203,
"grad_norm": 0.5011441707611084,
"learning_rate": 2.94000150058111e-05,
"loss": 0.036,
"step": 8150
},
{
"epoch": 0.34699778873958154,
"grad_norm": 0.637493371963501,
"learning_rate": 2.939769798936774e-05,
"loss": 0.0359,
"step": 8160
},
{
"epoch": 0.3474230311277428,
"grad_norm": 0.6575992107391357,
"learning_rate": 2.9395376599283878e-05,
"loss": 0.0357,
"step": 8170
},
{
"epoch": 0.3478482735159041,
"grad_norm": 0.710395872592926,
"learning_rate": 2.9393050836264697e-05,
"loss": 0.0363,
"step": 8180
},
{
"epoch": 0.3482735159040653,
"grad_norm": 0.6287087202072144,
"learning_rate": 2.9390720701016693e-05,
"loss": 0.0355,
"step": 8190
},
{
"epoch": 0.3486987582922266,
"grad_norm": 0.5436473488807678,
"learning_rate": 2.938838619424771e-05,
"loss": 0.0347,
"step": 8200
},
{
"epoch": 0.34912400068038785,
"grad_norm": 0.6140791177749634,
"learning_rate": 2.93860473166669e-05,
"loss": 0.0358,
"step": 8210
},
{
"epoch": 0.34954924306854906,
"grad_norm": 0.6407830119132996,
"learning_rate": 2.938370406898476e-05,
"loss": 0.0318,
"step": 8220
},
{
"epoch": 0.34997448545671034,
"grad_norm": 0.5551308393478394,
"learning_rate": 2.9381356451913103e-05,
"loss": 0.0349,
"step": 8230
},
{
"epoch": 0.35039972784487156,
"grad_norm": 0.649509072303772,
"learning_rate": 2.9379004466165072e-05,
"loss": 0.0342,
"step": 8240
},
{
"epoch": 0.35082497023303283,
"grad_norm": 0.5699142813682556,
"learning_rate": 2.937664811245514e-05,
"loss": 0.0366,
"step": 8250
},
{
"epoch": 0.3512502126211941,
"grad_norm": 0.6970945596694946,
"learning_rate": 2.9374287391499108e-05,
"loss": 0.0344,
"step": 8260
},
{
"epoch": 0.3516754550093553,
"grad_norm": 0.7967458367347717,
"learning_rate": 2.9371922304014093e-05,
"loss": 0.0353,
"step": 8270
},
{
"epoch": 0.3521006973975166,
"grad_norm": 0.6777560114860535,
"learning_rate": 2.9369552850718557e-05,
"loss": 0.0361,
"step": 8280
},
{
"epoch": 0.3525259397856778,
"grad_norm": 0.7028951644897461,
"learning_rate": 2.9367179032332262e-05,
"loss": 0.0386,
"step": 8290
},
{
"epoch": 0.3529511821738391,
"grad_norm": 0.6818631291389465,
"learning_rate": 2.936480084957633e-05,
"loss": 0.0329,
"step": 8300
},
{
"epoch": 0.35337642456200036,
"grad_norm": 0.7123552560806274,
"learning_rate": 2.9362418303173174e-05,
"loss": 0.0369,
"step": 8310
},
{
"epoch": 0.3538016669501616,
"grad_norm": 0.6206834316253662,
"learning_rate": 2.9360031393846558e-05,
"loss": 0.0349,
"step": 8320
},
{
"epoch": 0.35422690933832285,
"grad_norm": 0.5408953428268433,
"learning_rate": 2.9357640122321557e-05,
"loss": 0.0352,
"step": 8330
},
{
"epoch": 0.3546521517264841,
"grad_norm": 0.5239614844322205,
"learning_rate": 2.935524448932458e-05,
"loss": 0.0333,
"step": 8340
},
{
"epoch": 0.35507739411464534,
"grad_norm": 0.6259095668792725,
"learning_rate": 2.9352844495583356e-05,
"loss": 0.0361,
"step": 8350
},
{
"epoch": 0.3555026365028066,
"grad_norm": 0.5152319073677063,
"learning_rate": 2.9350440141826938e-05,
"loss": 0.0351,
"step": 8360
},
{
"epoch": 0.35592787889096783,
"grad_norm": 0.5210868120193481,
"learning_rate": 2.9348031428785705e-05,
"loss": 0.0318,
"step": 8370
},
{
"epoch": 0.3563531212791291,
"grad_norm": 0.49504750967025757,
"learning_rate": 2.9345618357191363e-05,
"loss": 0.0331,
"step": 8380
},
{
"epoch": 0.3567783636672904,
"grad_norm": 0.47331687808036804,
"learning_rate": 2.934320092777694e-05,
"loss": 0.035,
"step": 8390
},
{
"epoch": 0.3572036060554516,
"grad_norm": 0.3879975378513336,
"learning_rate": 2.9340779141276782e-05,
"loss": 0.034,
"step": 8400
},
{
"epoch": 0.35762884844361287,
"grad_norm": 0.4745931327342987,
"learning_rate": 2.933835299842657e-05,
"loss": 0.0374,
"step": 8410
},
{
"epoch": 0.3580540908317741,
"grad_norm": 0.6969006657600403,
"learning_rate": 2.93359224999633e-05,
"loss": 0.0328,
"step": 8420
},
{
"epoch": 0.35847933321993536,
"grad_norm": 0.5800720453262329,
"learning_rate": 2.933348764662529e-05,
"loss": 0.0334,
"step": 8430
},
{
"epoch": 0.35890457560809663,
"grad_norm": 0.6239498853683472,
"learning_rate": 2.933104843915219e-05,
"loss": 0.034,
"step": 8440
},
{
"epoch": 0.35932981799625785,
"grad_norm": 0.7754019498825073,
"learning_rate": 2.9328604878284963e-05,
"loss": 0.0406,
"step": 8450
},
{
"epoch": 0.3597550603844191,
"grad_norm": 0.543395459651947,
"learning_rate": 2.9326156964765892e-05,
"loss": 0.0338,
"step": 8460
},
{
"epoch": 0.3601803027725804,
"grad_norm": 0.702585756778717,
"learning_rate": 2.9323704699338603e-05,
"loss": 0.0349,
"step": 8470
},
{
"epoch": 0.3606055451607416,
"grad_norm": 0.6561062932014465,
"learning_rate": 2.932124808274802e-05,
"loss": 0.0343,
"step": 8480
},
{
"epoch": 0.3610307875489029,
"grad_norm": 0.5744190216064453,
"learning_rate": 2.93187871157404e-05,
"loss": 0.0358,
"step": 8490
},
{
"epoch": 0.3614560299370641,
"grad_norm": 0.6018226146697998,
"learning_rate": 2.931632179906332e-05,
"loss": 0.0381,
"step": 8500
},
{
"epoch": 0.3618812723252254,
"grad_norm": 0.543805718421936,
"learning_rate": 2.931385213346568e-05,
"loss": 0.0333,
"step": 8510
},
{
"epoch": 0.36230651471338665,
"grad_norm": 0.5329932570457458,
"learning_rate": 2.9311378119697694e-05,
"loss": 0.035,
"step": 8520
},
{
"epoch": 0.36273175710154787,
"grad_norm": 0.5564429759979248,
"learning_rate": 2.9308899758510903e-05,
"loss": 0.0333,
"step": 8530
},
{
"epoch": 0.36315699948970914,
"grad_norm": 0.6484778523445129,
"learning_rate": 2.9306417050658172e-05,
"loss": 0.0363,
"step": 8540
},
{
"epoch": 0.36358224187787036,
"grad_norm": 0.7013140320777893,
"learning_rate": 2.9303929996893687e-05,
"loss": 0.0338,
"step": 8550
},
{
"epoch": 0.36400748426603163,
"grad_norm": 0.7455886602401733,
"learning_rate": 2.9301438597972932e-05,
"loss": 0.0353,
"step": 8560
},
{
"epoch": 0.3644327266541929,
"grad_norm": 0.5060240030288696,
"learning_rate": 2.9298942854652744e-05,
"loss": 0.0399,
"step": 8570
},
{
"epoch": 0.3648579690423541,
"grad_norm": 0.44752374291419983,
"learning_rate": 2.9296442767691257e-05,
"loss": 0.0344,
"step": 8580
},
{
"epoch": 0.3652832114305154,
"grad_norm": 0.6356979608535767,
"learning_rate": 2.9293938337847936e-05,
"loss": 0.0348,
"step": 8590
},
{
"epoch": 0.36570845381867667,
"grad_norm": 0.6637558341026306,
"learning_rate": 2.9291429565883554e-05,
"loss": 0.0331,
"step": 8600
},
{
"epoch": 0.3661336962068379,
"grad_norm": 0.5308704376220703,
"learning_rate": 2.9288916452560214e-05,
"loss": 0.0359,
"step": 8610
},
{
"epoch": 0.36655893859499916,
"grad_norm": 0.6599563360214233,
"learning_rate": 2.9286398998641334e-05,
"loss": 0.0342,
"step": 8620
},
{
"epoch": 0.3669841809831604,
"grad_norm": 0.5241801738739014,
"learning_rate": 2.9283877204891645e-05,
"loss": 0.0366,
"step": 8630
},
{
"epoch": 0.36740942337132165,
"grad_norm": 0.6455984711647034,
"learning_rate": 2.9281351072077208e-05,
"loss": 0.0339,
"step": 8640
},
{
"epoch": 0.3678346657594829,
"grad_norm": 0.6421254277229309,
"learning_rate": 2.9278820600965393e-05,
"loss": 0.035,
"step": 8650
},
{
"epoch": 0.36825990814764414,
"grad_norm": 0.4384848475456238,
"learning_rate": 2.9276285792324887e-05,
"loss": 0.0329,
"step": 8660
},
{
"epoch": 0.3686851505358054,
"grad_norm": 0.6649608016014099,
"learning_rate": 2.9273746646925703e-05,
"loss": 0.0346,
"step": 8670
},
{
"epoch": 0.36911039292396663,
"grad_norm": 0.6458434462547302,
"learning_rate": 2.9271203165539166e-05,
"loss": 0.0348,
"step": 8680
},
{
"epoch": 0.3695356353121279,
"grad_norm": 0.6240953803062439,
"learning_rate": 2.9268655348937913e-05,
"loss": 0.0313,
"step": 8690
},
{
"epoch": 0.3699608777002892,
"grad_norm": 0.5618213415145874,
"learning_rate": 2.926610319789591e-05,
"loss": 0.0318,
"step": 8700
},
{
"epoch": 0.3703861200884504,
"grad_norm": 0.7894103527069092,
"learning_rate": 2.926354671318843e-05,
"loss": 0.0358,
"step": 8710
},
{
"epoch": 0.3708113624766117,
"grad_norm": 0.6642913222312927,
"learning_rate": 2.926098589559206e-05,
"loss": 0.0339,
"step": 8720
},
{
"epoch": 0.37123660486477295,
"grad_norm": 0.6823061108589172,
"learning_rate": 2.925842074588472e-05,
"loss": 0.0332,
"step": 8730
},
{
"epoch": 0.37166184725293416,
"grad_norm": 0.5889248251914978,
"learning_rate": 2.925585126484563e-05,
"loss": 0.0349,
"step": 8740
},
{
"epoch": 0.37208708964109544,
"grad_norm": 0.5245225429534912,
"learning_rate": 2.9253277453255326e-05,
"loss": 0.0307,
"step": 8750
},
{
"epoch": 0.37251233202925665,
"grad_norm": 0.5833045840263367,
"learning_rate": 2.925069931189567e-05,
"loss": 0.0319,
"step": 8760
},
{
"epoch": 0.3729375744174179,
"grad_norm": 0.5893930196762085,
"learning_rate": 2.924811684154983e-05,
"loss": 0.0347,
"step": 8770
},
{
"epoch": 0.3733628168055792,
"grad_norm": 0.5016536116600037,
"learning_rate": 2.9245530043002293e-05,
"loss": 0.0347,
"step": 8780
},
{
"epoch": 0.3737880591937404,
"grad_norm": 0.5982713103294373,
"learning_rate": 2.9242938917038863e-05,
"loss": 0.0336,
"step": 8790
},
{
"epoch": 0.3742133015819017,
"grad_norm": 0.48951834440231323,
"learning_rate": 2.9240343464446647e-05,
"loss": 0.03,
"step": 8800
},
{
"epoch": 0.3746385439700629,
"grad_norm": 0.5397206544876099,
"learning_rate": 2.923774368601409e-05,
"loss": 0.031,
"step": 8810
},
{
"epoch": 0.3750637863582242,
"grad_norm": 0.5121596455574036,
"learning_rate": 2.9235139582530918e-05,
"loss": 0.0342,
"step": 8820
},
{
"epoch": 0.37548902874638546,
"grad_norm": 0.5357693433761597,
"learning_rate": 2.9232531154788203e-05,
"loss": 0.0323,
"step": 8830
},
{
"epoch": 0.3759142711345467,
"grad_norm": 0.5267826914787292,
"learning_rate": 2.9229918403578313e-05,
"loss": 0.0353,
"step": 8840
},
{
"epoch": 0.37633951352270795,
"grad_norm": 0.5199102759361267,
"learning_rate": 2.922730132969493e-05,
"loss": 0.0344,
"step": 8850
},
{
"epoch": 0.3767647559108692,
"grad_norm": 0.5712948441505432,
"learning_rate": 2.922467993393305e-05,
"loss": 0.0327,
"step": 8860
},
{
"epoch": 0.37718999829903044,
"grad_norm": 0.5483639240264893,
"learning_rate": 2.9222054217088985e-05,
"loss": 0.0351,
"step": 8870
},
{
"epoch": 0.3776152406871917,
"grad_norm": 0.44184285402297974,
"learning_rate": 2.921942417996037e-05,
"loss": 0.0333,
"step": 8880
},
{
"epoch": 0.37804048307535293,
"grad_norm": 0.6121577024459839,
"learning_rate": 2.921678982334612e-05,
"loss": 0.0361,
"step": 8890
},
{
"epoch": 0.3784657254635142,
"grad_norm": 0.5963767766952515,
"learning_rate": 2.92141511480465e-05,
"loss": 0.0369,
"step": 8900
},
{
"epoch": 0.3788909678516755,
"grad_norm": 0.6200764775276184,
"learning_rate": 2.9211508154863066e-05,
"loss": 0.0323,
"step": 8910
},
{
"epoch": 0.3793162102398367,
"grad_norm": 0.5460836887359619,
"learning_rate": 2.9208860844598684e-05,
"loss": 0.0351,
"step": 8920
},
{
"epoch": 0.37974145262799797,
"grad_norm": 0.6484760642051697,
"learning_rate": 2.9206209218057543e-05,
"loss": 0.0369,
"step": 8930
},
{
"epoch": 0.3801666950161592,
"grad_norm": 0.7471164464950562,
"learning_rate": 2.9203553276045135e-05,
"loss": 0.032,
"step": 8940
},
{
"epoch": 0.38059193740432046,
"grad_norm": 0.47716355323791504,
"learning_rate": 2.9200893019368263e-05,
"loss": 0.0321,
"step": 8950
},
{
"epoch": 0.38101717979248173,
"grad_norm": 0.5212956666946411,
"learning_rate": 2.9198228448835045e-05,
"loss": 0.034,
"step": 8960
},
{
"epoch": 0.38144242218064295,
"grad_norm": 0.6043409109115601,
"learning_rate": 2.9195559565254908e-05,
"loss": 0.0345,
"step": 8970
},
{
"epoch": 0.3818676645688042,
"grad_norm": 0.5945512056350708,
"learning_rate": 2.919288636943858e-05,
"loss": 0.0324,
"step": 8980
},
{
"epoch": 0.3822929069569655,
"grad_norm": 0.5137171149253845,
"learning_rate": 2.9190208862198126e-05,
"loss": 0.036,
"step": 8990
},
{
"epoch": 0.3827181493451267,
"grad_norm": 0.5771226286888123,
"learning_rate": 2.9187527044346883e-05,
"loss": 0.0341,
"step": 9000
},
{
"epoch": 0.383143391733288,
"grad_norm": 0.7187480926513672,
"learning_rate": 2.918484091669953e-05,
"loss": 0.0345,
"step": 9010
},
{
"epoch": 0.3835686341214492,
"grad_norm": 0.6605693697929382,
"learning_rate": 2.9182150480072026e-05,
"loss": 0.0361,
"step": 9020
},
{
"epoch": 0.3839938765096105,
"grad_norm": 0.4109347462654114,
"learning_rate": 2.917945573528167e-05,
"loss": 0.035,
"step": 9030
},
{
"epoch": 0.38441911889777175,
"grad_norm": 0.5347878336906433,
"learning_rate": 2.9176756683147052e-05,
"loss": 0.032,
"step": 9040
},
{
"epoch": 0.38484436128593297,
"grad_norm": 0.5180054903030396,
"learning_rate": 2.9174053324488063e-05,
"loss": 0.0329,
"step": 9050
},
{
"epoch": 0.38526960367409424,
"grad_norm": 0.5243205428123474,
"learning_rate": 2.917134566012592e-05,
"loss": 0.0319,
"step": 9060
},
{
"epoch": 0.38569484606225546,
"grad_norm": 0.5306262373924255,
"learning_rate": 2.916863369088314e-05,
"loss": 0.033,
"step": 9070
},
{
"epoch": 0.38612008845041673,
"grad_norm": 0.46671026945114136,
"learning_rate": 2.9165917417583544e-05,
"loss": 0.0337,
"step": 9080
},
{
"epoch": 0.386545330838578,
"grad_norm": 0.5129649043083191,
"learning_rate": 2.916319684105227e-05,
"loss": 0.0325,
"step": 9090
},
{
"epoch": 0.3869705732267392,
"grad_norm": 0.5589383840560913,
"learning_rate": 2.916047196211575e-05,
"loss": 0.0314,
"step": 9100
},
{
"epoch": 0.3873958156149005,
"grad_norm": 0.5646084547042847,
"learning_rate": 2.915774278160173e-05,
"loss": 0.0298,
"step": 9110
},
{
"epoch": 0.38782105800306177,
"grad_norm": 0.5653718113899231,
"learning_rate": 2.9155009300339268e-05,
"loss": 0.0341,
"step": 9120
},
{
"epoch": 0.388246300391223,
"grad_norm": 0.6720633506774902,
"learning_rate": 2.9152271519158724e-05,
"loss": 0.0326,
"step": 9130
},
{
"epoch": 0.38867154277938426,
"grad_norm": 0.595745325088501,
"learning_rate": 2.9149529438891764e-05,
"loss": 0.03,
"step": 9140
},
{
"epoch": 0.3890967851675455,
"grad_norm": 0.6727629899978638,
"learning_rate": 2.9146783060371352e-05,
"loss": 0.0333,
"step": 9150
},
{
"epoch": 0.38952202755570675,
"grad_norm": 0.5202494859695435,
"learning_rate": 2.914403238443177e-05,
"loss": 0.0316,
"step": 9160
},
{
"epoch": 0.389947269943868,
"grad_norm": 0.46365416049957275,
"learning_rate": 2.9141277411908607e-05,
"loss": 0.0331,
"step": 9170
},
{
"epoch": 0.39037251233202924,
"grad_norm": 0.5766396522521973,
"learning_rate": 2.9138518143638744e-05,
"loss": 0.0317,
"step": 9180
},
{
"epoch": 0.3907977547201905,
"grad_norm": 0.5727835297584534,
"learning_rate": 2.9135754580460373e-05,
"loss": 0.0331,
"step": 9190
},
{
"epoch": 0.39122299710835173,
"grad_norm": 0.4820510447025299,
"learning_rate": 2.9132986723212992e-05,
"loss": 0.0331,
"step": 9200
},
{
"epoch": 0.391648239496513,
"grad_norm": 0.49070218205451965,
"learning_rate": 2.9130214572737403e-05,
"loss": 0.0338,
"step": 9210
},
{
"epoch": 0.3920734818846743,
"grad_norm": 0.48210710287094116,
"learning_rate": 2.9127438129875717e-05,
"loss": 0.0314,
"step": 9220
},
{
"epoch": 0.3924987242728355,
"grad_norm": 0.5722389817237854,
"learning_rate": 2.912465739547134e-05,
"loss": 0.0337,
"step": 9230
},
{
"epoch": 0.39292396666099677,
"grad_norm": 0.560624361038208,
"learning_rate": 2.9121872370368988e-05,
"loss": 0.0309,
"step": 9240
},
{
"epoch": 0.39334920904915804,
"grad_norm": 0.6490792632102966,
"learning_rate": 2.911908305541467e-05,
"loss": 0.0329,
"step": 9250
},
{
"epoch": 0.39377445143731926,
"grad_norm": 0.5507360100746155,
"learning_rate": 2.9116289451455717e-05,
"loss": 0.0356,
"step": 9260
},
{
"epoch": 0.39419969382548053,
"grad_norm": 0.761702299118042,
"learning_rate": 2.911349155934075e-05,
"loss": 0.0369,
"step": 9270
},
{
"epoch": 0.39462493621364175,
"grad_norm": 0.4436461925506592,
"learning_rate": 2.9110689379919687e-05,
"loss": 0.0321,
"step": 9280
},
{
"epoch": 0.395050178601803,
"grad_norm": 0.47194549441337585,
"learning_rate": 2.9107882914043767e-05,
"loss": 0.031,
"step": 9290
},
{
"epoch": 0.3954754209899643,
"grad_norm": 0.5178126096725464,
"learning_rate": 2.910507216256551e-05,
"loss": 0.0344,
"step": 9300
},
{
"epoch": 0.3959006633781255,
"grad_norm": 0.5823556184768677,
"learning_rate": 2.910225712633876e-05,
"loss": 0.0306,
"step": 9310
},
{
"epoch": 0.3963259057662868,
"grad_norm": 0.6231299042701721,
"learning_rate": 2.9099437806218637e-05,
"loss": 0.0306,
"step": 9320
},
{
"epoch": 0.396751148154448,
"grad_norm": 0.5591525435447693,
"learning_rate": 2.9096614203061584e-05,
"loss": 0.0324,
"step": 9330
},
{
"epoch": 0.3971763905426093,
"grad_norm": 0.5873363614082336,
"learning_rate": 2.9093786317725336e-05,
"loss": 0.0366,
"step": 9340
},
{
"epoch": 0.39760163293077055,
"grad_norm": 0.5950449705123901,
"learning_rate": 2.909095415106893e-05,
"loss": 0.0325,
"step": 9350
},
{
"epoch": 0.39802687531893177,
"grad_norm": 0.507709264755249,
"learning_rate": 2.9088117703952698e-05,
"loss": 0.0307,
"step": 9360
},
{
"epoch": 0.39845211770709305,
"grad_norm": 0.5411258935928345,
"learning_rate": 2.908527697723829e-05,
"loss": 0.0308,
"step": 9370
},
{
"epoch": 0.3988773600952543,
"grad_norm": 0.8558620810508728,
"learning_rate": 2.9082431971788635e-05,
"loss": 0.0303,
"step": 9380
},
{
"epoch": 0.39930260248341554,
"grad_norm": 0.5814989805221558,
"learning_rate": 2.9079582688467972e-05,
"loss": 0.0309,
"step": 9390
},
{
"epoch": 0.3997278448715768,
"grad_norm": 0.5498828291893005,
"learning_rate": 2.9076729128141833e-05,
"loss": 0.033,
"step": 9400
},
{
"epoch": 0.400153087259738,
"grad_norm": 0.6426417827606201,
"learning_rate": 2.9073871291677063e-05,
"loss": 0.0326,
"step": 9410
},
{
"epoch": 0.4005783296478993,
"grad_norm": 0.593120813369751,
"learning_rate": 2.9071009179941797e-05,
"loss": 0.0307,
"step": 9420
},
{
"epoch": 0.4010035720360606,
"grad_norm": 0.5962951183319092,
"learning_rate": 2.906814279380546e-05,
"loss": 0.0299,
"step": 9430
},
{
"epoch": 0.4014288144242218,
"grad_norm": 0.49784666299819946,
"learning_rate": 2.9065272134138795e-05,
"loss": 0.0302,
"step": 9440
},
{
"epoch": 0.40185405681238306,
"grad_norm": 0.5332371592521667,
"learning_rate": 2.9062397201813823e-05,
"loss": 0.0302,
"step": 9450
},
{
"epoch": 0.40227929920054434,
"grad_norm": 0.47895118594169617,
"learning_rate": 2.905951799770388e-05,
"loss": 0.0321,
"step": 9460
},
{
"epoch": 0.40270454158870556,
"grad_norm": 0.5205760598182678,
"learning_rate": 2.9056634522683586e-05,
"loss": 0.0297,
"step": 9470
},
{
"epoch": 0.40312978397686683,
"grad_norm": 0.6770009994506836,
"learning_rate": 2.9053746777628872e-05,
"loss": 0.0312,
"step": 9480
},
{
"epoch": 0.40355502636502805,
"grad_norm": 0.5336707830429077,
"learning_rate": 2.9050854763416952e-05,
"loss": 0.0288,
"step": 9490
},
{
"epoch": 0.4039802687531893,
"grad_norm": 0.5179953575134277,
"learning_rate": 2.9047958480926346e-05,
"loss": 0.0286,
"step": 9500
},
{
"epoch": 0.4044055111413506,
"grad_norm": 0.5594274401664734,
"learning_rate": 2.9045057931036874e-05,
"loss": 0.0314,
"step": 9510
},
{
"epoch": 0.4048307535295118,
"grad_norm": 0.5783358812332153,
"learning_rate": 2.9042153114629636e-05,
"loss": 0.0299,
"step": 9520
},
{
"epoch": 0.4052559959176731,
"grad_norm": 0.49959713220596313,
"learning_rate": 2.9039244032587043e-05,
"loss": 0.0266,
"step": 9530
},
{
"epoch": 0.4056812383058343,
"grad_norm": 0.5821396708488464,
"learning_rate": 2.90363306857928e-05,
"loss": 0.0317,
"step": 9540
},
{
"epoch": 0.4061064806939956,
"grad_norm": 0.6275321245193481,
"learning_rate": 2.90334130751319e-05,
"loss": 0.0303,
"step": 9550
},
{
"epoch": 0.40653172308215685,
"grad_norm": 0.5117196440696716,
"learning_rate": 2.903049120149064e-05,
"loss": 0.0279,
"step": 9560
},
{
"epoch": 0.40695696547031807,
"grad_norm": 0.4848882853984833,
"learning_rate": 2.9027565065756608e-05,
"loss": 0.03,
"step": 9570
},
{
"epoch": 0.40738220785847934,
"grad_norm": 0.5670190453529358,
"learning_rate": 2.9024634668818682e-05,
"loss": 0.0318,
"step": 9580
},
{
"epoch": 0.4078074502466406,
"grad_norm": 0.4296819269657135,
"learning_rate": 2.9021700011567045e-05,
"loss": 0.0311,
"step": 9590
},
{
"epoch": 0.40823269263480183,
"grad_norm": 0.4154737889766693,
"learning_rate": 2.9018761094893166e-05,
"loss": 0.0306,
"step": 9600
},
{
"epoch": 0.4086579350229631,
"grad_norm": 0.5340800881385803,
"learning_rate": 2.9015817919689808e-05,
"loss": 0.0282,
"step": 9610
},
{
"epoch": 0.4090831774111243,
"grad_norm": 0.5465659499168396,
"learning_rate": 2.9012870486851034e-05,
"loss": 0.0296,
"step": 9620
},
{
"epoch": 0.4095084197992856,
"grad_norm": 0.617377519607544,
"learning_rate": 2.9009918797272195e-05,
"loss": 0.0307,
"step": 9630
},
{
"epoch": 0.40993366218744687,
"grad_norm": 0.5425435900688171,
"learning_rate": 2.9006962851849933e-05,
"loss": 0.0279,
"step": 9640
},
{
"epoch": 0.4103589045756081,
"grad_norm": 0.6728000044822693,
"learning_rate": 2.9004002651482192e-05,
"loss": 0.0298,
"step": 9650
},
{
"epoch": 0.41078414696376936,
"grad_norm": 0.5605682730674744,
"learning_rate": 2.9001038197068198e-05,
"loss": 0.0291,
"step": 9660
},
{
"epoch": 0.4112093893519306,
"grad_norm": 0.9502558708190918,
"learning_rate": 2.899806948950848e-05,
"loss": 0.0311,
"step": 9670
},
{
"epoch": 0.41163463174009185,
"grad_norm": 0.5846401453018188,
"learning_rate": 2.8995096529704846e-05,
"loss": 0.0308,
"step": 9680
},
{
"epoch": 0.4120598741282531,
"grad_norm": 0.6297990083694458,
"learning_rate": 2.8992119318560403e-05,
"loss": 0.0296,
"step": 9690
},
{
"epoch": 0.41248511651641434,
"grad_norm": 0.49979668855667114,
"learning_rate": 2.8989137856979555e-05,
"loss": 0.0297,
"step": 9700
},
{
"epoch": 0.4129103589045756,
"grad_norm": 0.47791269421577454,
"learning_rate": 2.8986152145867983e-05,
"loss": 0.0309,
"step": 9710
},
{
"epoch": 0.4133356012927369,
"grad_norm": 0.5595065951347351,
"learning_rate": 2.8983162186132678e-05,
"loss": 0.0302,
"step": 9720
},
{
"epoch": 0.4137608436808981,
"grad_norm": 0.5153464078903198,
"learning_rate": 2.8980167978681905e-05,
"loss": 0.0292,
"step": 9730
},
{
"epoch": 0.4141860860690594,
"grad_norm": 0.5208001732826233,
"learning_rate": 2.897716952442522e-05,
"loss": 0.0326,
"step": 9740
},
{
"epoch": 0.4146113284572206,
"grad_norm": 0.491243451833725,
"learning_rate": 2.8974166824273482e-05,
"loss": 0.0324,
"step": 9750
},
{
"epoch": 0.41503657084538187,
"grad_norm": 0.45093971490859985,
"learning_rate": 2.897115987913883e-05,
"loss": 0.0284,
"step": 9760
},
{
"epoch": 0.41546181323354314,
"grad_norm": 0.4437636137008667,
"learning_rate": 2.896814868993469e-05,
"loss": 0.0298,
"step": 9770
},
{
"epoch": 0.41588705562170436,
"grad_norm": 0.7555100917816162,
"learning_rate": 2.896513325757579e-05,
"loss": 0.0356,
"step": 9780
},
{
"epoch": 0.41631229800986563,
"grad_norm": 0.6052760481834412,
"learning_rate": 2.896211358297813e-05,
"loss": 0.0317,
"step": 9790
},
{
"epoch": 0.41673754039802685,
"grad_norm": 0.6751075387001038,
"learning_rate": 2.8959089667059014e-05,
"loss": 0.0297,
"step": 9800
},
{
"epoch": 0.4171627827861881,
"grad_norm": 0.6517361998558044,
"learning_rate": 2.8956061510737027e-05,
"loss": 0.0278,
"step": 9810
},
{
"epoch": 0.4175880251743494,
"grad_norm": 0.4893549680709839,
"learning_rate": 2.8953029114932042e-05,
"loss": 0.0288,
"step": 9820
},
{
"epoch": 0.4180132675625106,
"grad_norm": 0.4782169759273529,
"learning_rate": 2.8949992480565214e-05,
"loss": 0.028,
"step": 9830
},
{
"epoch": 0.4184385099506719,
"grad_norm": 0.6266399621963501,
"learning_rate": 2.894695160855901e-05,
"loss": 0.0305,
"step": 9840
},
{
"epoch": 0.41886375233883316,
"grad_norm": 0.44131985306739807,
"learning_rate": 2.8943906499837146e-05,
"loss": 0.0328,
"step": 9850
},
{
"epoch": 0.4192889947269944,
"grad_norm": 0.7677248120307922,
"learning_rate": 2.8940857155324655e-05,
"loss": 0.0295,
"step": 9860
},
{
"epoch": 0.41971423711515565,
"grad_norm": 0.5036348104476929,
"learning_rate": 2.893780357594785e-05,
"loss": 0.0295,
"step": 9870
},
{
"epoch": 0.42013947950331687,
"grad_norm": 0.47260379791259766,
"learning_rate": 2.8934745762634326e-05,
"loss": 0.0307,
"step": 9880
},
{
"epoch": 0.42056472189147814,
"grad_norm": 0.4747141897678375,
"learning_rate": 2.8931683716312964e-05,
"loss": 0.0295,
"step": 9890
},
{
"epoch": 0.4209899642796394,
"grad_norm": 0.4908016324043274,
"learning_rate": 2.8928617437913932e-05,
"loss": 0.0305,
"step": 9900
},
{
"epoch": 0.42141520666780063,
"grad_norm": 0.6114606261253357,
"learning_rate": 2.8925546928368688e-05,
"loss": 0.0296,
"step": 9910
},
{
"epoch": 0.4218404490559619,
"grad_norm": 0.5824898481369019,
"learning_rate": 2.8922472188609968e-05,
"loss": 0.0329,
"step": 9920
},
{
"epoch": 0.4222656914441231,
"grad_norm": 0.5461505651473999,
"learning_rate": 2.8919393219571805e-05,
"loss": 0.0332,
"step": 9930
},
{
"epoch": 0.4226909338322844,
"grad_norm": 0.5584800839424133,
"learning_rate": 2.8916310022189495e-05,
"loss": 0.0289,
"step": 9940
},
{
"epoch": 0.42311617622044567,
"grad_norm": 0.5313165187835693,
"learning_rate": 2.891322259739964e-05,
"loss": 0.0301,
"step": 9950
},
{
"epoch": 0.4235414186086069,
"grad_norm": 0.46237021684646606,
"learning_rate": 2.891013094614012e-05,
"loss": 0.0268,
"step": 9960
},
{
"epoch": 0.42396666099676816,
"grad_norm": 0.45541614294052124,
"learning_rate": 2.8907035069350093e-05,
"loss": 0.0284,
"step": 9970
},
{
"epoch": 0.42439190338492944,
"grad_norm": 0.6681911945343018,
"learning_rate": 2.8903934967970007e-05,
"loss": 0.0298,
"step": 9980
},
{
"epoch": 0.42481714577309065,
"grad_norm": 0.5190245509147644,
"learning_rate": 2.8900830642941583e-05,
"loss": 0.0287,
"step": 9990
},
{
"epoch": 0.4252423881612519,
"grad_norm": 0.5697410702705383,
"learning_rate": 2.889772209520785e-05,
"loss": 0.0279,
"step": 10000
},
{
"epoch": 0.42566763054941315,
"grad_norm": 0.5547255873680115,
"learning_rate": 2.8894609325713087e-05,
"loss": 0.0273,
"step": 10010
},
{
"epoch": 0.4260928729375744,
"grad_norm": 0.5132760405540466,
"learning_rate": 2.889149233540288e-05,
"loss": 0.0289,
"step": 10020
},
{
"epoch": 0.4265181153257357,
"grad_norm": 0.5643351078033447,
"learning_rate": 2.8888371125224084e-05,
"loss": 0.029,
"step": 10030
},
{
"epoch": 0.4269433577138969,
"grad_norm": 0.43181145191192627,
"learning_rate": 2.8885245696124835e-05,
"loss": 0.0276,
"step": 10040
},
{
"epoch": 0.4273686001020582,
"grad_norm": 0.47269269824028015,
"learning_rate": 2.888211604905457e-05,
"loss": 0.0271,
"step": 10050
},
{
"epoch": 0.4277938424902194,
"grad_norm": 0.5423154234886169,
"learning_rate": 2.8878982184963986e-05,
"loss": 0.0273,
"step": 10060
},
{
"epoch": 0.4282190848783807,
"grad_norm": 0.4239635467529297,
"learning_rate": 2.8875844104805066e-05,
"loss": 0.0273,
"step": 10070
},
{
"epoch": 0.42864432726654195,
"grad_norm": 0.541154146194458,
"learning_rate": 2.887270180953107e-05,
"loss": 0.0313,
"step": 10080
},
{
"epoch": 0.42906956965470316,
"grad_norm": 0.528346836566925,
"learning_rate": 2.886955530009656e-05,
"loss": 0.0316,
"step": 10090
},
{
"epoch": 0.42949481204286444,
"grad_norm": 0.5038647055625916,
"learning_rate": 2.8866404577457352e-05,
"loss": 0.0319,
"step": 10100
},
{
"epoch": 0.4299200544310257,
"grad_norm": 0.5295943021774292,
"learning_rate": 2.8863249642570557e-05,
"loss": 0.0317,
"step": 10110
},
{
"epoch": 0.43034529681918693,
"grad_norm": 0.5857188701629639,
"learning_rate": 2.8860090496394552e-05,
"loss": 0.0296,
"step": 10120
},
{
"epoch": 0.4307705392073482,
"grad_norm": 0.43627995252609253,
"learning_rate": 2.8856927139889015e-05,
"loss": 0.0279,
"step": 10130
},
{
"epoch": 0.4311957815955094,
"grad_norm": 0.4689892530441284,
"learning_rate": 2.8853759574014878e-05,
"loss": 0.0278,
"step": 10140
},
{
"epoch": 0.4316210239836707,
"grad_norm": 0.42289119958877563,
"learning_rate": 2.8850587799734372e-05,
"loss": 0.0265,
"step": 10150
},
{
"epoch": 0.43204626637183197,
"grad_norm": 0.48674267530441284,
"learning_rate": 2.8847411818011e-05,
"loss": 0.0256,
"step": 10160
},
{
"epoch": 0.4324715087599932,
"grad_norm": 0.37926095724105835,
"learning_rate": 2.8844231629809534e-05,
"loss": 0.033,
"step": 10170
},
{
"epoch": 0.43289675114815446,
"grad_norm": 0.4297679364681244,
"learning_rate": 2.8841047236096037e-05,
"loss": 0.0273,
"step": 10180
},
{
"epoch": 0.4333219935363157,
"grad_norm": 0.4957326650619507,
"learning_rate": 2.8837858637837838e-05,
"loss": 0.0265,
"step": 10190
},
{
"epoch": 0.43374723592447695,
"grad_norm": 0.5052283406257629,
"learning_rate": 2.8834665836003556e-05,
"loss": 0.0275,
"step": 10200
},
{
"epoch": 0.4341724783126382,
"grad_norm": 0.49201151728630066,
"learning_rate": 2.8831468831563075e-05,
"loss": 0.0289,
"step": 10210
},
{
"epoch": 0.43459772070079944,
"grad_norm": 0.6140077710151672,
"learning_rate": 2.882826762548756e-05,
"loss": 0.0301,
"step": 10220
},
{
"epoch": 0.4350229630889607,
"grad_norm": 0.6304380297660828,
"learning_rate": 2.8825062218749456e-05,
"loss": 0.0271,
"step": 10230
},
{
"epoch": 0.435448205477122,
"grad_norm": 0.6733559966087341,
"learning_rate": 2.8821852612322477e-05,
"loss": 0.0288,
"step": 10240
},
{
"epoch": 0.4358734478652832,
"grad_norm": 0.5495876669883728,
"learning_rate": 2.8818638807181622e-05,
"loss": 0.0272,
"step": 10250
},
{
"epoch": 0.4362986902534445,
"grad_norm": 0.5263335108757019,
"learning_rate": 2.8815420804303154e-05,
"loss": 0.0291,
"step": 10260
},
{
"epoch": 0.4367239326416057,
"grad_norm": 0.5337483286857605,
"learning_rate": 2.8812198604664627e-05,
"loss": 0.028,
"step": 10270
},
{
"epoch": 0.43714917502976697,
"grad_norm": 0.4586282968521118,
"learning_rate": 2.8808972209244848e-05,
"loss": 0.0299,
"step": 10280
},
{
"epoch": 0.43757441741792824,
"grad_norm": 0.46955880522727966,
"learning_rate": 2.8805741619023922e-05,
"loss": 0.0288,
"step": 10290
},
{
"epoch": 0.43799965980608946,
"grad_norm": 0.5017499923706055,
"learning_rate": 2.880250683498321e-05,
"loss": 0.0291,
"step": 10300
},
{
"epoch": 0.43842490219425073,
"grad_norm": 0.5041956305503845,
"learning_rate": 2.879926785810535e-05,
"loss": 0.0284,
"step": 10310
},
{
"epoch": 0.43885014458241195,
"grad_norm": 0.43060925602912903,
"learning_rate": 2.8796024689374267e-05,
"loss": 0.0264,
"step": 10320
},
{
"epoch": 0.4392753869705732,
"grad_norm": 0.5580827593803406,
"learning_rate": 2.8792777329775148e-05,
"loss": 0.0291,
"step": 10330
},
{
"epoch": 0.4397006293587345,
"grad_norm": 0.38732340931892395,
"learning_rate": 2.878952578029445e-05,
"loss": 0.0268,
"step": 10340
},
{
"epoch": 0.4401258717468957,
"grad_norm": 0.5076254606246948,
"learning_rate": 2.8786270041919915e-05,
"loss": 0.0284,
"step": 10350
},
{
"epoch": 0.440551114135057,
"grad_norm": 0.39968782663345337,
"learning_rate": 2.8783010115640546e-05,
"loss": 0.026,
"step": 10360
},
{
"epoch": 0.44097635652321826,
"grad_norm": 0.4572618901729584,
"learning_rate": 2.877974600244662e-05,
"loss": 0.0267,
"step": 10370
},
{
"epoch": 0.4414015989113795,
"grad_norm": 0.4569093883037567,
"learning_rate": 2.8776477703329696e-05,
"loss": 0.0256,
"step": 10380
},
{
"epoch": 0.44182684129954075,
"grad_norm": 0.5506418943405151,
"learning_rate": 2.877320521928259e-05,
"loss": 0.0292,
"step": 10390
},
{
"epoch": 0.44225208368770197,
"grad_norm": 0.5269330143928528,
"learning_rate": 2.87699285512994e-05,
"loss": 0.0285,
"step": 10400
},
{
"epoch": 0.44267732607586324,
"grad_norm": 0.5772727727890015,
"learning_rate": 2.876664770037549e-05,
"loss": 0.0276,
"step": 10410
},
{
"epoch": 0.4431025684640245,
"grad_norm": 0.47063735127449036,
"learning_rate": 2.8763362667507495e-05,
"loss": 0.0276,
"step": 10420
},
{
"epoch": 0.44352781085218573,
"grad_norm": 0.4042012691497803,
"learning_rate": 2.8760073453693322e-05,
"loss": 0.0248,
"step": 10430
},
{
"epoch": 0.443953053240347,
"grad_norm": 0.5305367112159729,
"learning_rate": 2.8756780059932146e-05,
"loss": 0.0253,
"step": 10440
},
{
"epoch": 0.4443782956285082,
"grad_norm": 0.5205299258232117,
"learning_rate": 2.875348248722442e-05,
"loss": 0.0271,
"step": 10450
},
{
"epoch": 0.4448035380166695,
"grad_norm": 0.47778600454330444,
"learning_rate": 2.8750180736571848e-05,
"loss": 0.0286,
"step": 10460
},
{
"epoch": 0.44522878040483077,
"grad_norm": 0.6566385626792908,
"learning_rate": 2.8746874808977424e-05,
"loss": 0.0276,
"step": 10470
},
{
"epoch": 0.445654022792992,
"grad_norm": 0.6271282434463501,
"learning_rate": 2.87435647054454e-05,
"loss": 0.0275,
"step": 10480
},
{
"epoch": 0.44607926518115326,
"grad_norm": 0.4872870147228241,
"learning_rate": 2.8740250426981292e-05,
"loss": 0.0277,
"step": 10490
},
{
"epoch": 0.44650450756931453,
"grad_norm": 0.5417566299438477,
"learning_rate": 2.8736931974591895e-05,
"loss": 0.0323,
"step": 10500
},
{
"epoch": 0.44692974995747575,
"grad_norm": 0.48499515652656555,
"learning_rate": 2.8733609349285268e-05,
"loss": 0.0239,
"step": 10510
},
{
"epoch": 0.447354992345637,
"grad_norm": 0.5535517334938049,
"learning_rate": 2.8730282552070734e-05,
"loss": 0.0253,
"step": 10520
},
{
"epoch": 0.44778023473379824,
"grad_norm": 0.45956945419311523,
"learning_rate": 2.872695158395889e-05,
"loss": 0.0285,
"step": 10530
},
{
"epoch": 0.4482054771219595,
"grad_norm": 0.6099970936775208,
"learning_rate": 2.872361644596159e-05,
"loss": 0.0291,
"step": 10540
},
{
"epoch": 0.4486307195101208,
"grad_norm": 0.5071263909339905,
"learning_rate": 2.8720277139091972e-05,
"loss": 0.0272,
"step": 10550
},
{
"epoch": 0.449055961898282,
"grad_norm": 0.49766653776168823,
"learning_rate": 2.8716933664364417e-05,
"loss": 0.0262,
"step": 10560
},
{
"epoch": 0.4494812042864433,
"grad_norm": 0.4527025520801544,
"learning_rate": 2.871358602279459e-05,
"loss": 0.0272,
"step": 10570
},
{
"epoch": 0.4499064466746045,
"grad_norm": 0.47787800431251526,
"learning_rate": 2.871023421539942e-05,
"loss": 0.0256,
"step": 10580
},
{
"epoch": 0.45033168906276577,
"grad_norm": 0.6697449088096619,
"learning_rate": 2.8706878243197085e-05,
"loss": 0.0266,
"step": 10590
},
{
"epoch": 0.45075693145092705,
"grad_norm": 0.6518754363059998,
"learning_rate": 2.8703518107207052e-05,
"loss": 0.0271,
"step": 10600
},
{
"epoch": 0.45118217383908826,
"grad_norm": 0.4889909327030182,
"learning_rate": 2.870015380845004e-05,
"loss": 0.0249,
"step": 10610
},
{
"epoch": 0.45160741622724954,
"grad_norm": 0.5126760005950928,
"learning_rate": 2.8696785347948035e-05,
"loss": 0.0283,
"step": 10620
},
{
"epoch": 0.4520326586154108,
"grad_norm": 0.43273210525512695,
"learning_rate": 2.8693412726724282e-05,
"loss": 0.0276,
"step": 10630
},
{
"epoch": 0.452457901003572,
"grad_norm": 0.5461113452911377,
"learning_rate": 2.869003594580329e-05,
"loss": 0.0275,
"step": 10640
},
{
"epoch": 0.4528831433917333,
"grad_norm": 0.5773922801017761,
"learning_rate": 2.868665500621085e-05,
"loss": 0.0288,
"step": 10650
},
{
"epoch": 0.4533083857798945,
"grad_norm": 0.47226929664611816,
"learning_rate": 2.8683269908973992e-05,
"loss": 0.0254,
"step": 10660
},
{
"epoch": 0.4537336281680558,
"grad_norm": 0.4792795777320862,
"learning_rate": 2.867988065512102e-05,
"loss": 0.0261,
"step": 10670
},
{
"epoch": 0.45415887055621706,
"grad_norm": 0.5739585161209106,
"learning_rate": 2.8676487245681507e-05,
"loss": 0.0288,
"step": 10680
},
{
"epoch": 0.4545841129443783,
"grad_norm": 0.48658737540245056,
"learning_rate": 2.867308968168627e-05,
"loss": 0.0288,
"step": 10690
},
{
"epoch": 0.45500935533253956,
"grad_norm": 0.5436782240867615,
"learning_rate": 2.8669687964167404e-05,
"loss": 0.0249,
"step": 10700
},
{
"epoch": 0.4554345977207008,
"grad_norm": 0.49603909254074097,
"learning_rate": 2.8666282094158264e-05,
"loss": 0.0267,
"step": 10710
},
{
"epoch": 0.45585984010886205,
"grad_norm": 0.5125226974487305,
"learning_rate": 2.866287207269346e-05,
"loss": 0.0289,
"step": 10720
},
{
"epoch": 0.4562850824970233,
"grad_norm": 0.447691410779953,
"learning_rate": 2.8659457900808868e-05,
"loss": 0.0273,
"step": 10730
},
{
"epoch": 0.45671032488518454,
"grad_norm": 0.39385440945625305,
"learning_rate": 2.8656039579541628e-05,
"loss": 0.026,
"step": 10740
},
{
"epoch": 0.4571355672733458,
"grad_norm": 0.5474128127098083,
"learning_rate": 2.865261710993012e-05,
"loss": 0.0258,
"step": 10750
},
{
"epoch": 0.4575608096615071,
"grad_norm": 0.5764764547348022,
"learning_rate": 2.864919049301402e-05,
"loss": 0.0272,
"step": 10760
},
{
"epoch": 0.4579860520496683,
"grad_norm": 0.6347236037254333,
"learning_rate": 2.8645759729834227e-05,
"loss": 0.0238,
"step": 10770
},
{
"epoch": 0.4584112944378296,
"grad_norm": 0.5578680038452148,
"learning_rate": 2.864232482143293e-05,
"loss": 0.0244,
"step": 10780
},
{
"epoch": 0.4588365368259908,
"grad_norm": 0.51310133934021,
"learning_rate": 2.8638885768853548e-05,
"loss": 0.0287,
"step": 10790
},
{
"epoch": 0.45926177921415207,
"grad_norm": 0.5770761370658875,
"learning_rate": 2.8635442573140793e-05,
"loss": 0.028,
"step": 10800
},
{
"epoch": 0.45968702160231334,
"grad_norm": 0.47170472145080566,
"learning_rate": 2.8631995235340606e-05,
"loss": 0.028,
"step": 10810
},
{
"epoch": 0.46011226399047456,
"grad_norm": 0.3460540473461151,
"learning_rate": 2.8628543756500192e-05,
"loss": 0.0257,
"step": 10820
},
{
"epoch": 0.46053750637863583,
"grad_norm": 0.40834179520606995,
"learning_rate": 2.862508813766803e-05,
"loss": 0.0243,
"step": 10830
},
{
"epoch": 0.46096274876679705,
"grad_norm": 0.5245689749717712,
"learning_rate": 2.8621628379893837e-05,
"loss": 0.0255,
"step": 10840
},
{
"epoch": 0.4613879911549583,
"grad_norm": 0.4288097321987152,
"learning_rate": 2.861816448422861e-05,
"loss": 0.0263,
"step": 10850
},
{
"epoch": 0.4618132335431196,
"grad_norm": 0.5491788387298584,
"learning_rate": 2.8614696451724573e-05,
"loss": 0.0239,
"step": 10860
},
{
"epoch": 0.4622384759312808,
"grad_norm": 0.6117849349975586,
"learning_rate": 2.861122428343523e-05,
"loss": 0.0259,
"step": 10870
},
{
"epoch": 0.4626637183194421,
"grad_norm": 0.5702128410339355,
"learning_rate": 2.8607747980415333e-05,
"loss": 0.0269,
"step": 10880
},
{
"epoch": 0.46308896070760336,
"grad_norm": 0.663136899471283,
"learning_rate": 2.860426754372089e-05,
"loss": 0.0263,
"step": 10890
},
{
"epoch": 0.4635142030957646,
"grad_norm": 0.4230470061302185,
"learning_rate": 2.860078297440917e-05,
"loss": 0.0267,
"step": 10900
},
{
"epoch": 0.46393944548392585,
"grad_norm": 0.5185224413871765,
"learning_rate": 2.8597294273538687e-05,
"loss": 0.0279,
"step": 10910
},
{
"epoch": 0.46436468787208707,
"grad_norm": 0.509982705116272,
"learning_rate": 2.8593801442169223e-05,
"loss": 0.0264,
"step": 10920
},
{
"epoch": 0.46478993026024834,
"grad_norm": 0.4910407066345215,
"learning_rate": 2.85903044813618e-05,
"loss": 0.027,
"step": 10930
},
{
"epoch": 0.4652151726484096,
"grad_norm": 0.5604979395866394,
"learning_rate": 2.8586803392178715e-05,
"loss": 0.0263,
"step": 10940
},
{
"epoch": 0.46564041503657083,
"grad_norm": 0.49472281336784363,
"learning_rate": 2.858329817568349e-05,
"loss": 0.0247,
"step": 10950
},
{
"epoch": 0.4660656574247321,
"grad_norm": 0.5663736462593079,
"learning_rate": 2.8579788832940925e-05,
"loss": 0.0241,
"step": 10960
},
{
"epoch": 0.4664908998128933,
"grad_norm": 0.5932319760322571,
"learning_rate": 2.8576275365017073e-05,
"loss": 0.0285,
"step": 10970
},
{
"epoch": 0.4669161422010546,
"grad_norm": 0.5209352970123291,
"learning_rate": 2.857275777297922e-05,
"loss": 0.026,
"step": 10980
},
{
"epoch": 0.46734138458921587,
"grad_norm": 0.5121971368789673,
"learning_rate": 2.8569236057895924e-05,
"loss": 0.0273,
"step": 10990
},
{
"epoch": 0.4677666269773771,
"grad_norm": 0.5635631084442139,
"learning_rate": 2.8565710220836994e-05,
"loss": 0.0266,
"step": 11000
},
{
"epoch": 0.46819186936553836,
"grad_norm": 0.42411789298057556,
"learning_rate": 2.8562180262873474e-05,
"loss": 0.0255,
"step": 11010
},
{
"epoch": 0.46861711175369963,
"grad_norm": 0.49095433950424194,
"learning_rate": 2.8558646185077682e-05,
"loss": 0.0277,
"step": 11020
},
{
"epoch": 0.46904235414186085,
"grad_norm": 0.39923784136772156,
"learning_rate": 2.8555107988523177e-05,
"loss": 0.0232,
"step": 11030
},
{
"epoch": 0.4694675965300221,
"grad_norm": 0.5151148438453674,
"learning_rate": 2.8551565674284762e-05,
"loss": 0.0248,
"step": 11040
},
{
"epoch": 0.46989283891818334,
"grad_norm": 0.4995056390762329,
"learning_rate": 2.8548019243438503e-05,
"loss": 0.0249,
"step": 11050
},
{
"epoch": 0.4703180813063446,
"grad_norm": 0.39380818605422974,
"learning_rate": 2.8544468697061715e-05,
"loss": 0.028,
"step": 11060
},
{
"epoch": 0.4707433236945059,
"grad_norm": 0.5959387421607971,
"learning_rate": 2.854091403623296e-05,
"loss": 0.0238,
"step": 11070
},
{
"epoch": 0.4711685660826671,
"grad_norm": 0.4565434455871582,
"learning_rate": 2.8537355262032047e-05,
"loss": 0.0251,
"step": 11080
},
{
"epoch": 0.4715938084708284,
"grad_norm": 0.5781095623970032,
"learning_rate": 2.853379237554004e-05,
"loss": 0.0253,
"step": 11090
},
{
"epoch": 0.4720190508589896,
"grad_norm": 0.4889158010482788,
"learning_rate": 2.853022537783924e-05,
"loss": 0.0245,
"step": 11100
},
{
"epoch": 0.47244429324715087,
"grad_norm": 0.5270752906799316,
"learning_rate": 2.8526654270013223e-05,
"loss": 0.0264,
"step": 11110
},
{
"epoch": 0.47286953563531214,
"grad_norm": 0.5031289458274841,
"learning_rate": 2.8523079053146785e-05,
"loss": 0.0276,
"step": 11120
},
{
"epoch": 0.47329477802347336,
"grad_norm": 0.5778294801712036,
"learning_rate": 2.851949972832599e-05,
"loss": 0.0258,
"step": 11130
},
{
"epoch": 0.47372002041163463,
"grad_norm": 0.5123775601387024,
"learning_rate": 2.851591629663814e-05,
"loss": 0.0246,
"step": 11140
},
{
"epoch": 0.4741452627997959,
"grad_norm": 0.5043962001800537,
"learning_rate": 2.8512328759171783e-05,
"loss": 0.0226,
"step": 11150
},
{
"epoch": 0.4745705051879571,
"grad_norm": 0.41966044902801514,
"learning_rate": 2.8508737117016722e-05,
"loss": 0.0249,
"step": 11160
},
{
"epoch": 0.4749957475761184,
"grad_norm": 0.4980928897857666,
"learning_rate": 2.8505141371264005e-05,
"loss": 0.0256,
"step": 11170
},
{
"epoch": 0.4754209899642796,
"grad_norm": 0.43403160572052,
"learning_rate": 2.8501541523005918e-05,
"loss": 0.0236,
"step": 11180
},
{
"epoch": 0.4758462323524409,
"grad_norm": 0.41079947352409363,
"learning_rate": 2.8497937573336007e-05,
"loss": 0.0273,
"step": 11190
},
{
"epoch": 0.47627147474060216,
"grad_norm": 0.41930046677589417,
"learning_rate": 2.8494329523349053e-05,
"loss": 0.0277,
"step": 11200
},
{
"epoch": 0.4766967171287634,
"grad_norm": 0.356673002243042,
"learning_rate": 2.8490717374141088e-05,
"loss": 0.0272,
"step": 11210
},
{
"epoch": 0.47712195951692465,
"grad_norm": 0.5610805749893188,
"learning_rate": 2.8487101126809384e-05,
"loss": 0.0239,
"step": 11220
},
{
"epoch": 0.47754720190508587,
"grad_norm": 0.44382742047309875,
"learning_rate": 2.8483480782452463e-05,
"loss": 0.0299,
"step": 11230
},
{
"epoch": 0.47797244429324715,
"grad_norm": 0.45406636595726013,
"learning_rate": 2.8479856342170097e-05,
"loss": 0.0241,
"step": 11240
},
{
"epoch": 0.4783976866814084,
"grad_norm": 0.3877098262310028,
"learning_rate": 2.8476227807063283e-05,
"loss": 0.025,
"step": 11250
},
{
"epoch": 0.47882292906956964,
"grad_norm": 0.45907872915267944,
"learning_rate": 2.8472595178234284e-05,
"loss": 0.0246,
"step": 11260
},
{
"epoch": 0.4792481714577309,
"grad_norm": 0.4462440311908722,
"learning_rate": 2.846895845678659e-05,
"loss": 0.023,
"step": 11270
},
{
"epoch": 0.4796734138458922,
"grad_norm": 0.48364442586898804,
"learning_rate": 2.8465317643824945e-05,
"loss": 0.0232,
"step": 11280
},
{
"epoch": 0.4800986562340534,
"grad_norm": 0.5409104824066162,
"learning_rate": 2.8461672740455334e-05,
"loss": 0.0252,
"step": 11290
},
{
"epoch": 0.4805238986222147,
"grad_norm": 0.5658287405967712,
"learning_rate": 2.8458023747784976e-05,
"loss": 0.0232,
"step": 11300
},
{
"epoch": 0.4809491410103759,
"grad_norm": 0.3975183665752411,
"learning_rate": 2.8454370666922343e-05,
"loss": 0.0246,
"step": 11310
},
{
"epoch": 0.48137438339853716,
"grad_norm": 0.3229718804359436,
"learning_rate": 2.8450713498977145e-05,
"loss": 0.024,
"step": 11320
},
{
"epoch": 0.48179962578669844,
"grad_norm": 0.38045093417167664,
"learning_rate": 2.844705224506033e-05,
"loss": 0.0236,
"step": 11330
},
{
"epoch": 0.48222486817485966,
"grad_norm": 0.3866424560546875,
"learning_rate": 2.844338690628409e-05,
"loss": 0.0256,
"step": 11340
},
{
"epoch": 0.48265011056302093,
"grad_norm": 0.42420685291290283,
"learning_rate": 2.8439717483761863e-05,
"loss": 0.0254,
"step": 11350
},
{
"epoch": 0.48307535295118215,
"grad_norm": 0.5029059648513794,
"learning_rate": 2.843604397860832e-05,
"loss": 0.0234,
"step": 11360
},
{
"epoch": 0.4835005953393434,
"grad_norm": 0.47720468044281006,
"learning_rate": 2.8432366391939373e-05,
"loss": 0.0257,
"step": 11370
},
{
"epoch": 0.4839258377275047,
"grad_norm": 0.5582727193832397,
"learning_rate": 2.8428684724872176e-05,
"loss": 0.0254,
"step": 11380
},
{
"epoch": 0.4843510801156659,
"grad_norm": 0.36047664284706116,
"learning_rate": 2.8424998978525123e-05,
"loss": 0.0244,
"step": 11390
},
{
"epoch": 0.4847763225038272,
"grad_norm": 0.4934903383255005,
"learning_rate": 2.842130915401785e-05,
"loss": 0.0263,
"step": 11400
},
{
"epoch": 0.48520156489198846,
"grad_norm": 0.5368421673774719,
"learning_rate": 2.8417615252471226e-05,
"loss": 0.0272,
"step": 11410
},
{
"epoch": 0.4856268072801497,
"grad_norm": 0.5921639800071716,
"learning_rate": 2.841391727500735e-05,
"loss": 0.0225,
"step": 11420
},
{
"epoch": 0.48605204966831095,
"grad_norm": 0.6646623611450195,
"learning_rate": 2.8410215222749593e-05,
"loss": 0.0248,
"step": 11430
},
{
"epoch": 0.48647729205647217,
"grad_norm": 0.5030350089073181,
"learning_rate": 2.840650909682252e-05,
"loss": 0.0238,
"step": 11440
},
{
"epoch": 0.48690253444463344,
"grad_norm": 0.5150179862976074,
"learning_rate": 2.8402798898351963e-05,
"loss": 0.0253,
"step": 11450
},
{
"epoch": 0.4873277768327947,
"grad_norm": 0.5101315379142761,
"learning_rate": 2.8399084628464986e-05,
"loss": 0.0279,
"step": 11460
},
{
"epoch": 0.48775301922095593,
"grad_norm": 0.3990659713745117,
"learning_rate": 2.8395366288289876e-05,
"loss": 0.0234,
"step": 11470
},
{
"epoch": 0.4881782616091172,
"grad_norm": 0.4439125955104828,
"learning_rate": 2.839164387895617e-05,
"loss": 0.0233,
"step": 11480
},
{
"epoch": 0.4886035039972784,
"grad_norm": 0.5056343674659729,
"learning_rate": 2.8387917401594644e-05,
"loss": 0.023,
"step": 11490
},
{
"epoch": 0.4890287463854397,
"grad_norm": 0.38938045501708984,
"learning_rate": 2.8384186857337296e-05,
"loss": 0.0245,
"step": 11500
},
{
"epoch": 0.48945398877360097,
"grad_norm": 0.48951399326324463,
"learning_rate": 2.838045224731737e-05,
"loss": 0.0246,
"step": 11510
},
{
"epoch": 0.4898792311617622,
"grad_norm": 0.4403155446052551,
"learning_rate": 2.8376713572669348e-05,
"loss": 0.0248,
"step": 11520
},
{
"epoch": 0.49030447354992346,
"grad_norm": 0.5052725672721863,
"learning_rate": 2.8372970834528924e-05,
"loss": 0.028,
"step": 11530
},
{
"epoch": 0.49072971593808473,
"grad_norm": 0.4916299879550934,
"learning_rate": 2.836922403403306e-05,
"loss": 0.0265,
"step": 11540
},
{
"epoch": 0.49115495832624595,
"grad_norm": 0.4234394133090973,
"learning_rate": 2.8365473172319926e-05,
"loss": 0.0259,
"step": 11550
},
{
"epoch": 0.4915802007144072,
"grad_norm": 0.4441545307636261,
"learning_rate": 2.8361718250528936e-05,
"loss": 0.0256,
"step": 11560
},
{
"epoch": 0.49200544310256844,
"grad_norm": 0.49031588435173035,
"learning_rate": 2.835795926980074e-05,
"loss": 0.0233,
"step": 11570
},
{
"epoch": 0.4924306854907297,
"grad_norm": 0.6478508114814758,
"learning_rate": 2.8354196231277213e-05,
"loss": 0.0269,
"step": 11580
},
{
"epoch": 0.492855927878891,
"grad_norm": 0.5133887529373169,
"learning_rate": 2.835042913610147e-05,
"loss": 0.0256,
"step": 11590
},
{
"epoch": 0.4932811702670522,
"grad_norm": 0.35141903162002563,
"learning_rate": 2.8346657985417846e-05,
"loss": 0.0245,
"step": 11600
},
{
"epoch": 0.4937064126552135,
"grad_norm": 0.4691372811794281,
"learning_rate": 2.8342882780371932e-05,
"loss": 0.0275,
"step": 11610
},
{
"epoch": 0.49413165504337475,
"grad_norm": 0.4540161192417145,
"learning_rate": 2.833910352211053e-05,
"loss": 0.0292,
"step": 11620
},
{
"epoch": 0.49455689743153597,
"grad_norm": 0.5981717705726624,
"learning_rate": 2.8335320211781667e-05,
"loss": 0.0293,
"step": 11630
},
{
"epoch": 0.49498213981969724,
"grad_norm": 0.5212690234184265,
"learning_rate": 2.8331532850534628e-05,
"loss": 0.0259,
"step": 11640
},
{
"epoch": 0.49540738220785846,
"grad_norm": 0.47971609234809875,
"learning_rate": 2.8327741439519906e-05,
"loss": 0.0267,
"step": 11650
},
{
"epoch": 0.49583262459601973,
"grad_norm": 0.43671026825904846,
"learning_rate": 2.8323945979889233e-05,
"loss": 0.0241,
"step": 11660
},
{
"epoch": 0.496257866984181,
"grad_norm": 0.5726038217544556,
"learning_rate": 2.8320146472795572e-05,
"loss": 0.0245,
"step": 11670
},
{
"epoch": 0.4966831093723422,
"grad_norm": 0.41962766647338867,
"learning_rate": 2.831634291939311e-05,
"loss": 0.0233,
"step": 11680
},
{
"epoch": 0.4971083517605035,
"grad_norm": 0.49509820342063904,
"learning_rate": 2.8312535320837266e-05,
"loss": 0.0247,
"step": 11690
},
{
"epoch": 0.4975335941486647,
"grad_norm": 0.44511812925338745,
"learning_rate": 2.8308723678284687e-05,
"loss": 0.0264,
"step": 11700
},
{
"epoch": 0.497958836536826,
"grad_norm": 0.42542317509651184,
"learning_rate": 2.8304907992893254e-05,
"loss": 0.0238,
"step": 11710
},
{
"epoch": 0.49838407892498726,
"grad_norm": 0.5425060391426086,
"learning_rate": 2.8301088265822066e-05,
"loss": 0.0234,
"step": 11720
},
{
"epoch": 0.4988093213131485,
"grad_norm": 0.520889937877655,
"learning_rate": 2.829726449823146e-05,
"loss": 0.025,
"step": 11730
},
{
"epoch": 0.49923456370130975,
"grad_norm": 0.49344781041145325,
"learning_rate": 2.8293436691282993e-05,
"loss": 0.025,
"step": 11740
},
{
"epoch": 0.499659806089471,
"grad_norm": 0.41864946484565735,
"learning_rate": 2.828960484613945e-05,
"loss": 0.0239,
"step": 11750
},
{
"epoch": 0.5000850484776322,
"grad_norm": 0.34268102049827576,
"learning_rate": 2.828576896396485e-05,
"loss": 0.0228,
"step": 11760
},
{
"epoch": 0.5005102908657935,
"grad_norm": 0.37487515807151794,
"learning_rate": 2.828192904592443e-05,
"loss": 0.0239,
"step": 11770
},
{
"epoch": 0.5009355332539548,
"grad_norm": 0.4595807194709778,
"learning_rate": 2.8278085093184648e-05,
"loss": 0.0223,
"step": 11780
},
{
"epoch": 0.501360775642116,
"grad_norm": 0.5990208983421326,
"learning_rate": 2.827423710691321e-05,
"loss": 0.0233,
"step": 11790
},
{
"epoch": 0.5017860180302772,
"grad_norm": 0.7175838947296143,
"learning_rate": 2.827038508827902e-05,
"loss": 0.026,
"step": 11800
},
{
"epoch": 0.5022112604184386,
"grad_norm": 0.6296750903129578,
"learning_rate": 2.8266529038452223e-05,
"loss": 0.0258,
"step": 11810
},
{
"epoch": 0.5026365028065998,
"grad_norm": 0.5214424729347229,
"learning_rate": 2.826266895860419e-05,
"loss": 0.0242,
"step": 11820
},
{
"epoch": 0.503061745194761,
"grad_norm": 0.5066425800323486,
"learning_rate": 2.8258804849907508e-05,
"loss": 0.0257,
"step": 11830
},
{
"epoch": 0.5034869875829223,
"grad_norm": 0.5052611827850342,
"learning_rate": 2.825493671353599e-05,
"loss": 0.0244,
"step": 11840
},
{
"epoch": 0.5039122299710835,
"grad_norm": 0.4577106237411499,
"learning_rate": 2.8251064550664673e-05,
"loss": 0.024,
"step": 11850
},
{
"epoch": 0.5043374723592448,
"grad_norm": 0.5954710245132446,
"learning_rate": 2.8247188362469823e-05,
"loss": 0.0239,
"step": 11860
},
{
"epoch": 0.504762714747406,
"grad_norm": 0.4054025709629059,
"learning_rate": 2.824330815012892e-05,
"loss": 0.0265,
"step": 11870
},
{
"epoch": 0.5051879571355673,
"grad_norm": 0.6234331727027893,
"learning_rate": 2.8239423914820668e-05,
"loss": 0.0261,
"step": 11880
},
{
"epoch": 0.5056131995237285,
"grad_norm": 0.5679658651351929,
"learning_rate": 2.8235535657724997e-05,
"loss": 0.0241,
"step": 11890
},
{
"epoch": 0.5060384419118897,
"grad_norm": 0.5622302293777466,
"learning_rate": 2.823164338002306e-05,
"loss": 0.0244,
"step": 11900
},
{
"epoch": 0.5064636843000511,
"grad_norm": 0.6095677018165588,
"learning_rate": 2.8227747082897223e-05,
"loss": 0.0265,
"step": 11910
},
{
"epoch": 0.5068889266882123,
"grad_norm": 0.406110554933548,
"learning_rate": 2.8223846767531084e-05,
"loss": 0.0257,
"step": 11920
},
{
"epoch": 0.5073141690763735,
"grad_norm": 0.4320657551288605,
"learning_rate": 2.821994243510945e-05,
"loss": 0.0291,
"step": 11930
},
{
"epoch": 0.5077394114645348,
"grad_norm": 0.40571796894073486,
"learning_rate": 2.821603408681835e-05,
"loss": 0.0281,
"step": 11940
},
{
"epoch": 0.508164653852696,
"grad_norm": 0.44595959782600403,
"learning_rate": 2.8212121723845054e-05,
"loss": 0.0241,
"step": 11950
},
{
"epoch": 0.5085898962408573,
"grad_norm": 0.36468973755836487,
"learning_rate": 2.820820534737802e-05,
"loss": 0.0284,
"step": 11960
},
{
"epoch": 0.5090151386290186,
"grad_norm": 0.42328399419784546,
"learning_rate": 2.8204284958606946e-05,
"loss": 0.0237,
"step": 11970
},
{
"epoch": 0.5094403810171798,
"grad_norm": 0.4866717755794525,
"learning_rate": 2.8200360558722738e-05,
"loss": 0.0239,
"step": 11980
},
{
"epoch": 0.509865623405341,
"grad_norm": 0.4124220311641693,
"learning_rate": 2.819643214891753e-05,
"loss": 0.0231,
"step": 11990
},
{
"epoch": 0.5102908657935022,
"grad_norm": 0.5854120850563049,
"learning_rate": 2.8192499730384668e-05,
"loss": 0.0265,
"step": 12000
},
{
"epoch": 0.5107161081816636,
"grad_norm": 0.40588247776031494,
"learning_rate": 2.818856330431871e-05,
"loss": 0.0271,
"step": 12010
},
{
"epoch": 0.5111413505698248,
"grad_norm": 0.427511990070343,
"learning_rate": 2.8184622871915446e-05,
"loss": 0.0238,
"step": 12020
},
{
"epoch": 0.511566592957986,
"grad_norm": 0.4201594293117523,
"learning_rate": 2.8180678434371874e-05,
"loss": 0.0244,
"step": 12030
},
{
"epoch": 0.5119918353461473,
"grad_norm": 0.3546863794326782,
"learning_rate": 2.817672999288621e-05,
"loss": 0.0227,
"step": 12040
},
{
"epoch": 0.5124170777343086,
"grad_norm": 0.36736375093460083,
"learning_rate": 2.8172777548657886e-05,
"loss": 0.0242,
"step": 12050
},
{
"epoch": 0.5128423201224698,
"grad_norm": 0.5919041633605957,
"learning_rate": 2.8168821102887545e-05,
"loss": 0.0243,
"step": 12060
},
{
"epoch": 0.5132675625106311,
"grad_norm": 0.5262941718101501,
"learning_rate": 2.816486065677706e-05,
"loss": 0.0219,
"step": 12070
},
{
"epoch": 0.5136928048987923,
"grad_norm": 0.4629955291748047,
"learning_rate": 2.8160896211529506e-05,
"loss": 0.025,
"step": 12080
},
{
"epoch": 0.5141180472869535,
"grad_norm": 0.4440573751926422,
"learning_rate": 2.8156927768349175e-05,
"loss": 0.0228,
"step": 12090
},
{
"epoch": 0.5145432896751149,
"grad_norm": 0.34329354763031006,
"learning_rate": 2.815295532844158e-05,
"loss": 0.0237,
"step": 12100
},
{
"epoch": 0.5149685320632761,
"grad_norm": 0.42704781889915466,
"learning_rate": 2.8148978893013434e-05,
"loss": 0.0233,
"step": 12110
},
{
"epoch": 0.5153937744514373,
"grad_norm": 0.4133966565132141,
"learning_rate": 2.814499846327268e-05,
"loss": 0.0243,
"step": 12120
},
{
"epoch": 0.5158190168395985,
"grad_norm": 0.4624313712120056,
"learning_rate": 2.8141014040428468e-05,
"loss": 0.0241,
"step": 12130
},
{
"epoch": 0.5162442592277598,
"grad_norm": 0.5375241041183472,
"learning_rate": 2.8137025625691163e-05,
"loss": 0.0234,
"step": 12140
},
{
"epoch": 0.5166695016159211,
"grad_norm": 0.44683730602264404,
"learning_rate": 2.813303322027233e-05,
"loss": 0.0227,
"step": 12150
},
{
"epoch": 0.5170947440040823,
"grad_norm": 0.4118146598339081,
"learning_rate": 2.8129036825384763e-05,
"loss": 0.0248,
"step": 12160
},
{
"epoch": 0.5175199863922436,
"grad_norm": 0.4741264581680298,
"learning_rate": 2.8125036442242466e-05,
"loss": 0.0229,
"step": 12170
},
{
"epoch": 0.5179452287804048,
"grad_norm": 0.5060192942619324,
"learning_rate": 2.8121032072060635e-05,
"loss": 0.025,
"step": 12180
},
{
"epoch": 0.518370471168566,
"grad_norm": 0.44125476479530334,
"learning_rate": 2.811702371605571e-05,
"loss": 0.0231,
"step": 12190
},
{
"epoch": 0.5187957135567274,
"grad_norm": 0.45274418592453003,
"learning_rate": 2.8113011375445308e-05,
"loss": 0.0252,
"step": 12200
},
{
"epoch": 0.5192209559448886,
"grad_norm": 0.4670751392841339,
"learning_rate": 2.8108995051448284e-05,
"loss": 0.0266,
"step": 12210
},
{
"epoch": 0.5196461983330498,
"grad_norm": 0.41676709055900574,
"learning_rate": 2.8104974745284685e-05,
"loss": 0.0218,
"step": 12220
},
{
"epoch": 0.5200714407212111,
"grad_norm": 0.4893173575401306,
"learning_rate": 2.8100950458175775e-05,
"loss": 0.0254,
"step": 12230
},
{
"epoch": 0.5204966831093724,
"grad_norm": 0.49697211384773254,
"learning_rate": 2.8096922191344027e-05,
"loss": 0.0256,
"step": 12240
},
{
"epoch": 0.5209219254975336,
"grad_norm": 0.4751816987991333,
"learning_rate": 2.809288994601312e-05,
"loss": 0.0213,
"step": 12250
},
{
"epoch": 0.5213471678856948,
"grad_norm": 0.45922544598579407,
"learning_rate": 2.8088853723407946e-05,
"loss": 0.0231,
"step": 12260
},
{
"epoch": 0.5217724102738561,
"grad_norm": 0.568095326423645,
"learning_rate": 2.8084813524754602e-05,
"loss": 0.0265,
"step": 12270
},
{
"epoch": 0.5221976526620173,
"grad_norm": 0.5587965846061707,
"learning_rate": 2.8080769351280395e-05,
"loss": 0.0224,
"step": 12280
},
{
"epoch": 0.5226228950501786,
"grad_norm": 0.4851609468460083,
"learning_rate": 2.8076721204213835e-05,
"loss": 0.0217,
"step": 12290
},
{
"epoch": 0.5230481374383399,
"grad_norm": 0.5293141603469849,
"learning_rate": 2.8072669084784642e-05,
"loss": 0.0268,
"step": 12300
},
{
"epoch": 0.5234733798265011,
"grad_norm": 0.511161208152771,
"learning_rate": 2.8068612994223747e-05,
"loss": 0.0249,
"step": 12310
},
{
"epoch": 0.5238986222146623,
"grad_norm": 0.4113911986351013,
"learning_rate": 2.8064552933763284e-05,
"loss": 0.0242,
"step": 12320
},
{
"epoch": 0.5243238646028237,
"grad_norm": 0.4587953984737396,
"learning_rate": 2.8060488904636586e-05,
"loss": 0.0225,
"step": 12330
},
{
"epoch": 0.5247491069909849,
"grad_norm": 0.4895862340927124,
"learning_rate": 2.80564209080782e-05,
"loss": 0.0248,
"step": 12340
},
{
"epoch": 0.5251743493791461,
"grad_norm": 0.573306679725647,
"learning_rate": 2.8052348945323877e-05,
"loss": 0.0255,
"step": 12350
},
{
"epoch": 0.5255995917673074,
"grad_norm": 0.39072391390800476,
"learning_rate": 2.8048273017610574e-05,
"loss": 0.0234,
"step": 12360
},
{
"epoch": 0.5260248341554686,
"grad_norm": 0.5370450019836426,
"learning_rate": 2.804419312617645e-05,
"loss": 0.0247,
"step": 12370
},
{
"epoch": 0.5264500765436299,
"grad_norm": 0.5759648084640503,
"learning_rate": 2.8040109272260862e-05,
"loss": 0.0226,
"step": 12380
},
{
"epoch": 0.5268753189317911,
"grad_norm": 0.41309723258018494,
"learning_rate": 2.8036021457104384e-05,
"loss": 0.023,
"step": 12390
},
{
"epoch": 0.5273005613199524,
"grad_norm": 0.4322841465473175,
"learning_rate": 2.803192968194878e-05,
"loss": 0.0247,
"step": 12400
},
{
"epoch": 0.5277258037081136,
"grad_norm": 0.39251482486724854,
"learning_rate": 2.8027833948037032e-05,
"loss": 0.0198,
"step": 12410
},
{
"epoch": 0.5281510460962748,
"grad_norm": 0.5345434546470642,
"learning_rate": 2.802373425661331e-05,
"loss": 0.0235,
"step": 12420
},
{
"epoch": 0.5285762884844362,
"grad_norm": 0.4239960014820099,
"learning_rate": 2.8019630608922988e-05,
"loss": 0.025,
"step": 12430
},
{
"epoch": 0.5290015308725974,
"grad_norm": 0.38847601413726807,
"learning_rate": 2.801552300621265e-05,
"loss": 0.0227,
"step": 12440
},
{
"epoch": 0.5294267732607586,
"grad_norm": 0.3944767415523529,
"learning_rate": 2.8011411449730084e-05,
"loss": 0.0221,
"step": 12450
},
{
"epoch": 0.5298520156489199,
"grad_norm": 0.3829469084739685,
"learning_rate": 2.800729594072426e-05,
"loss": 0.0229,
"step": 12460
},
{
"epoch": 0.5302772580370811,
"grad_norm": 0.49871960282325745,
"learning_rate": 2.8003176480445373e-05,
"loss": 0.0252,
"step": 12470
},
{
"epoch": 0.5307025004252424,
"grad_norm": 0.479323148727417,
"learning_rate": 2.7999053070144793e-05,
"loss": 0.0222,
"step": 12480
},
{
"epoch": 0.5311277428134037,
"grad_norm": 0.4209604859352112,
"learning_rate": 2.7994925711075117e-05,
"loss": 0.0265,
"step": 12490
},
{
"epoch": 0.5315529852015649,
"grad_norm": 0.44869011640548706,
"learning_rate": 2.7990794404490118e-05,
"loss": 0.0224,
"step": 12500
},
{
"epoch": 0.5319782275897261,
"grad_norm": 0.4455290138721466,
"learning_rate": 2.798665915164478e-05,
"loss": 0.0247,
"step": 12510
},
{
"epoch": 0.5324034699778873,
"grad_norm": 0.4481496810913086,
"learning_rate": 2.7982519953795284e-05,
"loss": 0.0214,
"step": 12520
},
{
"epoch": 0.5328287123660487,
"grad_norm": 0.38179811835289,
"learning_rate": 2.7978376812199013e-05,
"loss": 0.0231,
"step": 12530
},
{
"epoch": 0.5332539547542099,
"grad_norm": 0.44815391302108765,
"learning_rate": 2.797422972811454e-05,
"loss": 0.0209,
"step": 12540
},
{
"epoch": 0.5336791971423711,
"grad_norm": 0.5632169842720032,
"learning_rate": 2.7970078702801635e-05,
"loss": 0.0227,
"step": 12550
},
{
"epoch": 0.5341044395305324,
"grad_norm": 0.4357597231864929,
"learning_rate": 2.7965923737521283e-05,
"loss": 0.0255,
"step": 12560
},
{
"epoch": 0.5345296819186937,
"grad_norm": 0.47203293442726135,
"learning_rate": 2.796176483353564e-05,
"loss": 0.0238,
"step": 12570
},
{
"epoch": 0.5349549243068549,
"grad_norm": 0.4765610694885254,
"learning_rate": 2.795760199210808e-05,
"loss": 0.0264,
"step": 12580
},
{
"epoch": 0.5353801666950162,
"grad_norm": 0.46927255392074585,
"learning_rate": 2.7953435214503157e-05,
"loss": 0.0208,
"step": 12590
},
{
"epoch": 0.5358054090831774,
"grad_norm": 0.48647135496139526,
"learning_rate": 2.7949264501986634e-05,
"loss": 0.022,
"step": 12600
},
{
"epoch": 0.5362306514713386,
"grad_norm": 0.39700639247894287,
"learning_rate": 2.794508985582546e-05,
"loss": 0.0193,
"step": 12610
},
{
"epoch": 0.5366558938595,
"grad_norm": 0.43265703320503235,
"learning_rate": 2.7940911277287777e-05,
"loss": 0.0216,
"step": 12620
},
{
"epoch": 0.5370811362476612,
"grad_norm": 0.5016583204269409,
"learning_rate": 2.793672876764294e-05,
"loss": 0.0249,
"step": 12630
},
{
"epoch": 0.5375063786358224,
"grad_norm": 0.4855128228664398,
"learning_rate": 2.7932542328161478e-05,
"loss": 0.0244,
"step": 12640
},
{
"epoch": 0.5379316210239836,
"grad_norm": 0.46733221411705017,
"learning_rate": 2.792835196011512e-05,
"loss": 0.0211,
"step": 12650
},
{
"epoch": 0.538356863412145,
"grad_norm": 0.44542211294174194,
"learning_rate": 2.792415766477679e-05,
"loss": 0.0234,
"step": 12660
},
{
"epoch": 0.5387821058003062,
"grad_norm": 0.3643878400325775,
"learning_rate": 2.79199594434206e-05,
"loss": 0.0219,
"step": 12670
},
{
"epoch": 0.5392073481884674,
"grad_norm": 0.3483383357524872,
"learning_rate": 2.7915757297321867e-05,
"loss": 0.0213,
"step": 12680
},
{
"epoch": 0.5396325905766287,
"grad_norm": 0.49537405371665955,
"learning_rate": 2.7911551227757085e-05,
"loss": 0.0249,
"step": 12690
},
{
"epoch": 0.5400578329647899,
"grad_norm": 0.40428227186203003,
"learning_rate": 2.790734123600395e-05,
"loss": 0.0217,
"step": 12700
},
{
"epoch": 0.5404830753529511,
"grad_norm": 0.4799230396747589,
"learning_rate": 2.7903127323341347e-05,
"loss": 0.0236,
"step": 12710
},
{
"epoch": 0.5409083177411125,
"grad_norm": 0.4571576416492462,
"learning_rate": 2.7898909491049353e-05,
"loss": 0.0255,
"step": 12720
},
{
"epoch": 0.5413335601292737,
"grad_norm": 0.38204875588417053,
"learning_rate": 2.789468774040923e-05,
"loss": 0.0264,
"step": 12730
},
{
"epoch": 0.5417588025174349,
"grad_norm": 0.5351446270942688,
"learning_rate": 2.7890462072703433e-05,
"loss": 0.0213,
"step": 12740
},
{
"epoch": 0.5421840449055962,
"grad_norm": 0.44234591722488403,
"learning_rate": 2.7886232489215617e-05,
"loss": 0.022,
"step": 12750
},
{
"epoch": 0.5426092872937575,
"grad_norm": 0.3830717206001282,
"learning_rate": 2.7881998991230606e-05,
"loss": 0.0241,
"step": 12760
},
{
"epoch": 0.5430345296819187,
"grad_norm": 0.44614824652671814,
"learning_rate": 2.7877761580034436e-05,
"loss": 0.0225,
"step": 12770
},
{
"epoch": 0.5434597720700799,
"grad_norm": 0.5328401327133179,
"learning_rate": 2.787352025691431e-05,
"loss": 0.0235,
"step": 12780
},
{
"epoch": 0.5438850144582412,
"grad_norm": 0.4961501657962799,
"learning_rate": 2.7869275023158642e-05,
"loss": 0.024,
"step": 12790
},
{
"epoch": 0.5443102568464024,
"grad_norm": 0.4256893992424011,
"learning_rate": 2.7865025880057014e-05,
"loss": 0.0254,
"step": 12800
},
{
"epoch": 0.5447354992345637,
"grad_norm": 0.40367719531059265,
"learning_rate": 2.78607728289002e-05,
"loss": 0.0214,
"step": 12810
},
{
"epoch": 0.545160741622725,
"grad_norm": 0.3512210249900818,
"learning_rate": 2.7856515870980176e-05,
"loss": 0.0231,
"step": 12820
},
{
"epoch": 0.5455859840108862,
"grad_norm": 0.5113899111747742,
"learning_rate": 2.785225500759008e-05,
"loss": 0.0227,
"step": 12830
},
{
"epoch": 0.5460112263990474,
"grad_norm": 0.42159557342529297,
"learning_rate": 2.7847990240024263e-05,
"loss": 0.0234,
"step": 12840
},
{
"epoch": 0.5464364687872088,
"grad_norm": 0.6201958060264587,
"learning_rate": 2.784372156957824e-05,
"loss": 0.0243,
"step": 12850
},
{
"epoch": 0.54686171117537,
"grad_norm": 0.5295283198356628,
"learning_rate": 2.7839448997548728e-05,
"loss": 0.0229,
"step": 12860
},
{
"epoch": 0.5472869535635312,
"grad_norm": 0.34550079703330994,
"learning_rate": 2.783517252523361e-05,
"loss": 0.0236,
"step": 12870
},
{
"epoch": 0.5477121959516925,
"grad_norm": 0.4696100354194641,
"learning_rate": 2.7830892153931975e-05,
"loss": 0.0224,
"step": 12880
},
{
"epoch": 0.5481374383398537,
"grad_norm": 0.5960995554924011,
"learning_rate": 2.7826607884944083e-05,
"loss": 0.024,
"step": 12890
},
{
"epoch": 0.548562680728015,
"grad_norm": 0.43218421936035156,
"learning_rate": 2.782231971957138e-05,
"loss": 0.021,
"step": 12900
},
{
"epoch": 0.5489879231161762,
"grad_norm": 0.3802073299884796,
"learning_rate": 2.78180276591165e-05,
"loss": 0.0212,
"step": 12910
},
{
"epoch": 0.5494131655043375,
"grad_norm": 0.5046935081481934,
"learning_rate": 2.781373170488326e-05,
"loss": 0.0243,
"step": 12920
},
{
"epoch": 0.5498384078924987,
"grad_norm": 0.3973044753074646,
"learning_rate": 2.7809431858176655e-05,
"loss": 0.0258,
"step": 12930
},
{
"epoch": 0.5502636502806599,
"grad_norm": 0.48014792799949646,
"learning_rate": 2.7805128120302865e-05,
"loss": 0.0206,
"step": 12940
},
{
"epoch": 0.5506888926688213,
"grad_norm": 0.444076269865036,
"learning_rate": 2.7800820492569254e-05,
"loss": 0.0252,
"step": 12950
},
{
"epoch": 0.5511141350569825,
"grad_norm": 0.37103697657585144,
"learning_rate": 2.779650897628436e-05,
"loss": 0.0221,
"step": 12960
},
{
"epoch": 0.5515393774451437,
"grad_norm": 1.3059979677200317,
"learning_rate": 2.7792193572757915e-05,
"loss": 0.0217,
"step": 12970
},
{
"epoch": 0.551964619833305,
"grad_norm": 0.48660945892333984,
"learning_rate": 2.7787874283300812e-05,
"loss": 0.0232,
"step": 12980
},
{
"epoch": 0.5523898622214662,
"grad_norm": 0.4161160886287689,
"learning_rate": 2.7783551109225155e-05,
"loss": 0.0233,
"step": 12990
},
{
"epoch": 0.5528151046096275,
"grad_norm": 0.5009236335754395,
"learning_rate": 2.7779224051844195e-05,
"loss": 0.0256,
"step": 13000
},
{
"epoch": 0.5532403469977888,
"grad_norm": 0.5999890565872192,
"learning_rate": 2.777489311247239e-05,
"loss": 0.0209,
"step": 13010
},
{
"epoch": 0.55366558938595,
"grad_norm": 0.42301124334335327,
"learning_rate": 2.7770558292425355e-05,
"loss": 0.0232,
"step": 13020
},
{
"epoch": 0.5540908317741112,
"grad_norm": 0.4028618037700653,
"learning_rate": 2.77662195930199e-05,
"loss": 0.0223,
"step": 13030
},
{
"epoch": 0.5545160741622724,
"grad_norm": 0.3619602620601654,
"learning_rate": 2.7761877015574005e-05,
"loss": 0.0238,
"step": 13040
},
{
"epoch": 0.5549413165504338,
"grad_norm": 0.3421330153942108,
"learning_rate": 2.775753056140683e-05,
"loss": 0.0249,
"step": 13050
},
{
"epoch": 0.555366558938595,
"grad_norm": 0.4019469618797302,
"learning_rate": 2.775318023183871e-05,
"loss": 0.0236,
"step": 13060
},
{
"epoch": 0.5557918013267562,
"grad_norm": 0.43583133816719055,
"learning_rate": 2.774882602819117e-05,
"loss": 0.0203,
"step": 13070
},
{
"epoch": 0.5562170437149175,
"grad_norm": 0.4878927171230316,
"learning_rate": 2.7744467951786892e-05,
"loss": 0.0235,
"step": 13080
},
{
"epoch": 0.5566422861030788,
"grad_norm": 0.3605678081512451,
"learning_rate": 2.7740106003949747e-05,
"loss": 0.0225,
"step": 13090
},
{
"epoch": 0.55706752849124,
"grad_norm": 0.43232229351997375,
"learning_rate": 2.7735740186004785e-05,
"loss": 0.0214,
"step": 13100
},
{
"epoch": 0.5574927708794013,
"grad_norm": 0.46627750992774963,
"learning_rate": 2.7731370499278222e-05,
"loss": 0.0237,
"step": 13110
},
{
"epoch": 0.5579180132675625,
"grad_norm": 0.405368447303772,
"learning_rate": 2.772699694509745e-05,
"loss": 0.0234,
"step": 13120
},
{
"epoch": 0.5583432556557237,
"grad_norm": 0.4305224120616913,
"learning_rate": 2.7722619524791046e-05,
"loss": 0.0235,
"step": 13130
},
{
"epoch": 0.5587684980438851,
"grad_norm": 0.4244700074195862,
"learning_rate": 2.7718238239688747e-05,
"loss": 0.0209,
"step": 13140
},
{
"epoch": 0.5591937404320463,
"grad_norm": 0.4601515531539917,
"learning_rate": 2.7713853091121484e-05,
"loss": 0.0211,
"step": 13150
},
{
"epoch": 0.5596189828202075,
"grad_norm": 0.4351855516433716,
"learning_rate": 2.7709464080421338e-05,
"loss": 0.0215,
"step": 13160
},
{
"epoch": 0.5600442252083687,
"grad_norm": 0.43480363488197327,
"learning_rate": 2.770507120892158e-05,
"loss": 0.0212,
"step": 13170
},
{
"epoch": 0.56046946759653,
"grad_norm": 0.3325427770614624,
"learning_rate": 2.7700674477956643e-05,
"loss": 0.021,
"step": 13180
},
{
"epoch": 0.5608947099846913,
"grad_norm": 0.4659954905509949,
"learning_rate": 2.7696273888862145e-05,
"loss": 0.023,
"step": 13190
},
{
"epoch": 0.5613199523728525,
"grad_norm": 0.4178023636341095,
"learning_rate": 2.7691869442974864e-05,
"loss": 0.0199,
"step": 13200
},
{
"epoch": 0.5617451947610138,
"grad_norm": 0.4274429678916931,
"learning_rate": 2.7687461141632754e-05,
"loss": 0.0212,
"step": 13210
},
{
"epoch": 0.562170437149175,
"grad_norm": 0.44344186782836914,
"learning_rate": 2.7683048986174945e-05,
"loss": 0.0219,
"step": 13220
},
{
"epoch": 0.5625956795373362,
"grad_norm": 0.4258417785167694,
"learning_rate": 2.7678632977941727e-05,
"loss": 0.0229,
"step": 13230
},
{
"epoch": 0.5630209219254976,
"grad_norm": 0.44375917315483093,
"learning_rate": 2.767421311827457e-05,
"loss": 0.021,
"step": 13240
},
{
"epoch": 0.5634461643136588,
"grad_norm": 0.38370025157928467,
"learning_rate": 2.7669789408516117e-05,
"loss": 0.0217,
"step": 13250
},
{
"epoch": 0.56387140670182,
"grad_norm": 0.4034651219844818,
"learning_rate": 2.766536185001016e-05,
"loss": 0.0248,
"step": 13260
},
{
"epoch": 0.5642966490899813,
"grad_norm": 0.3681996464729309,
"learning_rate": 2.7660930444101692e-05,
"loss": 0.0227,
"step": 13270
},
{
"epoch": 0.5647218914781426,
"grad_norm": 0.48673391342163086,
"learning_rate": 2.7656495192136842e-05,
"loss": 0.0225,
"step": 13280
},
{
"epoch": 0.5651471338663038,
"grad_norm": 0.34889769554138184,
"learning_rate": 2.7652056095462926e-05,
"loss": 0.0241,
"step": 13290
},
{
"epoch": 0.565572376254465,
"grad_norm": 0.39446333050727844,
"learning_rate": 2.764761315542843e-05,
"loss": 0.0231,
"step": 13300
},
{
"epoch": 0.5659976186426263,
"grad_norm": 0.42651620507240295,
"learning_rate": 2.7643166373383e-05,
"loss": 0.0207,
"step": 13310
},
{
"epoch": 0.5664228610307875,
"grad_norm": 0.44455501437187195,
"learning_rate": 2.763871575067744e-05,
"loss": 0.0263,
"step": 13320
},
{
"epoch": 0.5668481034189488,
"grad_norm": 0.40686893463134766,
"learning_rate": 2.763426128866375e-05,
"loss": 0.0227,
"step": 13330
},
{
"epoch": 0.5672733458071101,
"grad_norm": 0.6360083818435669,
"learning_rate": 2.7629802988695064e-05,
"loss": 0.0232,
"step": 13340
},
{
"epoch": 0.5676985881952713,
"grad_norm": 0.47458934783935547,
"learning_rate": 2.7625340852125703e-05,
"loss": 0.0207,
"step": 13350
},
{
"epoch": 0.5681238305834325,
"grad_norm": 0.518570601940155,
"learning_rate": 2.7620874880311148e-05,
"loss": 0.0242,
"step": 13360
},
{
"epoch": 0.5685490729715938,
"grad_norm": 0.36844056844711304,
"learning_rate": 2.7616405074608033e-05,
"loss": 0.0233,
"step": 13370
},
{
"epoch": 0.5689743153597551,
"grad_norm": 0.4950474202632904,
"learning_rate": 2.7611931436374173e-05,
"loss": 0.0223,
"step": 13380
},
{
"epoch": 0.5693995577479163,
"grad_norm": 0.3924030661582947,
"learning_rate": 2.7607453966968545e-05,
"loss": 0.0216,
"step": 13390
},
{
"epoch": 0.5698248001360776,
"grad_norm": 0.44798216223716736,
"learning_rate": 2.760297266775128e-05,
"loss": 0.0235,
"step": 13400
},
{
"epoch": 0.5702500425242388,
"grad_norm": 0.4270899295806885,
"learning_rate": 2.759848754008368e-05,
"loss": 0.0231,
"step": 13410
},
{
"epoch": 0.5706752849124,
"grad_norm": 0.38340136408805847,
"learning_rate": 2.759399858532821e-05,
"loss": 0.0238,
"step": 13420
},
{
"epoch": 0.5711005273005613,
"grad_norm": 0.38224777579307556,
"learning_rate": 2.75895058048485e-05,
"loss": 0.024,
"step": 13430
},
{
"epoch": 0.5715257696887226,
"grad_norm": 0.3722180128097534,
"learning_rate": 2.7585009200009326e-05,
"loss": 0.0231,
"step": 13440
},
{
"epoch": 0.5719510120768838,
"grad_norm": 0.4371410608291626,
"learning_rate": 2.7580508772176648e-05,
"loss": 0.021,
"step": 13450
},
{
"epoch": 0.572376254465045,
"grad_norm": 0.5341368317604065,
"learning_rate": 2.7576004522717577e-05,
"loss": 0.0227,
"step": 13460
},
{
"epoch": 0.5728014968532064,
"grad_norm": 0.4262879192829132,
"learning_rate": 2.7571496453000377e-05,
"loss": 0.0236,
"step": 13470
},
{
"epoch": 0.5732267392413676,
"grad_norm": 0.5993068814277649,
"learning_rate": 2.7566984564394487e-05,
"loss": 0.0242,
"step": 13480
},
{
"epoch": 0.5736519816295288,
"grad_norm": 0.5745378136634827,
"learning_rate": 2.7562468858270498e-05,
"loss": 0.0225,
"step": 13490
},
{
"epoch": 0.5740772240176901,
"grad_norm": 0.5052975416183472,
"learning_rate": 2.7557949336000165e-05,
"loss": 0.0233,
"step": 13500
},
{
"epoch": 0.5745024664058513,
"grad_norm": 0.463056743144989,
"learning_rate": 2.7553425998956395e-05,
"loss": 0.0227,
"step": 13510
},
{
"epoch": 0.5749277087940126,
"grad_norm": 0.5643817782402039,
"learning_rate": 2.7548898848513264e-05,
"loss": 0.0207,
"step": 13520
},
{
"epoch": 0.5753529511821739,
"grad_norm": 0.514615535736084,
"learning_rate": 2.754436788604599e-05,
"loss": 0.0209,
"step": 13530
},
{
"epoch": 0.5757781935703351,
"grad_norm": 0.40102750062942505,
"learning_rate": 2.7539833112930973e-05,
"loss": 0.0224,
"step": 13540
},
{
"epoch": 0.5762034359584963,
"grad_norm": 0.4101232588291168,
"learning_rate": 2.753529453054575e-05,
"loss": 0.0239,
"step": 13550
},
{
"epoch": 0.5766286783466575,
"grad_norm": 0.3861386775970459,
"learning_rate": 2.7530752140269028e-05,
"loss": 0.0213,
"step": 13560
},
{
"epoch": 0.5770539207348189,
"grad_norm": 0.45340144634246826,
"learning_rate": 2.7526205943480657e-05,
"loss": 0.023,
"step": 13570
},
{
"epoch": 0.5774791631229801,
"grad_norm": 0.39935538172721863,
"learning_rate": 2.7521655941561653e-05,
"loss": 0.0226,
"step": 13580
},
{
"epoch": 0.5779044055111413,
"grad_norm": 0.32109445333480835,
"learning_rate": 2.751710213589419e-05,
"loss": 0.0242,
"step": 13590
},
{
"epoch": 0.5783296478993026,
"grad_norm": 0.36769798398017883,
"learning_rate": 2.7512544527861597e-05,
"loss": 0.0231,
"step": 13600
},
{
"epoch": 0.5787548902874639,
"grad_norm": 0.4025172293186188,
"learning_rate": 2.7507983118848353e-05,
"loss": 0.0201,
"step": 13610
},
{
"epoch": 0.5791801326756251,
"grad_norm": 0.40505367517471313,
"learning_rate": 2.7503417910240085e-05,
"loss": 0.0205,
"step": 13620
},
{
"epoch": 0.5796053750637864,
"grad_norm": 0.32008427381515503,
"learning_rate": 2.7498848903423593e-05,
"loss": 0.022,
"step": 13630
},
{
"epoch": 0.5800306174519476,
"grad_norm": 0.35272738337516785,
"learning_rate": 2.7494276099786817e-05,
"loss": 0.0226,
"step": 13640
},
{
"epoch": 0.5804558598401088,
"grad_norm": 0.6926178336143494,
"learning_rate": 2.7489699500718854e-05,
"loss": 0.0248,
"step": 13650
},
{
"epoch": 0.5808811022282702,
"grad_norm": 0.4260108470916748,
"learning_rate": 2.7485119107609953e-05,
"loss": 0.021,
"step": 13660
},
{
"epoch": 0.5813063446164314,
"grad_norm": 0.32744520902633667,
"learning_rate": 2.748053492185152e-05,
"loss": 0.021,
"step": 13670
},
{
"epoch": 0.5817315870045926,
"grad_norm": 0.35791370272636414,
"learning_rate": 2.7475946944836103e-05,
"loss": 0.0236,
"step": 13680
},
{
"epoch": 0.5821568293927538,
"grad_norm": 0.3699380159378052,
"learning_rate": 2.7471355177957414e-05,
"loss": 0.0232,
"step": 13690
},
{
"epoch": 0.5825820717809151,
"grad_norm": 0.4098301827907562,
"learning_rate": 2.746675962261031e-05,
"loss": 0.0239,
"step": 13700
},
{
"epoch": 0.5830073141690764,
"grad_norm": 0.4216383099555969,
"learning_rate": 2.74621602801908e-05,
"loss": 0.0228,
"step": 13710
},
{
"epoch": 0.5834325565572376,
"grad_norm": 0.491955041885376,
"learning_rate": 2.745755715209604e-05,
"loss": 0.0206,
"step": 13720
},
{
"epoch": 0.5838577989453989,
"grad_norm": 0.4711659252643585,
"learning_rate": 2.7452950239724337e-05,
"loss": 0.026,
"step": 13730
},
{
"epoch": 0.5842830413335601,
"grad_norm": 0.4391963481903076,
"learning_rate": 2.7448339544475156e-05,
"loss": 0.021,
"step": 13740
},
{
"epoch": 0.5847082837217213,
"grad_norm": 0.5280522108078003,
"learning_rate": 2.7443725067749098e-05,
"loss": 0.025,
"step": 13750
},
{
"epoch": 0.5851335261098827,
"grad_norm": 0.465518981218338,
"learning_rate": 2.7439106810947923e-05,
"loss": 0.0236,
"step": 13760
},
{
"epoch": 0.5855587684980439,
"grad_norm": 0.5963212251663208,
"learning_rate": 2.7434484775474532e-05,
"loss": 0.0208,
"step": 13770
},
{
"epoch": 0.5859840108862051,
"grad_norm": 0.5224639773368835,
"learning_rate": 2.7429858962732984e-05,
"loss": 0.0198,
"step": 13780
},
{
"epoch": 0.5864092532743664,
"grad_norm": 0.6179777979850769,
"learning_rate": 2.7425229374128475e-05,
"loss": 0.0224,
"step": 13790
},
{
"epoch": 0.5868344956625277,
"grad_norm": 0.5582897067070007,
"learning_rate": 2.742059601106735e-05,
"loss": 0.0211,
"step": 13800
},
{
"epoch": 0.5872597380506889,
"grad_norm": 0.4970376789569855,
"learning_rate": 2.74159588749571e-05,
"loss": 0.0199,
"step": 13810
},
{
"epoch": 0.5876849804388501,
"grad_norm": 0.4438394010066986,
"learning_rate": 2.7411317967206373e-05,
"loss": 0.0223,
"step": 13820
},
{
"epoch": 0.5881102228270114,
"grad_norm": 0.40164047479629517,
"learning_rate": 2.7406673289224945e-05,
"loss": 0.0226,
"step": 13830
},
{
"epoch": 0.5885354652151726,
"grad_norm": 0.520200788974762,
"learning_rate": 2.740202484242376e-05,
"loss": 0.0242,
"step": 13840
},
{
"epoch": 0.5889607076033339,
"grad_norm": 0.40304872393608093,
"learning_rate": 2.7397372628214873e-05,
"loss": 0.0208,
"step": 13850
},
{
"epoch": 0.5893859499914952,
"grad_norm": 0.39076563715934753,
"learning_rate": 2.739271664801152e-05,
"loss": 0.0192,
"step": 13860
},
{
"epoch": 0.5898111923796564,
"grad_norm": 0.2574770152568817,
"learning_rate": 2.738805690322806e-05,
"loss": 0.0226,
"step": 13870
},
{
"epoch": 0.5902364347678176,
"grad_norm": 0.39357537031173706,
"learning_rate": 2.7383393395280004e-05,
"loss": 0.0213,
"step": 13880
},
{
"epoch": 0.590661677155979,
"grad_norm": 0.4306516945362091,
"learning_rate": 2.7378726125584e-05,
"loss": 0.0196,
"step": 13890
},
{
"epoch": 0.5910869195441402,
"grad_norm": 0.42869049310684204,
"learning_rate": 2.7374055095557832e-05,
"loss": 0.02,
"step": 13900
},
{
"epoch": 0.5915121619323014,
"grad_norm": 0.41680777072906494,
"learning_rate": 2.7369380306620446e-05,
"loss": 0.0237,
"step": 13910
},
{
"epoch": 0.5919374043204627,
"grad_norm": 0.36180999875068665,
"learning_rate": 2.7364701760191923e-05,
"loss": 0.0202,
"step": 13920
},
{
"epoch": 0.5923626467086239,
"grad_norm": 0.44238200783729553,
"learning_rate": 2.7360019457693468e-05,
"loss": 0.0195,
"step": 13930
},
{
"epoch": 0.5927878890967851,
"grad_norm": 0.3397182524204254,
"learning_rate": 2.7355333400547457e-05,
"loss": 0.02,
"step": 13940
},
{
"epoch": 0.5932131314849464,
"grad_norm": 0.4424580931663513,
"learning_rate": 2.7350643590177378e-05,
"loss": 0.0221,
"step": 13950
},
{
"epoch": 0.5936383738731077,
"grad_norm": 0.4530566334724426,
"learning_rate": 2.7345950028007878e-05,
"loss": 0.022,
"step": 13960
},
{
"epoch": 0.5940636162612689,
"grad_norm": 0.4371461570262909,
"learning_rate": 2.7341252715464736e-05,
"loss": 0.0231,
"step": 13970
},
{
"epoch": 0.5944888586494301,
"grad_norm": 0.41568294167518616,
"learning_rate": 2.7336551653974868e-05,
"loss": 0.0202,
"step": 13980
},
{
"epoch": 0.5949141010375915,
"grad_norm": 0.4857887327671051,
"learning_rate": 2.733184684496634e-05,
"loss": 0.0218,
"step": 13990
},
{
"epoch": 0.5953393434257527,
"grad_norm": 0.32679444551467896,
"learning_rate": 2.7327138289868337e-05,
"loss": 0.0189,
"step": 14000
},
{
"epoch": 0.5957645858139139,
"grad_norm": 0.4400253891944885,
"learning_rate": 2.7322425990111208e-05,
"loss": 0.0224,
"step": 14010
},
{
"epoch": 0.5961898282020752,
"grad_norm": 0.3958609402179718,
"learning_rate": 2.7317709947126416e-05,
"loss": 0.0223,
"step": 14020
},
{
"epoch": 0.5966150705902364,
"grad_norm": 0.3876691162586212,
"learning_rate": 2.731299016234657e-05,
"loss": 0.0222,
"step": 14030
},
{
"epoch": 0.5970403129783977,
"grad_norm": 0.48080649971961975,
"learning_rate": 2.730826663720542e-05,
"loss": 0.0194,
"step": 14040
},
{
"epoch": 0.597465555366559,
"grad_norm": 0.4989674687385559,
"learning_rate": 2.730353937313785e-05,
"loss": 0.0225,
"step": 14050
},
{
"epoch": 0.5978907977547202,
"grad_norm": 0.3855687379837036,
"learning_rate": 2.729880837157987e-05,
"loss": 0.0197,
"step": 14060
},
{
"epoch": 0.5983160401428814,
"grad_norm": 0.3866235315799713,
"learning_rate": 2.729407363396864e-05,
"loss": 0.0175,
"step": 14070
},
{
"epoch": 0.5987412825310426,
"grad_norm": 0.5367085933685303,
"learning_rate": 2.7289335161742444e-05,
"loss": 0.0196,
"step": 14080
},
{
"epoch": 0.599166524919204,
"grad_norm": 0.42395225167274475,
"learning_rate": 2.7284592956340707e-05,
"loss": 0.0193,
"step": 14090
},
{
"epoch": 0.5995917673073652,
"grad_norm": 0.43578681349754333,
"learning_rate": 2.727984701920399e-05,
"loss": 0.0201,
"step": 14100
},
{
"epoch": 0.6000170096955264,
"grad_norm": 0.5050670504570007,
"learning_rate": 2.7275097351773977e-05,
"loss": 0.0218,
"step": 14110
},
{
"epoch": 0.6004422520836877,
"grad_norm": 0.44479116797447205,
"learning_rate": 2.727034395549349e-05,
"loss": 0.0218,
"step": 14120
},
{
"epoch": 0.600867494471849,
"grad_norm": 0.4565848708152771,
"learning_rate": 2.726558683180649e-05,
"loss": 0.022,
"step": 14130
},
{
"epoch": 0.6012927368600102,
"grad_norm": 0.4360315799713135,
"learning_rate": 2.7260825982158067e-05,
"loss": 0.0201,
"step": 14140
},
{
"epoch": 0.6017179792481715,
"grad_norm": 0.361296683549881,
"learning_rate": 2.725606140799444e-05,
"loss": 0.0179,
"step": 14150
},
{
"epoch": 0.6021432216363327,
"grad_norm": 0.40813711285591125,
"learning_rate": 2.7251293110762957e-05,
"loss": 0.0212,
"step": 14160
},
{
"epoch": 0.6025684640244939,
"grad_norm": 0.3440335988998413,
"learning_rate": 2.7246521091912106e-05,
"loss": 0.022,
"step": 14170
},
{
"epoch": 0.6029937064126553,
"grad_norm": 0.40166258811950684,
"learning_rate": 2.7241745352891497e-05,
"loss": 0.0194,
"step": 14180
},
{
"epoch": 0.6034189488008165,
"grad_norm": 0.4059109687805176,
"learning_rate": 2.7236965895151874e-05,
"loss": 0.0219,
"step": 14190
},
{
"epoch": 0.6038441911889777,
"grad_norm": 0.4299823045730591,
"learning_rate": 2.7232182720145105e-05,
"loss": 0.0225,
"step": 14200
},
{
"epoch": 0.6042694335771389,
"grad_norm": 0.5142824053764343,
"learning_rate": 2.7227395829324208e-05,
"loss": 0.0229,
"step": 14210
},
{
"epoch": 0.6046946759653002,
"grad_norm": 0.36144012212753296,
"learning_rate": 2.7222605224143298e-05,
"loss": 0.0222,
"step": 14220
},
{
"epoch": 0.6051199183534615,
"grad_norm": 0.43536025285720825,
"learning_rate": 2.721781090605764e-05,
"loss": 0.0228,
"step": 14230
},
{
"epoch": 0.6055451607416227,
"grad_norm": 0.3937072455883026,
"learning_rate": 2.721301287652362e-05,
"loss": 0.0215,
"step": 14240
},
{
"epoch": 0.605970403129784,
"grad_norm": 0.3862617313861847,
"learning_rate": 2.7208211136998758e-05,
"loss": 0.0223,
"step": 14250
},
{
"epoch": 0.6063956455179452,
"grad_norm": 0.41137275099754333,
"learning_rate": 2.7203405688941693e-05,
"loss": 0.0209,
"step": 14260
},
{
"epoch": 0.6068208879061064,
"grad_norm": 0.46557578444480896,
"learning_rate": 2.719859653381219e-05,
"loss": 0.0221,
"step": 14270
},
{
"epoch": 0.6072461302942678,
"grad_norm": 0.4357418119907379,
"learning_rate": 2.719378367307114e-05,
"loss": 0.0191,
"step": 14280
},
{
"epoch": 0.607671372682429,
"grad_norm": 0.4667029082775116,
"learning_rate": 2.7188967108180574e-05,
"loss": 0.0198,
"step": 14290
},
{
"epoch": 0.6080966150705902,
"grad_norm": 0.3489953577518463,
"learning_rate": 2.7184146840603634e-05,
"loss": 0.0212,
"step": 14300
},
{
"epoch": 0.6085218574587515,
"grad_norm": 0.34787535667419434,
"learning_rate": 2.7179322871804584e-05,
"loss": 0.0231,
"step": 14310
},
{
"epoch": 0.6089470998469128,
"grad_norm": 0.39182159304618835,
"learning_rate": 2.717449520324882e-05,
"loss": 0.0216,
"step": 14320
},
{
"epoch": 0.609372342235074,
"grad_norm": 0.3901592791080475,
"learning_rate": 2.7169663836402864e-05,
"loss": 0.0187,
"step": 14330
},
{
"epoch": 0.6097975846232352,
"grad_norm": 0.35816362500190735,
"learning_rate": 2.7164828772734352e-05,
"loss": 0.022,
"step": 14340
},
{
"epoch": 0.6102228270113965,
"grad_norm": 0.35063478350639343,
"learning_rate": 2.7159990013712057e-05,
"loss": 0.0192,
"step": 14350
},
{
"epoch": 0.6106480693995577,
"grad_norm": 0.42145827412605286,
"learning_rate": 2.7155147560805863e-05,
"loss": 0.0224,
"step": 14360
},
{
"epoch": 0.611073311787719,
"grad_norm": 0.439269095659256,
"learning_rate": 2.7150301415486777e-05,
"loss": 0.0202,
"step": 14370
},
{
"epoch": 0.6114985541758803,
"grad_norm": 0.44516879320144653,
"learning_rate": 2.7145451579226932e-05,
"loss": 0.0207,
"step": 14380
},
{
"epoch": 0.6119237965640415,
"grad_norm": 0.4259718358516693,
"learning_rate": 2.7140598053499577e-05,
"loss": 0.0226,
"step": 14390
},
{
"epoch": 0.6123490389522027,
"grad_norm": 0.44744691252708435,
"learning_rate": 2.7135740839779087e-05,
"loss": 0.02,
"step": 14400
},
{
"epoch": 0.612774281340364,
"grad_norm": 0.5334987640380859,
"learning_rate": 2.7130879939540957e-05,
"loss": 0.0191,
"step": 14410
},
{
"epoch": 0.6131995237285253,
"grad_norm": 0.41623225808143616,
"learning_rate": 2.7126015354261798e-05,
"loss": 0.0186,
"step": 14420
},
{
"epoch": 0.6136247661166865,
"grad_norm": 0.44290414452552795,
"learning_rate": 2.712114708541934e-05,
"loss": 0.0212,
"step": 14430
},
{
"epoch": 0.6140500085048478,
"grad_norm": 0.4709611237049103,
"learning_rate": 2.7116275134492445e-05,
"loss": 0.0226,
"step": 14440
},
{
"epoch": 0.614475250893009,
"grad_norm": 0.46769437193870544,
"learning_rate": 2.7111399502961073e-05,
"loss": 0.0205,
"step": 14450
},
{
"epoch": 0.6149004932811702,
"grad_norm": 0.38859257102012634,
"learning_rate": 2.710652019230631e-05,
"loss": 0.0212,
"step": 14460
},
{
"epoch": 0.6153257356693315,
"grad_norm": 0.3308449685573578,
"learning_rate": 2.710163720401037e-05,
"loss": 0.0206,
"step": 14470
},
{
"epoch": 0.6157509780574928,
"grad_norm": 0.48803281784057617,
"learning_rate": 2.709675053955658e-05,
"loss": 0.0194,
"step": 14480
},
{
"epoch": 0.616176220445654,
"grad_norm": 0.4074990451335907,
"learning_rate": 2.7091860200429362e-05,
"loss": 0.0205,
"step": 14490
},
{
"epoch": 0.6166014628338152,
"grad_norm": 0.3850661814212799,
"learning_rate": 2.7086966188114285e-05,
"loss": 0.0224,
"step": 14500
},
{
"epoch": 0.6170267052219766,
"grad_norm": 0.380643755197525,
"learning_rate": 2.708206850409802e-05,
"loss": 0.0222,
"step": 14510
},
{
"epoch": 0.6174519476101378,
"grad_norm": 0.4678167700767517,
"learning_rate": 2.707716714986835e-05,
"loss": 0.0195,
"step": 14520
},
{
"epoch": 0.617877189998299,
"grad_norm": 0.39859825372695923,
"learning_rate": 2.707226212691418e-05,
"loss": 0.0206,
"step": 14530
},
{
"epoch": 0.6183024323864603,
"grad_norm": 0.4449918568134308,
"learning_rate": 2.7067353436725525e-05,
"loss": 0.0203,
"step": 14540
},
{
"epoch": 0.6187276747746215,
"grad_norm": 0.4654516279697418,
"learning_rate": 2.706244108079352e-05,
"loss": 0.0226,
"step": 14550
},
{
"epoch": 0.6191529171627828,
"grad_norm": 0.4559080898761749,
"learning_rate": 2.7057525060610396e-05,
"loss": 0.0226,
"step": 14560
},
{
"epoch": 0.6195781595509441,
"grad_norm": 0.34623491764068604,
"learning_rate": 2.7052605377669527e-05,
"loss": 0.0205,
"step": 14570
},
{
"epoch": 0.6200034019391053,
"grad_norm": 0.4986266791820526,
"learning_rate": 2.704768203346537e-05,
"loss": 0.0211,
"step": 14580
},
{
"epoch": 0.6204286443272665,
"grad_norm": 0.4883112907409668,
"learning_rate": 2.7042755029493513e-05,
"loss": 0.0218,
"step": 14590
},
{
"epoch": 0.6208538867154277,
"grad_norm": 0.5301306247711182,
"learning_rate": 2.703782436725065e-05,
"loss": 0.0219,
"step": 14600
},
{
"epoch": 0.6212791291035891,
"grad_norm": 0.42371174693107605,
"learning_rate": 2.7032890048234585e-05,
"loss": 0.022,
"step": 14610
},
{
"epoch": 0.6217043714917503,
"grad_norm": 0.5062300562858582,
"learning_rate": 2.702795207394423e-05,
"loss": 0.0199,
"step": 14620
},
{
"epoch": 0.6221296138799115,
"grad_norm": 0.48883020877838135,
"learning_rate": 2.702301044587962e-05,
"loss": 0.0205,
"step": 14630
},
{
"epoch": 0.6225548562680728,
"grad_norm": 0.5089893937110901,
"learning_rate": 2.7018065165541885e-05,
"loss": 0.0206,
"step": 14640
},
{
"epoch": 0.622980098656234,
"grad_norm": 0.48434340953826904,
"learning_rate": 2.701311623443327e-05,
"loss": 0.0211,
"step": 14650
},
{
"epoch": 0.6234053410443953,
"grad_norm": 0.5042490363121033,
"learning_rate": 2.700816365405713e-05,
"loss": 0.0194,
"step": 14660
},
{
"epoch": 0.6238305834325566,
"grad_norm": 0.4644983112812042,
"learning_rate": 2.7003207425917926e-05,
"loss": 0.0216,
"step": 14670
},
{
"epoch": 0.6242558258207178,
"grad_norm": 0.46126359701156616,
"learning_rate": 2.6998247551521238e-05,
"loss": 0.0211,
"step": 14680
},
{
"epoch": 0.624681068208879,
"grad_norm": 0.44964349269866943,
"learning_rate": 2.6993284032373732e-05,
"loss": 0.0212,
"step": 14690
},
{
"epoch": 0.6251063105970404,
"grad_norm": 0.3886130154132843,
"learning_rate": 2.6988316869983208e-05,
"loss": 0.0224,
"step": 14700
},
{
"epoch": 0.6255315529852016,
"grad_norm": 0.40591052174568176,
"learning_rate": 2.698334606585855e-05,
"loss": 0.0205,
"step": 14710
},
{
"epoch": 0.6259567953733628,
"grad_norm": 0.539798378944397,
"learning_rate": 2.6978371621509757e-05,
"loss": 0.0203,
"step": 14720
},
{
"epoch": 0.6263820377615241,
"grad_norm": 0.46444740891456604,
"learning_rate": 2.697339353844794e-05,
"loss": 0.0219,
"step": 14730
},
{
"epoch": 0.6268072801496853,
"grad_norm": 0.3508462607860565,
"learning_rate": 2.6968411818185303e-05,
"loss": 0.0202,
"step": 14740
},
{
"epoch": 0.6272325225378466,
"grad_norm": 0.45676133036613464,
"learning_rate": 2.6963426462235167e-05,
"loss": 0.0199,
"step": 14750
},
{
"epoch": 0.6276577649260078,
"grad_norm": 0.43334293365478516,
"learning_rate": 2.6958437472111947e-05,
"loss": 0.023,
"step": 14760
},
{
"epoch": 0.6280830073141691,
"grad_norm": 0.3671410083770752,
"learning_rate": 2.695344484933116e-05,
"loss": 0.0193,
"step": 14770
},
{
"epoch": 0.6285082497023303,
"grad_norm": 0.38691452145576477,
"learning_rate": 2.6948448595409443e-05,
"loss": 0.0205,
"step": 14780
},
{
"epoch": 0.6289334920904915,
"grad_norm": 0.4143933355808258,
"learning_rate": 2.694344871186453e-05,
"loss": 0.0201,
"step": 14790
},
{
"epoch": 0.6293587344786529,
"grad_norm": 0.27843642234802246,
"learning_rate": 2.6938445200215237e-05,
"loss": 0.0214,
"step": 14800
},
{
"epoch": 0.6297839768668141,
"grad_norm": 0.33850619196891785,
"learning_rate": 2.6933438061981513e-05,
"loss": 0.0198,
"step": 14810
},
{
"epoch": 0.6302092192549753,
"grad_norm": 0.3694206178188324,
"learning_rate": 2.6928427298684395e-05,
"loss": 0.02,
"step": 14820
},
{
"epoch": 0.6306344616431366,
"grad_norm": 0.37032657861709595,
"learning_rate": 2.6923412911846008e-05,
"loss": 0.0208,
"step": 14830
},
{
"epoch": 0.6310597040312979,
"grad_norm": 0.4800131022930145,
"learning_rate": 2.6918394902989604e-05,
"loss": 0.0209,
"step": 14840
},
{
"epoch": 0.6314849464194591,
"grad_norm": 0.4037657082080841,
"learning_rate": 2.6913373273639508e-05,
"loss": 0.0202,
"step": 14850
},
{
"epoch": 0.6319101888076204,
"grad_norm": 0.40070000290870667,
"learning_rate": 2.690834802532117e-05,
"loss": 0.0198,
"step": 14860
},
{
"epoch": 0.6323354311957816,
"grad_norm": 0.4060189425945282,
"learning_rate": 2.6903319159561125e-05,
"loss": 0.0219,
"step": 14870
},
{
"epoch": 0.6327606735839428,
"grad_norm": 0.4039824903011322,
"learning_rate": 2.6898286677887005e-05,
"loss": 0.0223,
"step": 14880
},
{
"epoch": 0.633185915972104,
"grad_norm": 0.40134313702583313,
"learning_rate": 2.689325058182755e-05,
"loss": 0.0209,
"step": 14890
},
{
"epoch": 0.6336111583602654,
"grad_norm": 0.45482826232910156,
"learning_rate": 2.6888210872912594e-05,
"loss": 0.0229,
"step": 14900
},
{
"epoch": 0.6340364007484266,
"grad_norm": 0.3572455048561096,
"learning_rate": 2.688316755267306e-05,
"loss": 0.0221,
"step": 14910
},
{
"epoch": 0.6344616431365878,
"grad_norm": 0.3394013047218323,
"learning_rate": 2.6878120622640988e-05,
"loss": 0.0213,
"step": 14920
},
{
"epoch": 0.6348868855247491,
"grad_norm": 0.37122642993927,
"learning_rate": 2.6873070084349486e-05,
"loss": 0.0216,
"step": 14930
},
{
"epoch": 0.6353121279129104,
"grad_norm": 0.3685668408870697,
"learning_rate": 2.6868015939332788e-05,
"loss": 0.0212,
"step": 14940
},
{
"epoch": 0.6357373703010716,
"grad_norm": 0.3652264475822449,
"learning_rate": 2.68629581891262e-05,
"loss": 0.0215,
"step": 14950
},
{
"epoch": 0.6361626126892329,
"grad_norm": 0.3909711539745331,
"learning_rate": 2.685789683526614e-05,
"loss": 0.0186,
"step": 14960
},
{
"epoch": 0.6365878550773941,
"grad_norm": 0.5734156370162964,
"learning_rate": 2.685283187929011e-05,
"loss": 0.0217,
"step": 14970
},
{
"epoch": 0.6370130974655553,
"grad_norm": 0.643765926361084,
"learning_rate": 2.6847763322736713e-05,
"loss": 0.0213,
"step": 14980
},
{
"epoch": 0.6374383398537167,
"grad_norm": 0.3826221823692322,
"learning_rate": 2.684269116714564e-05,
"loss": 0.0195,
"step": 14990
},
{
"epoch": 0.6378635822418779,
"grad_norm": 0.4413127303123474,
"learning_rate": 2.6837615414057677e-05,
"loss": 0.0205,
"step": 15000
},
{
"epoch": 0.6382888246300391,
"grad_norm": 0.5099561810493469,
"learning_rate": 2.6832536065014708e-05,
"loss": 0.0225,
"step": 15010
},
{
"epoch": 0.6387140670182003,
"grad_norm": 0.40210336446762085,
"learning_rate": 2.68274531215597e-05,
"loss": 0.0226,
"step": 15020
},
{
"epoch": 0.6391393094063617,
"grad_norm": 0.3939429521560669,
"learning_rate": 2.6822366585236716e-05,
"loss": 0.0194,
"step": 15030
},
{
"epoch": 0.6395645517945229,
"grad_norm": 0.33854517340660095,
"learning_rate": 2.6817276457590924e-05,
"loss": 0.0192,
"step": 15040
},
{
"epoch": 0.6399897941826841,
"grad_norm": 0.3618461787700653,
"learning_rate": 2.6812182740168555e-05,
"loss": 0.0211,
"step": 15050
},
{
"epoch": 0.6404150365708454,
"grad_norm": 0.35461387038230896,
"learning_rate": 2.6807085434516953e-05,
"loss": 0.0208,
"step": 15060
},
{
"epoch": 0.6408402789590066,
"grad_norm": 0.4645833373069763,
"learning_rate": 2.6801984542184544e-05,
"loss": 0.0229,
"step": 15070
},
{
"epoch": 0.6412655213471679,
"grad_norm": 0.40264660120010376,
"learning_rate": 2.6796880064720845e-05,
"loss": 0.0234,
"step": 15080
},
{
"epoch": 0.6416907637353292,
"grad_norm": 0.3904982805252075,
"learning_rate": 2.6791772003676462e-05,
"loss": 0.0235,
"step": 15090
},
{
"epoch": 0.6421160061234904,
"grad_norm": 0.4172634780406952,
"learning_rate": 2.6786660360603087e-05,
"loss": 0.0213,
"step": 15100
},
{
"epoch": 0.6425412485116516,
"grad_norm": 0.5049511790275574,
"learning_rate": 2.6781545137053503e-05,
"loss": 0.0229,
"step": 15110
},
{
"epoch": 0.642966490899813,
"grad_norm": 0.4754362106323242,
"learning_rate": 2.677642633458158e-05,
"loss": 0.0193,
"step": 15120
},
{
"epoch": 0.6433917332879742,
"grad_norm": 0.36239519715309143,
"learning_rate": 2.6771303954742274e-05,
"loss": 0.0206,
"step": 15130
},
{
"epoch": 0.6438169756761354,
"grad_norm": 0.4245750308036804,
"learning_rate": 2.6766177999091633e-05,
"loss": 0.0214,
"step": 15140
},
{
"epoch": 0.6442422180642966,
"grad_norm": 0.3785336911678314,
"learning_rate": 2.676104846918678e-05,
"loss": 0.0199,
"step": 15150
},
{
"epoch": 0.6446674604524579,
"grad_norm": 0.4175279140472412,
"learning_rate": 2.675591536658594e-05,
"loss": 0.0212,
"step": 15160
},
{
"epoch": 0.6450927028406191,
"grad_norm": 0.38691219687461853,
"learning_rate": 2.6750778692848405e-05,
"loss": 0.0216,
"step": 15170
},
{
"epoch": 0.6455179452287804,
"grad_norm": 0.49059873819351196,
"learning_rate": 2.6745638449534562e-05,
"loss": 0.0207,
"step": 15180
},
{
"epoch": 0.6459431876169417,
"grad_norm": 0.4424341320991516,
"learning_rate": 2.6740494638205888e-05,
"loss": 0.0238,
"step": 15190
},
{
"epoch": 0.6463684300051029,
"grad_norm": 0.3581426441669464,
"learning_rate": 2.6735347260424928e-05,
"loss": 0.0202,
"step": 15200
},
{
"epoch": 0.6467936723932641,
"grad_norm": 0.3555155098438263,
"learning_rate": 2.6730196317755326e-05,
"loss": 0.0197,
"step": 15210
},
{
"epoch": 0.6472189147814255,
"grad_norm": 0.33948877453804016,
"learning_rate": 2.6725041811761805e-05,
"loss": 0.0203,
"step": 15220
},
{
"epoch": 0.6476441571695867,
"grad_norm": 0.38188841938972473,
"learning_rate": 2.671988374401016e-05,
"loss": 0.0211,
"step": 15230
},
{
"epoch": 0.6480693995577479,
"grad_norm": 0.33603012561798096,
"learning_rate": 2.671472211606728e-05,
"loss": 0.0185,
"step": 15240
},
{
"epoch": 0.6484946419459092,
"grad_norm": 0.5087174773216248,
"learning_rate": 2.6709556929501128e-05,
"loss": 0.0212,
"step": 15250
},
{
"epoch": 0.6489198843340704,
"grad_norm": 0.3816879987716675,
"learning_rate": 2.670438818588076e-05,
"loss": 0.0208,
"step": 15260
},
{
"epoch": 0.6493451267222317,
"grad_norm": 0.4196940064430237,
"learning_rate": 2.6699215886776287e-05,
"loss": 0.0233,
"step": 15270
},
{
"epoch": 0.6497703691103929,
"grad_norm": 0.42934325337409973,
"learning_rate": 2.6694040033758933e-05,
"loss": 0.0217,
"step": 15280
},
{
"epoch": 0.6501956114985542,
"grad_norm": 0.49663063883781433,
"learning_rate": 2.6688860628400982e-05,
"loss": 0.0218,
"step": 15290
},
{
"epoch": 0.6506208538867154,
"grad_norm": 0.3750404417514801,
"learning_rate": 2.6683677672275797e-05,
"loss": 0.0212,
"step": 15300
},
{
"epoch": 0.6510460962748766,
"grad_norm": 0.4539925158023834,
"learning_rate": 2.6678491166957825e-05,
"loss": 0.0199,
"step": 15310
},
{
"epoch": 0.651471338663038,
"grad_norm": 0.32169193029403687,
"learning_rate": 2.6673301114022592e-05,
"loss": 0.019,
"step": 15320
},
{
"epoch": 0.6518965810511992,
"grad_norm": 0.5332807302474976,
"learning_rate": 2.666810751504669e-05,
"loss": 0.022,
"step": 15330
},
{
"epoch": 0.6523218234393604,
"grad_norm": 0.39766547083854675,
"learning_rate": 2.6662910371607807e-05,
"loss": 0.02,
"step": 15340
},
{
"epoch": 0.6527470658275217,
"grad_norm": 0.32298263907432556,
"learning_rate": 2.6657709685284696e-05,
"loss": 0.021,
"step": 15350
},
{
"epoch": 0.653172308215683,
"grad_norm": 0.4446607232093811,
"learning_rate": 2.6652505457657183e-05,
"loss": 0.0213,
"step": 15360
},
{
"epoch": 0.6535975506038442,
"grad_norm": 0.36240512132644653,
"learning_rate": 2.664729769030618e-05,
"loss": 0.0212,
"step": 15370
},
{
"epoch": 0.6540227929920055,
"grad_norm": 0.31035521626472473,
"learning_rate": 2.6642086384813667e-05,
"loss": 0.0193,
"step": 15380
},
{
"epoch": 0.6544480353801667,
"grad_norm": 0.3434225618839264,
"learning_rate": 2.6636871542762703e-05,
"loss": 0.0219,
"step": 15390
},
{
"epoch": 0.6548732777683279,
"grad_norm": 0.37940528988838196,
"learning_rate": 2.6631653165737418e-05,
"loss": 0.021,
"step": 15400
},
{
"epoch": 0.6552985201564892,
"grad_norm": 0.4367035925388336,
"learning_rate": 2.662643125532302e-05,
"loss": 0.0175,
"step": 15410
},
{
"epoch": 0.6557237625446505,
"grad_norm": 0.3646489083766937,
"learning_rate": 2.6621205813105777e-05,
"loss": 0.0235,
"step": 15420
},
{
"epoch": 0.6561490049328117,
"grad_norm": 0.3998994827270508,
"learning_rate": 2.6615976840673056e-05,
"loss": 0.0204,
"step": 15430
},
{
"epoch": 0.6565742473209729,
"grad_norm": 0.510734498500824,
"learning_rate": 2.6610744339613265e-05,
"loss": 0.0239,
"step": 15440
},
{
"epoch": 0.6569994897091342,
"grad_norm": 0.40119668841362,
"learning_rate": 2.6605508311515916e-05,
"loss": 0.0179,
"step": 15450
},
{
"epoch": 0.6574247320972955,
"grad_norm": 0.36183375120162964,
"learning_rate": 2.6600268757971566e-05,
"loss": 0.0197,
"step": 15460
},
{
"epoch": 0.6578499744854567,
"grad_norm": 0.32286256551742554,
"learning_rate": 2.659502568057185e-05,
"loss": 0.0177,
"step": 15470
},
{
"epoch": 0.658275216873618,
"grad_norm": 0.3489496111869812,
"learning_rate": 2.658977908090949e-05,
"loss": 0.0211,
"step": 15480
},
{
"epoch": 0.6587004592617792,
"grad_norm": 0.35360145568847656,
"learning_rate": 2.6584528960578245e-05,
"loss": 0.0211,
"step": 15490
},
{
"epoch": 0.6591257016499404,
"grad_norm": 0.3279133439064026,
"learning_rate": 2.657927532117298e-05,
"loss": 0.0224,
"step": 15500
},
{
"epoch": 0.6595509440381018,
"grad_norm": 0.5307937860488892,
"learning_rate": 2.6574018164289605e-05,
"loss": 0.0206,
"step": 15510
},
{
"epoch": 0.659976186426263,
"grad_norm": 0.44229283928871155,
"learning_rate": 2.6568757491525103e-05,
"loss": 0.0212,
"step": 15520
},
{
"epoch": 0.6604014288144242,
"grad_norm": 0.4387955963611603,
"learning_rate": 2.6563493304477534e-05,
"loss": 0.0195,
"step": 15530
},
{
"epoch": 0.6608266712025854,
"grad_norm": 0.35147640109062195,
"learning_rate": 2.655822560474601e-05,
"loss": 0.0197,
"step": 15540
},
{
"epoch": 0.6612519135907468,
"grad_norm": 0.415713369846344,
"learning_rate": 2.655295439393073e-05,
"loss": 0.0202,
"step": 15550
},
{
"epoch": 0.661677155978908,
"grad_norm": 0.4200793504714966,
"learning_rate": 2.654767967363294e-05,
"loss": 0.0226,
"step": 15560
},
{
"epoch": 0.6621023983670692,
"grad_norm": 0.42018580436706543,
"learning_rate": 2.6542401445454967e-05,
"loss": 0.0215,
"step": 15570
},
{
"epoch": 0.6625276407552305,
"grad_norm": 0.4219094514846802,
"learning_rate": 2.6537119711000187e-05,
"loss": 0.0233,
"step": 15580
},
{
"epoch": 0.6629528831433917,
"grad_norm": 0.5176847577095032,
"learning_rate": 2.6531834471873068e-05,
"loss": 0.0192,
"step": 15590
},
{
"epoch": 0.663378125531553,
"grad_norm": 0.3487168550491333,
"learning_rate": 2.6526545729679115e-05,
"loss": 0.0204,
"step": 15600
},
{
"epoch": 0.6638033679197143,
"grad_norm": 0.4768315851688385,
"learning_rate": 2.6521253486024908e-05,
"loss": 0.0217,
"step": 15610
},
{
"epoch": 0.6642286103078755,
"grad_norm": 0.3240163326263428,
"learning_rate": 2.651595774251809e-05,
"loss": 0.0188,
"step": 15620
},
{
"epoch": 0.6646538526960367,
"grad_norm": 0.3336798846721649,
"learning_rate": 2.6510658500767378e-05,
"loss": 0.0191,
"step": 15630
},
{
"epoch": 0.665079095084198,
"grad_norm": 0.46676674485206604,
"learning_rate": 2.650535576238253e-05,
"loss": 0.0198,
"step": 15640
},
{
"epoch": 0.6655043374723593,
"grad_norm": 0.4120013117790222,
"learning_rate": 2.650004952897438e-05,
"loss": 0.0204,
"step": 15650
},
{
"epoch": 0.6659295798605205,
"grad_norm": 0.36168473958969116,
"learning_rate": 2.649473980215483e-05,
"loss": 0.02,
"step": 15660
},
{
"epoch": 0.6663548222486817,
"grad_norm": 0.34925559163093567,
"learning_rate": 2.648942658353683e-05,
"loss": 0.0192,
"step": 15670
},
{
"epoch": 0.666780064636843,
"grad_norm": 0.31256768107414246,
"learning_rate": 2.648410987473439e-05,
"loss": 0.0186,
"step": 15680
},
{
"epoch": 0.6672053070250042,
"grad_norm": 0.37509337067604065,
"learning_rate": 2.6478789677362596e-05,
"loss": 0.019,
"step": 15690
},
{
"epoch": 0.6676305494131655,
"grad_norm": 0.29366612434387207,
"learning_rate": 2.6473465993037572e-05,
"loss": 0.0191,
"step": 15700
},
{
"epoch": 0.6680557918013268,
"grad_norm": 0.3554866909980774,
"learning_rate": 2.6468138823376522e-05,
"loss": 0.0207,
"step": 15710
},
{
"epoch": 0.668481034189488,
"grad_norm": 0.39348745346069336,
"learning_rate": 2.6462808169997695e-05,
"loss": 0.0205,
"step": 15720
},
{
"epoch": 0.6689062765776492,
"grad_norm": 0.2861650288105011,
"learning_rate": 2.6457474034520402e-05,
"loss": 0.0203,
"step": 15730
},
{
"epoch": 0.6693315189658106,
"grad_norm": 0.4072084426879883,
"learning_rate": 2.6452136418565016e-05,
"loss": 0.0187,
"step": 15740
},
{
"epoch": 0.6697567613539718,
"grad_norm": 0.3620724678039551,
"learning_rate": 2.6446795323752962e-05,
"loss": 0.0216,
"step": 15750
},
{
"epoch": 0.670182003742133,
"grad_norm": 0.3886725902557373,
"learning_rate": 2.6441450751706725e-05,
"loss": 0.0194,
"step": 15760
},
{
"epoch": 0.6706072461302943,
"grad_norm": 0.41271156072616577,
"learning_rate": 2.6436102704049843e-05,
"loss": 0.021,
"step": 15770
},
{
"epoch": 0.6710324885184555,
"grad_norm": 0.4042252004146576,
"learning_rate": 2.6430751182406913e-05,
"loss": 0.0205,
"step": 15780
},
{
"epoch": 0.6714577309066168,
"grad_norm": 0.4653618335723877,
"learning_rate": 2.6425396188403585e-05,
"loss": 0.0229,
"step": 15790
},
{
"epoch": 0.671882973294778,
"grad_norm": 0.5367754697799683,
"learning_rate": 2.6420037723666565e-05,
"loss": 0.0233,
"step": 15800
},
{
"epoch": 0.6723082156829393,
"grad_norm": 0.4355956017971039,
"learning_rate": 2.641467578982361e-05,
"loss": 0.0197,
"step": 15810
},
{
"epoch": 0.6727334580711005,
"grad_norm": 0.4350462853908539,
"learning_rate": 2.6409310388503542e-05,
"loss": 0.0198,
"step": 15820
},
{
"epoch": 0.6731587004592617,
"grad_norm": 0.3870401680469513,
"learning_rate": 2.640394152133622e-05,
"loss": 0.0206,
"step": 15830
},
{
"epoch": 0.6735839428474231,
"grad_norm": 0.3935236632823944,
"learning_rate": 2.6398569189952573e-05,
"loss": 0.0194,
"step": 15840
},
{
"epoch": 0.6740091852355843,
"grad_norm": 0.4127284288406372,
"learning_rate": 2.6393193395984563e-05,
"loss": 0.02,
"step": 15850
},
{
"epoch": 0.6744344276237455,
"grad_norm": 0.4417366683483124,
"learning_rate": 2.6387814141065216e-05,
"loss": 0.0222,
"step": 15860
},
{
"epoch": 0.6748596700119068,
"grad_norm": 0.4162922203540802,
"learning_rate": 2.6382431426828615e-05,
"loss": 0.0185,
"step": 15870
},
{
"epoch": 0.675284912400068,
"grad_norm": 0.41200003027915955,
"learning_rate": 2.6377045254909885e-05,
"loss": 0.0207,
"step": 15880
},
{
"epoch": 0.6757101547882293,
"grad_norm": 0.4270523488521576,
"learning_rate": 2.6371655626945196e-05,
"loss": 0.0221,
"step": 15890
},
{
"epoch": 0.6761353971763906,
"grad_norm": 0.41679078340530396,
"learning_rate": 2.6366262544571778e-05,
"loss": 0.0198,
"step": 15900
},
{
"epoch": 0.6765606395645518,
"grad_norm": 0.4529988765716553,
"learning_rate": 2.636086600942791e-05,
"loss": 0.0191,
"step": 15910
},
{
"epoch": 0.676985881952713,
"grad_norm": 0.45074957609176636,
"learning_rate": 2.6355466023152913e-05,
"loss": 0.021,
"step": 15920
},
{
"epoch": 0.6774111243408742,
"grad_norm": 0.39274510741233826,
"learning_rate": 2.6350062587387164e-05,
"loss": 0.02,
"step": 15930
},
{
"epoch": 0.6778363667290356,
"grad_norm": 0.36425289511680603,
"learning_rate": 2.634465570377208e-05,
"loss": 0.0192,
"step": 15940
},
{
"epoch": 0.6782616091171968,
"grad_norm": 0.3857857882976532,
"learning_rate": 2.633924537395013e-05,
"loss": 0.02,
"step": 15950
},
{
"epoch": 0.678686851505358,
"grad_norm": 0.3799647092819214,
"learning_rate": 2.6333831599564835e-05,
"loss": 0.0183,
"step": 15960
},
{
"epoch": 0.6791120938935193,
"grad_norm": 0.42903515696525574,
"learning_rate": 2.632841438226075e-05,
"loss": 0.0221,
"step": 15970
},
{
"epoch": 0.6795373362816806,
"grad_norm": 0.5024873614311218,
"learning_rate": 2.6322993723683486e-05,
"loss": 0.021,
"step": 15980
},
{
"epoch": 0.6799625786698418,
"grad_norm": 0.435470849275589,
"learning_rate": 2.6317569625479695e-05,
"loss": 0.0208,
"step": 15990
},
{
"epoch": 0.6803878210580031,
"grad_norm": 0.43367549777030945,
"learning_rate": 2.6312142089297076e-05,
"loss": 0.0206,
"step": 16000
},
{
"epoch": 0.6808130634461643,
"grad_norm": 0.38560950756073,
"learning_rate": 2.6306711116784366e-05,
"loss": 0.0201,
"step": 16010
},
{
"epoch": 0.6812383058343255,
"grad_norm": 0.4164743423461914,
"learning_rate": 2.6301276709591358e-05,
"loss": 0.0193,
"step": 16020
},
{
"epoch": 0.6816635482224869,
"grad_norm": 0.36647531390190125,
"learning_rate": 2.6295838869368878e-05,
"loss": 0.0191,
"step": 16030
},
{
"epoch": 0.6820887906106481,
"grad_norm": 0.34206658601760864,
"learning_rate": 2.6290397597768803e-05,
"loss": 0.0189,
"step": 16040
},
{
"epoch": 0.6825140329988093,
"grad_norm": 0.40263527631759644,
"learning_rate": 2.6284952896444037e-05,
"loss": 0.0187,
"step": 16050
},
{
"epoch": 0.6829392753869705,
"grad_norm": 0.4216151237487793,
"learning_rate": 2.6279504767048547e-05,
"loss": 0.0211,
"step": 16060
},
{
"epoch": 0.6833645177751319,
"grad_norm": 0.41514572501182556,
"learning_rate": 2.6274053211237323e-05,
"loss": 0.0182,
"step": 16070
},
{
"epoch": 0.6837897601632931,
"grad_norm": 0.3836633861064911,
"learning_rate": 2.626859823066641e-05,
"loss": 0.0196,
"step": 16080
},
{
"epoch": 0.6842150025514543,
"grad_norm": 0.39318519830703735,
"learning_rate": 2.6263139826992886e-05,
"loss": 0.0237,
"step": 16090
},
{
"epoch": 0.6846402449396156,
"grad_norm": 0.42578408122062683,
"learning_rate": 2.625767800187487e-05,
"loss": 0.0172,
"step": 16100
},
{
"epoch": 0.6850654873277768,
"grad_norm": 0.3923208713531494,
"learning_rate": 2.6252212756971514e-05,
"loss": 0.0187,
"step": 16110
},
{
"epoch": 0.685490729715938,
"grad_norm": 0.3883177936077118,
"learning_rate": 2.6246744093943022e-05,
"loss": 0.0202,
"step": 16120
},
{
"epoch": 0.6859159721040994,
"grad_norm": 0.42425307631492615,
"learning_rate": 2.6241272014450625e-05,
"loss": 0.0195,
"step": 16130
},
{
"epoch": 0.6863412144922606,
"grad_norm": 0.3335045576095581,
"learning_rate": 2.6235796520156603e-05,
"loss": 0.0203,
"step": 16140
},
{
"epoch": 0.6867664568804218,
"grad_norm": 0.3852084279060364,
"learning_rate": 2.623031761272426e-05,
"loss": 0.0201,
"step": 16150
},
{
"epoch": 0.6871916992685831,
"grad_norm": 0.34301063418388367,
"learning_rate": 2.6224835293817948e-05,
"loss": 0.0189,
"step": 16160
},
{
"epoch": 0.6876169416567444,
"grad_norm": 0.401050329208374,
"learning_rate": 2.6219349565103044e-05,
"loss": 0.0199,
"step": 16170
},
{
"epoch": 0.6880421840449056,
"grad_norm": 0.44773104786872864,
"learning_rate": 2.6213860428245974e-05,
"loss": 0.0182,
"step": 16180
},
{
"epoch": 0.6884674264330668,
"grad_norm": 0.37949037551879883,
"learning_rate": 2.620836788491419e-05,
"loss": 0.0203,
"step": 16190
},
{
"epoch": 0.6888926688212281,
"grad_norm": 0.47246506810188293,
"learning_rate": 2.6202871936776183e-05,
"loss": 0.0207,
"step": 16200
},
{
"epoch": 0.6893179112093893,
"grad_norm": 0.44393685460090637,
"learning_rate": 2.6197372585501477e-05,
"loss": 0.0186,
"step": 16210
},
{
"epoch": 0.6897431535975506,
"grad_norm": 0.42799538373947144,
"learning_rate": 2.6191869832760626e-05,
"loss": 0.0196,
"step": 16220
},
{
"epoch": 0.6901683959857119,
"grad_norm": 0.39829087257385254,
"learning_rate": 2.618636368022523e-05,
"loss": 0.02,
"step": 16230
},
{
"epoch": 0.6905936383738731,
"grad_norm": 0.39561641216278076,
"learning_rate": 2.6180854129567902e-05,
"loss": 0.0178,
"step": 16240
},
{
"epoch": 0.6910188807620343,
"grad_norm": 0.4070194661617279,
"learning_rate": 2.6175341182462303e-05,
"loss": 0.0207,
"step": 16250
},
{
"epoch": 0.6914441231501957,
"grad_norm": 0.39801710844039917,
"learning_rate": 2.6169824840583124e-05,
"loss": 0.0185,
"step": 16260
},
{
"epoch": 0.6918693655383569,
"grad_norm": 0.3587666153907776,
"learning_rate": 2.6164305105606076e-05,
"loss": 0.0209,
"step": 16270
},
{
"epoch": 0.6922946079265181,
"grad_norm": 0.3414198160171509,
"learning_rate": 2.6158781979207918e-05,
"loss": 0.0196,
"step": 16280
},
{
"epoch": 0.6927198503146794,
"grad_norm": 0.372044175863266,
"learning_rate": 2.615325546306642e-05,
"loss": 0.0233,
"step": 16290
},
{
"epoch": 0.6931450927028406,
"grad_norm": 0.330537348985672,
"learning_rate": 2.61477255588604e-05,
"loss": 0.0213,
"step": 16300
},
{
"epoch": 0.6935703350910019,
"grad_norm": 0.3763916492462158,
"learning_rate": 2.614219226826969e-05,
"loss": 0.0196,
"step": 16310
},
{
"epoch": 0.6939955774791631,
"grad_norm": 0.4111920893192291,
"learning_rate": 2.6136655592975166e-05,
"loss": 0.0207,
"step": 16320
},
{
"epoch": 0.6944208198673244,
"grad_norm": 0.38545480370521545,
"learning_rate": 2.613111553465872e-05,
"loss": 0.0215,
"step": 16330
},
{
"epoch": 0.6948460622554856,
"grad_norm": 0.43266260623931885,
"learning_rate": 2.6125572095003276e-05,
"loss": 0.0191,
"step": 16340
},
{
"epoch": 0.6952713046436468,
"grad_norm": 0.4159199297428131,
"learning_rate": 2.6120025275692777e-05,
"loss": 0.0183,
"step": 16350
},
{
"epoch": 0.6956965470318082,
"grad_norm": 0.3651190996170044,
"learning_rate": 2.6114475078412212e-05,
"loss": 0.02,
"step": 16360
},
{
"epoch": 0.6961217894199694,
"grad_norm": 0.32506754994392395,
"learning_rate": 2.6108921504847575e-05,
"loss": 0.0177,
"step": 16370
},
{
"epoch": 0.6965470318081306,
"grad_norm": 0.413657009601593,
"learning_rate": 2.6103364556685902e-05,
"loss": 0.018,
"step": 16380
},
{
"epoch": 0.6969722741962919,
"grad_norm": 0.36810994148254395,
"learning_rate": 2.6097804235615242e-05,
"loss": 0.0208,
"step": 16390
},
{
"epoch": 0.6973975165844531,
"grad_norm": 0.5319944024085999,
"learning_rate": 2.6092240543324676e-05,
"loss": 0.02,
"step": 16400
},
{
"epoch": 0.6978227589726144,
"grad_norm": 0.3020700216293335,
"learning_rate": 2.6086673481504303e-05,
"loss": 0.0184,
"step": 16410
},
{
"epoch": 0.6982480013607757,
"grad_norm": 0.3940604031085968,
"learning_rate": 2.608110305184526e-05,
"loss": 0.0191,
"step": 16420
},
{
"epoch": 0.6986732437489369,
"grad_norm": 0.2802399694919586,
"learning_rate": 2.6075529256039687e-05,
"loss": 0.0207,
"step": 16430
},
{
"epoch": 0.6990984861370981,
"grad_norm": 0.29645195603370667,
"learning_rate": 2.6069952095780756e-05,
"loss": 0.0178,
"step": 16440
},
{
"epoch": 0.6995237285252593,
"grad_norm": 0.2838857173919678,
"learning_rate": 2.6064371572762667e-05,
"loss": 0.0193,
"step": 16450
},
{
"epoch": 0.6999489709134207,
"grad_norm": 0.38183513283729553,
"learning_rate": 2.605878768868063e-05,
"loss": 0.0178,
"step": 16460
},
{
"epoch": 0.7003742133015819,
"grad_norm": 0.4492323696613312,
"learning_rate": 2.605320044523088e-05,
"loss": 0.018,
"step": 16470
},
{
"epoch": 0.7007994556897431,
"grad_norm": 0.5879418849945068,
"learning_rate": 2.6047609844110683e-05,
"loss": 0.0213,
"step": 16480
},
{
"epoch": 0.7012246980779044,
"grad_norm": 0.5019773840904236,
"learning_rate": 2.604201588701831e-05,
"loss": 0.0232,
"step": 16490
},
{
"epoch": 0.7016499404660657,
"grad_norm": 0.3990536034107208,
"learning_rate": 2.6036418575653057e-05,
"loss": 0.0175,
"step": 16500
},
{
"epoch": 0.7020751828542269,
"grad_norm": 0.4245409369468689,
"learning_rate": 2.603081791171524e-05,
"loss": 0.0191,
"step": 16510
},
{
"epoch": 0.7025004252423882,
"grad_norm": 0.456036239862442,
"learning_rate": 2.602521389690619e-05,
"loss": 0.0181,
"step": 16520
},
{
"epoch": 0.7029256676305494,
"grad_norm": 0.47955575585365295,
"learning_rate": 2.601960653292827e-05,
"loss": 0.0184,
"step": 16530
},
{
"epoch": 0.7033509100187106,
"grad_norm": 0.38320642709732056,
"learning_rate": 2.6013995821484833e-05,
"loss": 0.019,
"step": 16540
},
{
"epoch": 0.703776152406872,
"grad_norm": 0.3216317594051361,
"learning_rate": 2.600838176428028e-05,
"loss": 0.0189,
"step": 16550
},
{
"epoch": 0.7042013947950332,
"grad_norm": 0.3859560489654541,
"learning_rate": 2.6002764363020004e-05,
"loss": 0.0177,
"step": 16560
},
{
"epoch": 0.7046266371831944,
"grad_norm": 0.5185313820838928,
"learning_rate": 2.5997143619410427e-05,
"loss": 0.0192,
"step": 16570
},
{
"epoch": 0.7050518795713556,
"grad_norm": 0.3257139027118683,
"learning_rate": 2.5991519535158976e-05,
"loss": 0.0188,
"step": 16580
},
{
"epoch": 0.705477121959517,
"grad_norm": 0.3514297902584076,
"learning_rate": 2.598589211197411e-05,
"loss": 0.0199,
"step": 16590
},
{
"epoch": 0.7059023643476782,
"grad_norm": 0.33370789885520935,
"learning_rate": 2.598026135156528e-05,
"loss": 0.0204,
"step": 16600
},
{
"epoch": 0.7063276067358394,
"grad_norm": 0.41825467348098755,
"learning_rate": 2.5974627255642974e-05,
"loss": 0.0185,
"step": 16610
},
{
"epoch": 0.7067528491240007,
"grad_norm": 0.34714779257774353,
"learning_rate": 2.5968989825918674e-05,
"loss": 0.0184,
"step": 16620
},
{
"epoch": 0.7071780915121619,
"grad_norm": 0.3500387370586395,
"learning_rate": 2.5963349064104887e-05,
"loss": 0.0197,
"step": 16630
},
{
"epoch": 0.7076033339003232,
"grad_norm": 0.5644680261611938,
"learning_rate": 2.5957704971915116e-05,
"loss": 0.0209,
"step": 16640
},
{
"epoch": 0.7080285762884845,
"grad_norm": 0.46666523814201355,
"learning_rate": 2.5952057551063905e-05,
"loss": 0.0193,
"step": 16650
},
{
"epoch": 0.7084538186766457,
"grad_norm": 0.41835102438926697,
"learning_rate": 2.594640680326678e-05,
"loss": 0.0192,
"step": 16660
},
{
"epoch": 0.7088790610648069,
"grad_norm": 0.5231054425239563,
"learning_rate": 2.594075273024029e-05,
"loss": 0.02,
"step": 16670
},
{
"epoch": 0.7093043034529682,
"grad_norm": 0.4124384820461273,
"learning_rate": 2.5935095333701994e-05,
"loss": 0.0204,
"step": 16680
},
{
"epoch": 0.7097295458411295,
"grad_norm": 0.297749787569046,
"learning_rate": 2.5929434615370462e-05,
"loss": 0.0235,
"step": 16690
},
{
"epoch": 0.7101547882292907,
"grad_norm": 0.3325209319591522,
"learning_rate": 2.592377057696527e-05,
"loss": 0.0191,
"step": 16700
},
{
"epoch": 0.7105800306174519,
"grad_norm": 0.3475107252597809,
"learning_rate": 2.5918103220206997e-05,
"loss": 0.0167,
"step": 16710
},
{
"epoch": 0.7110052730056132,
"grad_norm": 0.36572086811065674,
"learning_rate": 2.5912432546817247e-05,
"loss": 0.0183,
"step": 16720
},
{
"epoch": 0.7114305153937744,
"grad_norm": 0.3658443093299866,
"learning_rate": 2.5906758558518613e-05,
"loss": 0.0183,
"step": 16730
},
{
"epoch": 0.7118557577819357,
"grad_norm": 0.5162744522094727,
"learning_rate": 2.5901081257034706e-05,
"loss": 0.0178,
"step": 16740
},
{
"epoch": 0.712281000170097,
"grad_norm": 0.43483680486679077,
"learning_rate": 2.5895400644090138e-05,
"loss": 0.0207,
"step": 16750
},
{
"epoch": 0.7127062425582582,
"grad_norm": 0.43120190501213074,
"learning_rate": 2.5889716721410535e-05,
"loss": 0.0179,
"step": 16760
},
{
"epoch": 0.7131314849464194,
"grad_norm": 0.49665582180023193,
"learning_rate": 2.5884029490722515e-05,
"loss": 0.0196,
"step": 16770
},
{
"epoch": 0.7135567273345808,
"grad_norm": 0.555846095085144,
"learning_rate": 2.587833895375371e-05,
"loss": 0.0186,
"step": 16780
},
{
"epoch": 0.713981969722742,
"grad_norm": 0.31895700097084045,
"learning_rate": 2.587264511223276e-05,
"loss": 0.0183,
"step": 16790
},
{
"epoch": 0.7144072121109032,
"grad_norm": 0.3290864825248718,
"learning_rate": 2.5866947967889295e-05,
"loss": 0.0214,
"step": 16800
},
{
"epoch": 0.7148324544990645,
"grad_norm": 0.4324344992637634,
"learning_rate": 2.586124752245397e-05,
"loss": 0.02,
"step": 16810
},
{
"epoch": 0.7152576968872257,
"grad_norm": 0.4053352475166321,
"learning_rate": 2.585554377765842e-05,
"loss": 0.0214,
"step": 16820
},
{
"epoch": 0.715682939275387,
"grad_norm": 0.4539511501789093,
"learning_rate": 2.584983673523529e-05,
"loss": 0.0215,
"step": 16830
},
{
"epoch": 0.7161081816635482,
"grad_norm": 0.4536151885986328,
"learning_rate": 2.5844126396918236e-05,
"loss": 0.0183,
"step": 16840
},
{
"epoch": 0.7165334240517095,
"grad_norm": 0.4749014973640442,
"learning_rate": 2.5838412764441905e-05,
"loss": 0.0199,
"step": 16850
},
{
"epoch": 0.7169586664398707,
"grad_norm": 0.3392566442489624,
"learning_rate": 2.583269583954195e-05,
"loss": 0.0193,
"step": 16860
},
{
"epoch": 0.7173839088280319,
"grad_norm": 0.26803597807884216,
"learning_rate": 2.5826975623955017e-05,
"loss": 0.0187,
"step": 16870
},
{
"epoch": 0.7178091512161933,
"grad_norm": 0.37092941999435425,
"learning_rate": 2.5821252119418762e-05,
"loss": 0.0177,
"step": 16880
},
{
"epoch": 0.7182343936043545,
"grad_norm": 0.31138113141059875,
"learning_rate": 2.581552532767183e-05,
"loss": 0.0188,
"step": 16890
},
{
"epoch": 0.7186596359925157,
"grad_norm": 0.3887597322463989,
"learning_rate": 2.580979525045387e-05,
"loss": 0.0223,
"step": 16900
},
{
"epoch": 0.719084878380677,
"grad_norm": 0.3579878807067871,
"learning_rate": 2.580406188950553e-05,
"loss": 0.0181,
"step": 16910
},
{
"epoch": 0.7195101207688382,
"grad_norm": 0.3246123194694519,
"learning_rate": 2.5798325246568453e-05,
"loss": 0.0187,
"step": 16920
},
{
"epoch": 0.7199353631569995,
"grad_norm": 0.4029758870601654,
"learning_rate": 2.579258532338528e-05,
"loss": 0.019,
"step": 16930
},
{
"epoch": 0.7203606055451608,
"grad_norm": 0.3865378499031067,
"learning_rate": 2.5786842121699644e-05,
"loss": 0.0178,
"step": 16940
},
{
"epoch": 0.720785847933322,
"grad_norm": 0.3467353582382202,
"learning_rate": 2.5781095643256186e-05,
"loss": 0.0196,
"step": 16950
},
{
"epoch": 0.7212110903214832,
"grad_norm": 0.42222702503204346,
"learning_rate": 2.577534588980053e-05,
"loss": 0.0193,
"step": 16960
},
{
"epoch": 0.7216363327096444,
"grad_norm": 0.39148077368736267,
"learning_rate": 2.5769592863079298e-05,
"loss": 0.0176,
"step": 16970
},
{
"epoch": 0.7220615750978058,
"grad_norm": 0.3850536048412323,
"learning_rate": 2.576383656484011e-05,
"loss": 0.0199,
"step": 16980
},
{
"epoch": 0.722486817485967,
"grad_norm": 0.40393391251564026,
"learning_rate": 2.5758076996831572e-05,
"loss": 0.0218,
"step": 16990
},
{
"epoch": 0.7229120598741282,
"grad_norm": 0.39605194330215454,
"learning_rate": 2.5752314160803295e-05,
"loss": 0.02,
"step": 17000
},
{
"epoch": 0.7233373022622895,
"grad_norm": 0.39946261048316956,
"learning_rate": 2.5746548058505873e-05,
"loss": 0.0201,
"step": 17010
},
{
"epoch": 0.7237625446504508,
"grad_norm": 0.5638138651847839,
"learning_rate": 2.57407786916909e-05,
"loss": 0.021,
"step": 17020
},
{
"epoch": 0.724187787038612,
"grad_norm": 0.36596256494522095,
"learning_rate": 2.573500606211095e-05,
"loss": 0.0186,
"step": 17030
},
{
"epoch": 0.7246130294267733,
"grad_norm": 0.5323764085769653,
"learning_rate": 2.57292301715196e-05,
"loss": 0.0202,
"step": 17040
},
{
"epoch": 0.7250382718149345,
"grad_norm": 0.4261687099933624,
"learning_rate": 2.572345102167142e-05,
"loss": 0.018,
"step": 17050
},
{
"epoch": 0.7254635142030957,
"grad_norm": 0.44146978855133057,
"learning_rate": 2.571766861432195e-05,
"loss": 0.021,
"step": 17060
},
{
"epoch": 0.7258887565912571,
"grad_norm": 0.36145922541618347,
"learning_rate": 2.5711882951227738e-05,
"loss": 0.018,
"step": 17070
},
{
"epoch": 0.7263139989794183,
"grad_norm": 0.4039449393749237,
"learning_rate": 2.570609403414632e-05,
"loss": 0.0198,
"step": 17080
},
{
"epoch": 0.7267392413675795,
"grad_norm": 0.42143484950065613,
"learning_rate": 2.5700301864836212e-05,
"loss": 0.0195,
"step": 17090
},
{
"epoch": 0.7271644837557407,
"grad_norm": 0.5072124600410461,
"learning_rate": 2.569450644505692e-05,
"loss": 0.02,
"step": 17100
},
{
"epoch": 0.727589726143902,
"grad_norm": 0.418938547372818,
"learning_rate": 2.5688707776568943e-05,
"loss": 0.0213,
"step": 17110
},
{
"epoch": 0.7280149685320633,
"grad_norm": 0.3790704607963562,
"learning_rate": 2.5682905861133764e-05,
"loss": 0.0189,
"step": 17120
},
{
"epoch": 0.7284402109202245,
"grad_norm": 0.42760220170021057,
"learning_rate": 2.567710070051385e-05,
"loss": 0.0186,
"step": 17130
},
{
"epoch": 0.7288654533083858,
"grad_norm": 0.4262620210647583,
"learning_rate": 2.5671292296472652e-05,
"loss": 0.0205,
"step": 17140
},
{
"epoch": 0.729290695696547,
"grad_norm": 0.35412681102752686,
"learning_rate": 2.5665480650774616e-05,
"loss": 0.0194,
"step": 17150
},
{
"epoch": 0.7297159380847082,
"grad_norm": 0.461956262588501,
"learning_rate": 2.565966576518517e-05,
"loss": 0.0216,
"step": 17160
},
{
"epoch": 0.7301411804728696,
"grad_norm": 0.4576774537563324,
"learning_rate": 2.5653847641470706e-05,
"loss": 0.0194,
"step": 17170
},
{
"epoch": 0.7305664228610308,
"grad_norm": 0.4432138502597809,
"learning_rate": 2.564802628139863e-05,
"loss": 0.021,
"step": 17180
},
{
"epoch": 0.730991665249192,
"grad_norm": 0.343555212020874,
"learning_rate": 2.5642201686737318e-05,
"loss": 0.0201,
"step": 17190
},
{
"epoch": 0.7314169076373533,
"grad_norm": 0.49047133326530457,
"learning_rate": 2.563637385925612e-05,
"loss": 0.0186,
"step": 17200
},
{
"epoch": 0.7318421500255146,
"grad_norm": 0.40630829334259033,
"learning_rate": 2.5630542800725388e-05,
"loss": 0.02,
"step": 17210
},
{
"epoch": 0.7322673924136758,
"grad_norm": 0.5432057976722717,
"learning_rate": 2.562470851291643e-05,
"loss": 0.0218,
"step": 17220
},
{
"epoch": 0.732692634801837,
"grad_norm": 0.4478088617324829,
"learning_rate": 2.561887099760156e-05,
"loss": 0.019,
"step": 17230
},
{
"epoch": 0.7331178771899983,
"grad_norm": 0.3950343430042267,
"learning_rate": 2.5613030256554054e-05,
"loss": 0.0187,
"step": 17240
},
{
"epoch": 0.7335431195781595,
"grad_norm": 0.3851512670516968,
"learning_rate": 2.5607186291548174e-05,
"loss": 0.0194,
"step": 17250
},
{
"epoch": 0.7339683619663208,
"grad_norm": 0.38966938853263855,
"learning_rate": 2.5601339104359167e-05,
"loss": 0.0194,
"step": 17260
},
{
"epoch": 0.7343936043544821,
"grad_norm": 0.4339113235473633,
"learning_rate": 2.559548869676325e-05,
"loss": 0.0167,
"step": 17270
},
{
"epoch": 0.7348188467426433,
"grad_norm": 0.5083813667297363,
"learning_rate": 2.558963507053763e-05,
"loss": 0.0195,
"step": 17280
},
{
"epoch": 0.7352440891308045,
"grad_norm": 0.37444162368774414,
"learning_rate": 2.5583778227460476e-05,
"loss": 0.0171,
"step": 17290
},
{
"epoch": 0.7356693315189659,
"grad_norm": 0.3342028856277466,
"learning_rate": 2.5577918169310943e-05,
"loss": 0.0214,
"step": 17300
},
{
"epoch": 0.7360945739071271,
"grad_norm": 0.38764867186546326,
"learning_rate": 2.5572054897869167e-05,
"loss": 0.0192,
"step": 17310
},
{
"epoch": 0.7365198162952883,
"grad_norm": 0.457276314496994,
"learning_rate": 2.556618841491625e-05,
"loss": 0.0195,
"step": 17320
},
{
"epoch": 0.7369450586834496,
"grad_norm": 0.3450474441051483,
"learning_rate": 2.5560318722234276e-05,
"loss": 0.0195,
"step": 17330
},
{
"epoch": 0.7373703010716108,
"grad_norm": 0.3469228446483612,
"learning_rate": 2.5554445821606306e-05,
"loss": 0.0169,
"step": 17340
},
{
"epoch": 0.737795543459772,
"grad_norm": 0.45763877034187317,
"learning_rate": 2.554856971481637e-05,
"loss": 0.0183,
"step": 17350
},
{
"epoch": 0.7382207858479333,
"grad_norm": 0.44012799859046936,
"learning_rate": 2.554269040364947e-05,
"loss": 0.0173,
"step": 17360
},
{
"epoch": 0.7386460282360946,
"grad_norm": 0.40965408086776733,
"learning_rate": 2.553680788989159e-05,
"loss": 0.019,
"step": 17370
},
{
"epoch": 0.7390712706242558,
"grad_norm": 0.3658427596092224,
"learning_rate": 2.5530922175329677e-05,
"loss": 0.0174,
"step": 17380
},
{
"epoch": 0.739496513012417,
"grad_norm": 0.4212437570095062,
"learning_rate": 2.5525033261751667e-05,
"loss": 0.0184,
"step": 17390
},
{
"epoch": 0.7399217554005784,
"grad_norm": 0.3762251138687134,
"learning_rate": 2.5519141150946445e-05,
"loss": 0.0224,
"step": 17400
},
{
"epoch": 0.7403469977887396,
"grad_norm": 0.40899696946144104,
"learning_rate": 2.551324584470389e-05,
"loss": 0.0187,
"step": 17410
},
{
"epoch": 0.7407722401769008,
"grad_norm": 0.3483990728855133,
"learning_rate": 2.5507347344814825e-05,
"loss": 0.0186,
"step": 17420
},
{
"epoch": 0.7411974825650621,
"grad_norm": 0.3548679053783417,
"learning_rate": 2.5501445653071072e-05,
"loss": 0.0188,
"step": 17430
},
{
"epoch": 0.7416227249532233,
"grad_norm": 0.3296952247619629,
"learning_rate": 2.5495540771265406e-05,
"loss": 0.0168,
"step": 17440
},
{
"epoch": 0.7420479673413846,
"grad_norm": 0.3904550075531006,
"learning_rate": 2.5489632701191565e-05,
"loss": 0.0172,
"step": 17450
},
{
"epoch": 0.7424732097295459,
"grad_norm": 0.4135804772377014,
"learning_rate": 2.5483721444644276e-05,
"loss": 0.0202,
"step": 17460
},
{
"epoch": 0.7428984521177071,
"grad_norm": 0.44280630350112915,
"learning_rate": 2.5477807003419222e-05,
"loss": 0.0197,
"step": 17470
},
{
"epoch": 0.7433236945058683,
"grad_norm": 0.394596666097641,
"learning_rate": 2.547188937931305e-05,
"loss": 0.0182,
"step": 17480
},
{
"epoch": 0.7437489368940295,
"grad_norm": 0.37119102478027344,
"learning_rate": 2.5465968574123372e-05,
"loss": 0.0191,
"step": 17490
},
{
"epoch": 0.7441741792821909,
"grad_norm": 0.2509319484233856,
"learning_rate": 2.5460044589648787e-05,
"loss": 0.0172,
"step": 17500
},
{
"epoch": 0.7445994216703521,
"grad_norm": 0.3507034480571747,
"learning_rate": 2.5454117427688833e-05,
"loss": 0.0169,
"step": 17510
},
{
"epoch": 0.7450246640585133,
"grad_norm": 0.36366188526153564,
"learning_rate": 2.5448187090044033e-05,
"loss": 0.0209,
"step": 17520
},
{
"epoch": 0.7454499064466746,
"grad_norm": 0.37416818737983704,
"learning_rate": 2.544225357851586e-05,
"loss": 0.021,
"step": 17530
},
{
"epoch": 0.7458751488348359,
"grad_norm": 0.42603954672813416,
"learning_rate": 2.543631689490676e-05,
"loss": 0.0205,
"step": 17540
},
{
"epoch": 0.7463003912229971,
"grad_norm": 0.3621084988117218,
"learning_rate": 2.543037704102015e-05,
"loss": 0.0191,
"step": 17550
},
{
"epoch": 0.7467256336111584,
"grad_norm": 0.5240678787231445,
"learning_rate": 2.5424434018660386e-05,
"loss": 0.0205,
"step": 17560
},
{
"epoch": 0.7471508759993196,
"grad_norm": 0.2871054410934448,
"learning_rate": 2.541848782963281e-05,
"loss": 0.0217,
"step": 17570
},
{
"epoch": 0.7475761183874808,
"grad_norm": 0.4009586274623871,
"learning_rate": 2.5412538475743714e-05,
"loss": 0.0198,
"step": 17580
},
{
"epoch": 0.7480013607756422,
"grad_norm": 0.3869740068912506,
"learning_rate": 2.540658595880036e-05,
"loss": 0.0197,
"step": 17590
},
{
"epoch": 0.7484266031638034,
"grad_norm": 0.35922184586524963,
"learning_rate": 2.540063028061096e-05,
"loss": 0.0195,
"step": 17600
},
{
"epoch": 0.7488518455519646,
"grad_norm": 0.32907354831695557,
"learning_rate": 2.5394671442984692e-05,
"loss": 0.0178,
"step": 17610
},
{
"epoch": 0.7492770879401258,
"grad_norm": 0.4469134211540222,
"learning_rate": 2.5388709447731696e-05,
"loss": 0.0157,
"step": 17620
},
{
"epoch": 0.7497023303282871,
"grad_norm": 0.38923630118370056,
"learning_rate": 2.5382744296663068e-05,
"loss": 0.0205,
"step": 17630
},
{
"epoch": 0.7501275727164484,
"grad_norm": 0.5383104681968689,
"learning_rate": 2.5376775991590865e-05,
"loss": 0.0212,
"step": 17640
},
{
"epoch": 0.7505528151046096,
"grad_norm": 0.3489135205745697,
"learning_rate": 2.5370804534328097e-05,
"loss": 0.0205,
"step": 17650
},
{
"epoch": 0.7509780574927709,
"grad_norm": 0.3815806806087494,
"learning_rate": 2.5364829926688736e-05,
"loss": 0.0207,
"step": 17660
},
{
"epoch": 0.7514032998809321,
"grad_norm": 0.22493062913417816,
"learning_rate": 2.5358852170487713e-05,
"loss": 0.0166,
"step": 17670
},
{
"epoch": 0.7518285422690933,
"grad_norm": 0.39472290873527527,
"learning_rate": 2.5352871267540906e-05,
"loss": 0.0178,
"step": 17680
},
{
"epoch": 0.7522537846572547,
"grad_norm": 0.444985032081604,
"learning_rate": 2.534688721966516e-05,
"loss": 0.021,
"step": 17690
},
{
"epoch": 0.7526790270454159,
"grad_norm": 0.3079502284526825,
"learning_rate": 2.5340900028678276e-05,
"loss": 0.0192,
"step": 17700
},
{
"epoch": 0.7531042694335771,
"grad_norm": 0.27846136689186096,
"learning_rate": 2.5334909696398998e-05,
"loss": 0.0182,
"step": 17710
},
{
"epoch": 0.7535295118217384,
"grad_norm": 0.3500784635543823,
"learning_rate": 2.5328916224647027e-05,
"loss": 0.0178,
"step": 17720
},
{
"epoch": 0.7539547542098997,
"grad_norm": 0.4569058418273926,
"learning_rate": 2.532291961524303e-05,
"loss": 0.0185,
"step": 17730
},
{
"epoch": 0.7543799965980609,
"grad_norm": 0.34410810470581055,
"learning_rate": 2.531691987000861e-05,
"loss": 0.0191,
"step": 17740
},
{
"epoch": 0.7548052389862221,
"grad_norm": 0.38633546233177185,
"learning_rate": 2.5310916990766343e-05,
"loss": 0.0184,
"step": 17750
},
{
"epoch": 0.7552304813743834,
"grad_norm": 0.37940606474876404,
"learning_rate": 2.5304910979339734e-05,
"loss": 0.0174,
"step": 17760
},
{
"epoch": 0.7556557237625446,
"grad_norm": 0.3326164484024048,
"learning_rate": 2.5298901837553255e-05,
"loss": 0.0171,
"step": 17770
},
{
"epoch": 0.7560809661507059,
"grad_norm": 0.33612585067749023,
"learning_rate": 2.5292889567232326e-05,
"loss": 0.0189,
"step": 17780
},
{
"epoch": 0.7565062085388672,
"grad_norm": 0.3002769649028778,
"learning_rate": 2.528687417020331e-05,
"loss": 0.0178,
"step": 17790
},
{
"epoch": 0.7569314509270284,
"grad_norm": 0.5658273696899414,
"learning_rate": 2.5280855648293536e-05,
"loss": 0.0178,
"step": 17800
},
{
"epoch": 0.7573566933151896,
"grad_norm": 0.2587268352508545,
"learning_rate": 2.5274834003331266e-05,
"loss": 0.019,
"step": 17810
},
{
"epoch": 0.757781935703351,
"grad_norm": 0.3134019374847412,
"learning_rate": 2.5268809237145717e-05,
"loss": 0.0195,
"step": 17820
},
{
"epoch": 0.7582071780915122,
"grad_norm": 0.34614887833595276,
"learning_rate": 2.5262781351567052e-05,
"loss": 0.017,
"step": 17830
},
{
"epoch": 0.7586324204796734,
"grad_norm": 0.4725238084793091,
"learning_rate": 2.525675034842638e-05,
"loss": 0.0207,
"step": 17840
},
{
"epoch": 0.7590576628678347,
"grad_norm": 0.3761941194534302,
"learning_rate": 2.5250716229555774e-05,
"loss": 0.0185,
"step": 17850
},
{
"epoch": 0.7594829052559959,
"grad_norm": 0.35306262969970703,
"learning_rate": 2.524467899678823e-05,
"loss": 0.0188,
"step": 17860
},
{
"epoch": 0.7599081476441572,
"grad_norm": 0.4369204342365265,
"learning_rate": 2.52386386519577e-05,
"loss": 0.0205,
"step": 17870
},
{
"epoch": 0.7603333900323184,
"grad_norm": 0.35953933000564575,
"learning_rate": 2.5232595196899086e-05,
"loss": 0.0191,
"step": 17880
},
{
"epoch": 0.7607586324204797,
"grad_norm": 0.28581809997558594,
"learning_rate": 2.5226548633448224e-05,
"loss": 0.0175,
"step": 17890
},
{
"epoch": 0.7611838748086409,
"grad_norm": 0.3392051160335541,
"learning_rate": 2.5220498963441906e-05,
"loss": 0.0181,
"step": 17900
},
{
"epoch": 0.7616091171968021,
"grad_norm": 0.3997742533683777,
"learning_rate": 2.521444618871786e-05,
"loss": 0.019,
"step": 17910
},
{
"epoch": 0.7620343595849635,
"grad_norm": 0.5108124613761902,
"learning_rate": 2.5208390311114758e-05,
"loss": 0.0206,
"step": 17920
},
{
"epoch": 0.7624596019731247,
"grad_norm": 0.37691330909729004,
"learning_rate": 2.520233133247221e-05,
"loss": 0.0197,
"step": 17930
},
{
"epoch": 0.7628848443612859,
"grad_norm": 0.4904385507106781,
"learning_rate": 2.519626925463079e-05,
"loss": 0.0171,
"step": 17940
},
{
"epoch": 0.7633100867494472,
"grad_norm": 0.43569111824035645,
"learning_rate": 2.519020407943198e-05,
"loss": 0.0178,
"step": 17950
},
{
"epoch": 0.7637353291376084,
"grad_norm": 0.4185038208961487,
"learning_rate": 2.518413580871823e-05,
"loss": 0.0175,
"step": 17960
},
{
"epoch": 0.7641605715257697,
"grad_norm": 0.3519199788570404,
"learning_rate": 2.5178064444332922e-05,
"loss": 0.0186,
"step": 17970
},
{
"epoch": 0.764585813913931,
"grad_norm": 0.40214109420776367,
"learning_rate": 2.5171989988120368e-05,
"loss": 0.016,
"step": 17980
},
{
"epoch": 0.7650110563020922,
"grad_norm": 0.46126723289489746,
"learning_rate": 2.5165912441925832e-05,
"loss": 0.0169,
"step": 17990
},
{
"epoch": 0.7654362986902534,
"grad_norm": 0.30911028385162354,
"learning_rate": 2.515983180759551e-05,
"loss": 0.0172,
"step": 18000
},
{
"epoch": 0.7658615410784146,
"grad_norm": 0.4340876042842865,
"learning_rate": 2.515374808697654e-05,
"loss": 0.0189,
"step": 18010
},
{
"epoch": 0.766286783466576,
"grad_norm": 0.30016154050827026,
"learning_rate": 2.5147661281916996e-05,
"loss": 0.017,
"step": 18020
},
{
"epoch": 0.7667120258547372,
"grad_norm": 0.27429714798927307,
"learning_rate": 2.5141571394265892e-05,
"loss": 0.0183,
"step": 18030
},
{
"epoch": 0.7671372682428984,
"grad_norm": 0.4100990295410156,
"learning_rate": 2.513547842587317e-05,
"loss": 0.0185,
"step": 18040
},
{
"epoch": 0.7675625106310597,
"grad_norm": 0.38406017422676086,
"learning_rate": 2.5129382378589708e-05,
"loss": 0.0192,
"step": 18050
},
{
"epoch": 0.767987753019221,
"grad_norm": 0.4918137788772583,
"learning_rate": 2.512328325426733e-05,
"loss": 0.0169,
"step": 18060
},
{
"epoch": 0.7684129954073822,
"grad_norm": 0.46917903423309326,
"learning_rate": 2.5117181054758798e-05,
"loss": 0.0191,
"step": 18070
},
{
"epoch": 0.7688382377955435,
"grad_norm": 0.3494175374507904,
"learning_rate": 2.5111075781917783e-05,
"loss": 0.0184,
"step": 18080
},
{
"epoch": 0.7692634801837047,
"grad_norm": 0.3880108594894409,
"learning_rate": 2.510496743759892e-05,
"loss": 0.0198,
"step": 18090
},
{
"epoch": 0.7696887225718659,
"grad_norm": 0.28338170051574707,
"learning_rate": 2.509885602365775e-05,
"loss": 0.0171,
"step": 18100
},
{
"epoch": 0.7701139649600273,
"grad_norm": 0.3353395164012909,
"learning_rate": 2.509274154195076e-05,
"loss": 0.0183,
"step": 18110
},
{
"epoch": 0.7705392073481885,
"grad_norm": 0.3458663523197174,
"learning_rate": 2.5086623994335383e-05,
"loss": 0.0172,
"step": 18120
},
{
"epoch": 0.7709644497363497,
"grad_norm": 0.36026453971862793,
"learning_rate": 2.5080503382669953e-05,
"loss": 0.0199,
"step": 18130
},
{
"epoch": 0.7713896921245109,
"grad_norm": 0.38520294427871704,
"learning_rate": 2.507437970881376e-05,
"loss": 0.0222,
"step": 18140
},
{
"epoch": 0.7718149345126722,
"grad_norm": 0.4084884822368622,
"learning_rate": 2.5068252974627003e-05,
"loss": 0.0168,
"step": 18150
},
{
"epoch": 0.7722401769008335,
"grad_norm": 0.3372369408607483,
"learning_rate": 2.5062123181970834e-05,
"loss": 0.0189,
"step": 18160
},
{
"epoch": 0.7726654192889947,
"grad_norm": 0.36237940192222595,
"learning_rate": 2.5055990332707316e-05,
"loss": 0.0236,
"step": 18170
},
{
"epoch": 0.773090661677156,
"grad_norm": 0.395637571811676,
"learning_rate": 2.5049854428699444e-05,
"loss": 0.018,
"step": 18180
},
{
"epoch": 0.7735159040653172,
"grad_norm": 0.3573133051395416,
"learning_rate": 2.5043715471811158e-05,
"loss": 0.0191,
"step": 18190
},
{
"epoch": 0.7739411464534784,
"grad_norm": 0.48093611001968384,
"learning_rate": 2.5037573463907296e-05,
"loss": 0.0165,
"step": 18200
},
{
"epoch": 0.7743663888416398,
"grad_norm": 0.3498045802116394,
"learning_rate": 2.5031428406853637e-05,
"loss": 0.0181,
"step": 18210
},
{
"epoch": 0.774791631229801,
"grad_norm": 0.38417303562164307,
"learning_rate": 2.5025280302516897e-05,
"loss": 0.0204,
"step": 18220
},
{
"epoch": 0.7752168736179622,
"grad_norm": 0.3889990746974945,
"learning_rate": 2.5019129152764698e-05,
"loss": 0.0192,
"step": 18230
},
{
"epoch": 0.7756421160061235,
"grad_norm": 0.4350021481513977,
"learning_rate": 2.5012974959465612e-05,
"loss": 0.0178,
"step": 18240
},
{
"epoch": 0.7760673583942848,
"grad_norm": 0.3341920077800751,
"learning_rate": 2.5006817724489105e-05,
"loss": 0.0174,
"step": 18250
},
{
"epoch": 0.776492600782446,
"grad_norm": 0.3362509608268738,
"learning_rate": 2.500065744970559e-05,
"loss": 0.0176,
"step": 18260
},
{
"epoch": 0.7769178431706072,
"grad_norm": 0.3553008735179901,
"learning_rate": 2.499449413698639e-05,
"loss": 0.0189,
"step": 18270
},
{
"epoch": 0.7773430855587685,
"grad_norm": 0.3581770658493042,
"learning_rate": 2.4988327788203764e-05,
"loss": 0.0193,
"step": 18280
},
{
"epoch": 0.7777683279469297,
"grad_norm": 0.4561096131801605,
"learning_rate": 2.498215840523088e-05,
"loss": 0.019,
"step": 18290
},
{
"epoch": 0.778193570335091,
"grad_norm": 0.2999465763568878,
"learning_rate": 2.4975985989941837e-05,
"loss": 0.0189,
"step": 18300
},
{
"epoch": 0.7786188127232523,
"grad_norm": 0.36827707290649414,
"learning_rate": 2.4969810544211652e-05,
"loss": 0.0196,
"step": 18310
},
{
"epoch": 0.7790440551114135,
"grad_norm": 0.3937795162200928,
"learning_rate": 2.4963632069916258e-05,
"loss": 0.0171,
"step": 18320
},
{
"epoch": 0.7794692974995747,
"grad_norm": 0.37284278869628906,
"learning_rate": 2.495745056893252e-05,
"loss": 0.0204,
"step": 18330
},
{
"epoch": 0.779894539887736,
"grad_norm": 0.33898916840553284,
"learning_rate": 2.495126604313821e-05,
"loss": 0.0183,
"step": 18340
},
{
"epoch": 0.7803197822758973,
"grad_norm": 0.3953031003475189,
"learning_rate": 2.4945078494412023e-05,
"loss": 0.0176,
"step": 18350
},
{
"epoch": 0.7807450246640585,
"grad_norm": 0.3483516275882721,
"learning_rate": 2.493888792463357e-05,
"loss": 0.0173,
"step": 18360
},
{
"epoch": 0.7811702670522198,
"grad_norm": 0.3744226098060608,
"learning_rate": 2.4932694335683395e-05,
"loss": 0.0173,
"step": 18370
},
{
"epoch": 0.781595509440381,
"grad_norm": 0.3779127299785614,
"learning_rate": 2.4926497729442932e-05,
"loss": 0.0203,
"step": 18380
},
{
"epoch": 0.7820207518285422,
"grad_norm": 0.3308951258659363,
"learning_rate": 2.4920298107794555e-05,
"loss": 0.018,
"step": 18390
},
{
"epoch": 0.7824459942167035,
"grad_norm": 0.3206545412540436,
"learning_rate": 2.4914095472621544e-05,
"loss": 0.0184,
"step": 18400
},
{
"epoch": 0.7828712366048648,
"grad_norm": 0.5378426909446716,
"learning_rate": 2.4907889825808093e-05,
"loss": 0.0217,
"step": 18410
},
{
"epoch": 0.783296478993026,
"grad_norm": 0.3933923840522766,
"learning_rate": 2.4901681169239314e-05,
"loss": 0.0174,
"step": 18420
},
{
"epoch": 0.7837217213811872,
"grad_norm": 0.3201284110546112,
"learning_rate": 2.4895469504801236e-05,
"loss": 0.0204,
"step": 18430
},
{
"epoch": 0.7841469637693486,
"grad_norm": 0.44616612792015076,
"learning_rate": 2.4889254834380802e-05,
"loss": 0.0174,
"step": 18440
},
{
"epoch": 0.7845722061575098,
"grad_norm": 0.33565399050712585,
"learning_rate": 2.488303715986585e-05,
"loss": 0.0185,
"step": 18450
},
{
"epoch": 0.784997448545671,
"grad_norm": 0.2184840440750122,
"learning_rate": 2.4876816483145166e-05,
"loss": 0.0186,
"step": 18460
},
{
"epoch": 0.7854226909338323,
"grad_norm": 0.2786431312561035,
"learning_rate": 2.487059280610841e-05,
"loss": 0.0171,
"step": 18470
},
{
"epoch": 0.7858479333219935,
"grad_norm": 0.3800705671310425,
"learning_rate": 2.4864366130646178e-05,
"loss": 0.0165,
"step": 18480
},
{
"epoch": 0.7862731757101548,
"grad_norm": 0.3747217059135437,
"learning_rate": 2.485813645864997e-05,
"loss": 0.0182,
"step": 18490
},
{
"epoch": 0.7866984180983161,
"grad_norm": 0.3308454751968384,
"learning_rate": 2.4851903792012198e-05,
"loss": 0.0176,
"step": 18500
},
{
"epoch": 0.7871236604864773,
"grad_norm": 0.47512272000312805,
"learning_rate": 2.4845668132626173e-05,
"loss": 0.0207,
"step": 18510
},
{
"epoch": 0.7875489028746385,
"grad_norm": 0.2912342846393585,
"learning_rate": 2.483942948238613e-05,
"loss": 0.0201,
"step": 18520
},
{
"epoch": 0.7879741452627997,
"grad_norm": 0.38101184368133545,
"learning_rate": 2.4833187843187207e-05,
"loss": 0.0173,
"step": 18530
},
{
"epoch": 0.7883993876509611,
"grad_norm": 0.42462605237960815,
"learning_rate": 2.4826943216925448e-05,
"loss": 0.0204,
"step": 18540
},
{
"epoch": 0.7888246300391223,
"grad_norm": 0.4753773808479309,
"learning_rate": 2.4820695605497807e-05,
"loss": 0.018,
"step": 18550
},
{
"epoch": 0.7892498724272835,
"grad_norm": 0.5159977078437805,
"learning_rate": 2.481444501080214e-05,
"loss": 0.0182,
"step": 18560
},
{
"epoch": 0.7896751148154448,
"grad_norm": 0.4189545512199402,
"learning_rate": 2.4808191434737217e-05,
"loss": 0.0214,
"step": 18570
},
{
"epoch": 0.790100357203606,
"grad_norm": 0.2559889554977417,
"learning_rate": 2.4801934879202696e-05,
"loss": 0.0158,
"step": 18580
},
{
"epoch": 0.7905255995917673,
"grad_norm": 0.6500698924064636,
"learning_rate": 2.4795675346099172e-05,
"loss": 0.0202,
"step": 18590
},
{
"epoch": 0.7909508419799286,
"grad_norm": 0.29353052377700806,
"learning_rate": 2.4789412837328114e-05,
"loss": 0.019,
"step": 18600
},
{
"epoch": 0.7913760843680898,
"grad_norm": 0.39966511726379395,
"learning_rate": 2.4783147354791915e-05,
"loss": 0.019,
"step": 18610
},
{
"epoch": 0.791801326756251,
"grad_norm": 0.33445870876312256,
"learning_rate": 2.4776878900393858e-05,
"loss": 0.0168,
"step": 18620
},
{
"epoch": 0.7922265691444124,
"grad_norm": 0.33083635568618774,
"learning_rate": 2.477060747603813e-05,
"loss": 0.0178,
"step": 18630
},
{
"epoch": 0.7926518115325736,
"grad_norm": 0.2751755118370056,
"learning_rate": 2.4764333083629833e-05,
"loss": 0.0176,
"step": 18640
},
{
"epoch": 0.7930770539207348,
"grad_norm": 0.25697383284568787,
"learning_rate": 2.4758055725074954e-05,
"loss": 0.0178,
"step": 18650
},
{
"epoch": 0.793502296308896,
"grad_norm": 0.3238734006881714,
"learning_rate": 2.4751775402280396e-05,
"loss": 0.0173,
"step": 18660
},
{
"epoch": 0.7939275386970573,
"grad_norm": 0.32385683059692383,
"learning_rate": 2.4745492117153947e-05,
"loss": 0.0163,
"step": 18670
},
{
"epoch": 0.7943527810852186,
"grad_norm": 0.4163258671760559,
"learning_rate": 2.473920587160431e-05,
"loss": 0.0198,
"step": 18680
},
{
"epoch": 0.7947780234733798,
"grad_norm": 0.3122082054615021,
"learning_rate": 2.473291666754108e-05,
"loss": 0.0191,
"step": 18690
},
{
"epoch": 0.7952032658615411,
"grad_norm": 0.3161797821521759,
"learning_rate": 2.4726624506874748e-05,
"loss": 0.0181,
"step": 18700
},
{
"epoch": 0.7956285082497023,
"grad_norm": 0.3504398465156555,
"learning_rate": 2.4720329391516708e-05,
"loss": 0.0179,
"step": 18710
},
{
"epoch": 0.7960537506378635,
"grad_norm": 0.28261467814445496,
"learning_rate": 2.4714031323379248e-05,
"loss": 0.0176,
"step": 18720
},
{
"epoch": 0.7964789930260249,
"grad_norm": 0.37667712569236755,
"learning_rate": 2.4707730304375556e-05,
"loss": 0.0166,
"step": 18730
},
{
"epoch": 0.7969042354141861,
"grad_norm": 0.40282124280929565,
"learning_rate": 2.4701426336419713e-05,
"loss": 0.0204,
"step": 18740
},
{
"epoch": 0.7973294778023473,
"grad_norm": 0.3857354521751404,
"learning_rate": 2.4695119421426707e-05,
"loss": 0.0183,
"step": 18750
},
{
"epoch": 0.7977547201905086,
"grad_norm": 0.32577598094940186,
"learning_rate": 2.46888095613124e-05,
"loss": 0.0156,
"step": 18760
},
{
"epoch": 0.7981799625786699,
"grad_norm": 0.31286710500717163,
"learning_rate": 2.468249675799357e-05,
"loss": 0.0187,
"step": 18770
},
{
"epoch": 0.7986052049668311,
"grad_norm": 0.29057997465133667,
"learning_rate": 2.467618101338787e-05,
"loss": 0.0186,
"step": 18780
},
{
"epoch": 0.7990304473549923,
"grad_norm": 0.32208725810050964,
"learning_rate": 2.466986232941387e-05,
"loss": 0.0179,
"step": 18790
},
{
"epoch": 0.7994556897431536,
"grad_norm": 0.42681917548179626,
"learning_rate": 2.466354070799101e-05,
"loss": 0.0185,
"step": 18800
},
{
"epoch": 0.7998809321313148,
"grad_norm": 0.389090895652771,
"learning_rate": 2.4657216151039634e-05,
"loss": 0.0182,
"step": 18810
},
{
"epoch": 0.800306174519476,
"grad_norm": 0.338835209608078,
"learning_rate": 2.4650888660480976e-05,
"loss": 0.0192,
"step": 18820
},
{
"epoch": 0.8007314169076374,
"grad_norm": 0.39491820335388184,
"learning_rate": 2.464455823823716e-05,
"loss": 0.0188,
"step": 18830
},
{
"epoch": 0.8011566592957986,
"grad_norm": 0.3796221911907196,
"learning_rate": 2.4638224886231196e-05,
"loss": 0.0203,
"step": 18840
},
{
"epoch": 0.8015819016839598,
"grad_norm": 0.4522951543331146,
"learning_rate": 2.4631888606387e-05,
"loss": 0.0196,
"step": 18850
},
{
"epoch": 0.8020071440721211,
"grad_norm": 0.32392099499702454,
"learning_rate": 2.4625549400629356e-05,
"loss": 0.0179,
"step": 18860
},
{
"epoch": 0.8024323864602824,
"grad_norm": 0.36404240131378174,
"learning_rate": 2.4619207270883958e-05,
"loss": 0.0166,
"step": 18870
},
{
"epoch": 0.8028576288484436,
"grad_norm": 0.31685033440589905,
"learning_rate": 2.461286221907737e-05,
"loss": 0.0177,
"step": 18880
},
{
"epoch": 0.8032828712366049,
"grad_norm": 0.35425153374671936,
"learning_rate": 2.460651424713705e-05,
"loss": 0.0184,
"step": 18890
},
{
"epoch": 0.8037081136247661,
"grad_norm": 0.3561999201774597,
"learning_rate": 2.4600163356991347e-05,
"loss": 0.0174,
"step": 18900
},
{
"epoch": 0.8041333560129273,
"grad_norm": 0.3638227880001068,
"learning_rate": 2.4593809550569498e-05,
"loss": 0.0169,
"step": 18910
},
{
"epoch": 0.8045585984010887,
"grad_norm": 0.3247460722923279,
"learning_rate": 2.4587452829801614e-05,
"loss": 0.0217,
"step": 18920
},
{
"epoch": 0.8049838407892499,
"grad_norm": 0.34640103578567505,
"learning_rate": 2.45810931966187e-05,
"loss": 0.017,
"step": 18930
},
{
"epoch": 0.8054090831774111,
"grad_norm": 0.2949196696281433,
"learning_rate": 2.457473065295265e-05,
"loss": 0.0164,
"step": 18940
},
{
"epoch": 0.8058343255655723,
"grad_norm": 0.2806178033351898,
"learning_rate": 2.4568365200736232e-05,
"loss": 0.0184,
"step": 18950
},
{
"epoch": 0.8062595679537337,
"grad_norm": 0.38688793778419495,
"learning_rate": 2.4561996841903104e-05,
"loss": 0.0184,
"step": 18960
},
{
"epoch": 0.8066848103418949,
"grad_norm": 0.294419527053833,
"learning_rate": 2.455562557838781e-05,
"loss": 0.0202,
"step": 18970
},
{
"epoch": 0.8071100527300561,
"grad_norm": 0.39870259165763855,
"learning_rate": 2.4549251412125762e-05,
"loss": 0.0194,
"step": 18980
},
{
"epoch": 0.8075352951182174,
"grad_norm": 0.35803669691085815,
"learning_rate": 2.454287434505327e-05,
"loss": 0.019,
"step": 18990
},
{
"epoch": 0.8079605375063786,
"grad_norm": 0.3306076228618622,
"learning_rate": 2.4536494379107514e-05,
"loss": 0.0173,
"step": 19000
},
{
"epoch": 0.8083857798945399,
"grad_norm": 0.31047430634498596,
"learning_rate": 2.453011151622657e-05,
"loss": 0.0163,
"step": 19010
},
{
"epoch": 0.8088110222827012,
"grad_norm": 0.45618563890457153,
"learning_rate": 2.452372575834937e-05,
"loss": 0.0178,
"step": 19020
},
{
"epoch": 0.8092362646708624,
"grad_norm": 0.3509865701198578,
"learning_rate": 2.4517337107415742e-05,
"loss": 0.0185,
"step": 19030
},
{
"epoch": 0.8096615070590236,
"grad_norm": 0.40352803468704224,
"learning_rate": 2.4510945565366397e-05,
"loss": 0.0197,
"step": 19040
},
{
"epoch": 0.810086749447185,
"grad_norm": 0.2777395248413086,
"learning_rate": 2.4504551134142905e-05,
"loss": 0.0183,
"step": 19050
},
{
"epoch": 0.8105119918353462,
"grad_norm": 0.43311402201652527,
"learning_rate": 2.4498153815687738e-05,
"loss": 0.021,
"step": 19060
},
{
"epoch": 0.8109372342235074,
"grad_norm": 0.3267991840839386,
"learning_rate": 2.4491753611944224e-05,
"loss": 0.0182,
"step": 19070
},
{
"epoch": 0.8113624766116686,
"grad_norm": 0.37511321902275085,
"learning_rate": 2.4485350524856577e-05,
"loss": 0.0183,
"step": 19080
},
{
"epoch": 0.8117877189998299,
"grad_norm": 0.33489423990249634,
"learning_rate": 2.4478944556369886e-05,
"loss": 0.0184,
"step": 19090
},
{
"epoch": 0.8122129613879912,
"grad_norm": 0.30842292308807373,
"learning_rate": 2.4472535708430116e-05,
"loss": 0.0194,
"step": 19100
},
{
"epoch": 0.8126382037761524,
"grad_norm": 0.4636704921722412,
"learning_rate": 2.4466123982984103e-05,
"loss": 0.022,
"step": 19110
},
{
"epoch": 0.8130634461643137,
"grad_norm": 0.3947596848011017,
"learning_rate": 2.445970938197957e-05,
"loss": 0.0193,
"step": 19120
},
{
"epoch": 0.8134886885524749,
"grad_norm": 0.3773009479045868,
"learning_rate": 2.4453291907365092e-05,
"loss": 0.0188,
"step": 19130
},
{
"epoch": 0.8139139309406361,
"grad_norm": 0.4608099162578583,
"learning_rate": 2.444687156109013e-05,
"loss": 0.02,
"step": 19140
},
{
"epoch": 0.8143391733287975,
"grad_norm": 0.3472055196762085,
"learning_rate": 2.444044834510502e-05,
"loss": 0.0193,
"step": 19150
},
{
"epoch": 0.8147644157169587,
"grad_norm": 0.40071094036102295,
"learning_rate": 2.443402226136096e-05,
"loss": 0.0193,
"step": 19160
},
{
"epoch": 0.8151896581051199,
"grad_norm": 0.4245036542415619,
"learning_rate": 2.442759331181003e-05,
"loss": 0.0162,
"step": 19170
},
{
"epoch": 0.8156149004932812,
"grad_norm": 0.592641294002533,
"learning_rate": 2.4421161498405176e-05,
"loss": 0.0201,
"step": 19180
},
{
"epoch": 0.8160401428814424,
"grad_norm": 0.48311716318130493,
"learning_rate": 2.4414726823100207e-05,
"loss": 0.0191,
"step": 19190
},
{
"epoch": 0.8164653852696037,
"grad_norm": 0.3404284417629242,
"learning_rate": 2.4408289287849813e-05,
"loss": 0.0203,
"step": 19200
},
{
"epoch": 0.8168906276577649,
"grad_norm": 0.3067640960216522,
"learning_rate": 2.4401848894609543e-05,
"loss": 0.0182,
"step": 19210
},
{
"epoch": 0.8173158700459262,
"grad_norm": 0.44467446208000183,
"learning_rate": 2.439540564533582e-05,
"loss": 0.0181,
"step": 19220
},
{
"epoch": 0.8177411124340874,
"grad_norm": 0.3175550401210785,
"learning_rate": 2.4388959541985935e-05,
"loss": 0.0177,
"step": 19230
},
{
"epoch": 0.8181663548222486,
"grad_norm": 0.44434231519699097,
"learning_rate": 2.438251058651804e-05,
"loss": 0.0187,
"step": 19240
},
{
"epoch": 0.81859159721041,
"grad_norm": 0.3861560821533203,
"learning_rate": 2.437605878089116e-05,
"loss": 0.0212,
"step": 19250
},
{
"epoch": 0.8190168395985712,
"grad_norm": 0.3165411353111267,
"learning_rate": 2.4369604127065183e-05,
"loss": 0.0192,
"step": 19260
},
{
"epoch": 0.8194420819867324,
"grad_norm": 0.3824351131916046,
"learning_rate": 2.4363146627000862e-05,
"loss": 0.0168,
"step": 19270
},
{
"epoch": 0.8198673243748937,
"grad_norm": 0.375009149312973,
"learning_rate": 2.435668628265982e-05,
"loss": 0.0212,
"step": 19280
},
{
"epoch": 0.820292566763055,
"grad_norm": 0.2801119387149811,
"learning_rate": 2.4350223096004533e-05,
"loss": 0.0171,
"step": 19290
},
{
"epoch": 0.8207178091512162,
"grad_norm": 0.3751022219657898,
"learning_rate": 2.4343757068998343e-05,
"loss": 0.0188,
"step": 19300
},
{
"epoch": 0.8211430515393775,
"grad_norm": 0.34862837195396423,
"learning_rate": 2.4337288203605465e-05,
"loss": 0.0184,
"step": 19310
},
{
"epoch": 0.8215682939275387,
"grad_norm": 0.3476138710975647,
"learning_rate": 2.433081650179097e-05,
"loss": 0.02,
"step": 19320
},
{
"epoch": 0.8219935363156999,
"grad_norm": 0.36957767605781555,
"learning_rate": 2.4324341965520787e-05,
"loss": 0.0166,
"step": 19330
},
{
"epoch": 0.8224187787038612,
"grad_norm": 0.31499162316322327,
"learning_rate": 2.4317864596761706e-05,
"loss": 0.0177,
"step": 19340
},
{
"epoch": 0.8228440210920225,
"grad_norm": 0.32327380776405334,
"learning_rate": 2.4311384397481387e-05,
"loss": 0.0172,
"step": 19350
},
{
"epoch": 0.8232692634801837,
"grad_norm": 0.27957937121391296,
"learning_rate": 2.4304901369648344e-05,
"loss": 0.0169,
"step": 19360
},
{
"epoch": 0.8236945058683449,
"grad_norm": 0.4002857804298401,
"learning_rate": 2.429841551523194e-05,
"loss": 0.0222,
"step": 19370
},
{
"epoch": 0.8241197482565062,
"grad_norm": 0.43563494086265564,
"learning_rate": 2.4291926836202416e-05,
"loss": 0.019,
"step": 19380
},
{
"epoch": 0.8245449906446675,
"grad_norm": 0.31372612714767456,
"learning_rate": 2.428543533453086e-05,
"loss": 0.0203,
"step": 19390
},
{
"epoch": 0.8249702330328287,
"grad_norm": 0.31944289803504944,
"learning_rate": 2.4278941012189215e-05,
"loss": 0.0195,
"step": 19400
},
{
"epoch": 0.82539547542099,
"grad_norm": 0.28638774156570435,
"learning_rate": 2.427244387115029e-05,
"loss": 0.0191,
"step": 19410
},
{
"epoch": 0.8258207178091512,
"grad_norm": 0.4001684784889221,
"learning_rate": 2.4265943913387738e-05,
"loss": 0.0176,
"step": 19420
},
{
"epoch": 0.8262459601973124,
"grad_norm": 0.34653154015541077,
"learning_rate": 2.425944114087608e-05,
"loss": 0.0169,
"step": 19430
},
{
"epoch": 0.8266712025854738,
"grad_norm": 0.3180333077907562,
"learning_rate": 2.4252935555590684e-05,
"loss": 0.0214,
"step": 19440
},
{
"epoch": 0.827096444973635,
"grad_norm": 0.420258492231369,
"learning_rate": 2.4246427159507772e-05,
"loss": 0.0155,
"step": 19450
},
{
"epoch": 0.8275216873617962,
"grad_norm": 0.30975809693336487,
"learning_rate": 2.4239915954604433e-05,
"loss": 0.0158,
"step": 19460
},
{
"epoch": 0.8279469297499574,
"grad_norm": 0.32593613862991333,
"learning_rate": 2.4233401942858595e-05,
"loss": 0.0161,
"step": 19470
},
{
"epoch": 0.8283721721381188,
"grad_norm": 0.3261292576789856,
"learning_rate": 2.4226885126249033e-05,
"loss": 0.0163,
"step": 19480
},
{
"epoch": 0.82879741452628,
"grad_norm": 0.329802542924881,
"learning_rate": 2.4220365506755394e-05,
"loss": 0.0156,
"step": 19490
},
{
"epoch": 0.8292226569144412,
"grad_norm": 0.36109891533851624,
"learning_rate": 2.4213843086358166e-05,
"loss": 0.0154,
"step": 19500
},
{
"epoch": 0.8296478993026025,
"grad_norm": 0.44082027673721313,
"learning_rate": 2.4207317867038684e-05,
"loss": 0.0206,
"step": 19510
},
{
"epoch": 0.8300731416907637,
"grad_norm": 0.316689133644104,
"learning_rate": 2.4200789850779137e-05,
"loss": 0.0172,
"step": 19520
},
{
"epoch": 0.830498384078925,
"grad_norm": 0.399915486574173,
"learning_rate": 2.419425903956257e-05,
"loss": 0.0188,
"step": 19530
},
{
"epoch": 0.8309236264670863,
"grad_norm": 0.5746642351150513,
"learning_rate": 2.4187725435372865e-05,
"loss": 0.0156,
"step": 19540
},
{
"epoch": 0.8313488688552475,
"grad_norm": 0.6345515847206116,
"learning_rate": 2.418118904019476e-05,
"loss": 0.0182,
"step": 19550
},
{
"epoch": 0.8317741112434087,
"grad_norm": 0.35415080189704895,
"learning_rate": 2.4174649856013842e-05,
"loss": 0.0179,
"step": 19560
},
{
"epoch": 0.83219935363157,
"grad_norm": 0.3247321844100952,
"learning_rate": 2.416810788481654e-05,
"loss": 0.0176,
"step": 19570
},
{
"epoch": 0.8326245960197313,
"grad_norm": 0.41374531388282776,
"learning_rate": 2.416156312859013e-05,
"loss": 0.017,
"step": 19580
},
{
"epoch": 0.8330498384078925,
"grad_norm": 0.3736271858215332,
"learning_rate": 2.4155015589322742e-05,
"loss": 0.0168,
"step": 19590
},
{
"epoch": 0.8334750807960537,
"grad_norm": 0.4117409288883209,
"learning_rate": 2.4148465269003338e-05,
"loss": 0.0189,
"step": 19600
},
{
"epoch": 0.833900323184215,
"grad_norm": 0.32814255356788635,
"learning_rate": 2.4141912169621742e-05,
"loss": 0.018,
"step": 19610
},
{
"epoch": 0.8343255655723762,
"grad_norm": 0.34247809648513794,
"learning_rate": 2.4135356293168602e-05,
"loss": 0.0176,
"step": 19620
},
{
"epoch": 0.8347508079605375,
"grad_norm": 0.317573606967926,
"learning_rate": 2.4128797641635427e-05,
"loss": 0.0178,
"step": 19630
},
{
"epoch": 0.8351760503486988,
"grad_norm": 0.3367726802825928,
"learning_rate": 2.412223621701456e-05,
"loss": 0.0148,
"step": 19640
},
{
"epoch": 0.83560129273686,
"grad_norm": 0.34025198221206665,
"learning_rate": 2.4115672021299193e-05,
"loss": 0.0175,
"step": 19650
},
{
"epoch": 0.8360265351250212,
"grad_norm": 0.32034459710121155,
"learning_rate": 2.4109105056483345e-05,
"loss": 0.0209,
"step": 19660
},
{
"epoch": 0.8364517775131826,
"grad_norm": 0.350245863199234,
"learning_rate": 2.4102535324561898e-05,
"loss": 0.0208,
"step": 19670
},
{
"epoch": 0.8368770199013438,
"grad_norm": 0.4573119878768921,
"learning_rate": 2.4095962827530556e-05,
"loss": 0.0172,
"step": 19680
},
{
"epoch": 0.837302262289505,
"grad_norm": 0.38064107298851013,
"learning_rate": 2.408938756738587e-05,
"loss": 0.0166,
"step": 19690
},
{
"epoch": 0.8377275046776663,
"grad_norm": 0.3633199632167816,
"learning_rate": 2.4082809546125235e-05,
"loss": 0.0189,
"step": 19700
},
{
"epoch": 0.8381527470658275,
"grad_norm": 0.5364571809768677,
"learning_rate": 2.4076228765746876e-05,
"loss": 0.0171,
"step": 19710
},
{
"epoch": 0.8385779894539888,
"grad_norm": 0.27998796105384827,
"learning_rate": 2.4069645228249864e-05,
"loss": 0.0164,
"step": 19720
},
{
"epoch": 0.83900323184215,
"grad_norm": 0.33518269658088684,
"learning_rate": 2.40630589356341e-05,
"loss": 0.0175,
"step": 19730
},
{
"epoch": 0.8394284742303113,
"grad_norm": 0.3925931453704834,
"learning_rate": 2.4056469889900327e-05,
"loss": 0.0183,
"step": 19740
},
{
"epoch": 0.8398537166184725,
"grad_norm": 0.2806520164012909,
"learning_rate": 2.404987809305012e-05,
"loss": 0.0179,
"step": 19750
},
{
"epoch": 0.8402789590066337,
"grad_norm": 0.26155996322631836,
"learning_rate": 2.4043283547085903e-05,
"loss": 0.0206,
"step": 19760
},
{
"epoch": 0.8407042013947951,
"grad_norm": 0.3690318763256073,
"learning_rate": 2.403668625401092e-05,
"loss": 0.0182,
"step": 19770
},
{
"epoch": 0.8411294437829563,
"grad_norm": 0.38202929496765137,
"learning_rate": 2.4030086215829247e-05,
"loss": 0.0154,
"step": 19780
},
{
"epoch": 0.8415546861711175,
"grad_norm": 0.3304462432861328,
"learning_rate": 2.4023483434545806e-05,
"loss": 0.019,
"step": 19790
},
{
"epoch": 0.8419799285592788,
"grad_norm": 0.32554149627685547,
"learning_rate": 2.4016877912166356e-05,
"loss": 0.018,
"step": 19800
},
{
"epoch": 0.84240517094744,
"grad_norm": 0.3655115067958832,
"learning_rate": 2.401026965069747e-05,
"loss": 0.0188,
"step": 19810
},
{
"epoch": 0.8428304133356013,
"grad_norm": 0.3540496528148651,
"learning_rate": 2.4003658652146564e-05,
"loss": 0.0172,
"step": 19820
},
{
"epoch": 0.8432556557237626,
"grad_norm": 0.2831367552280426,
"learning_rate": 2.3997044918521896e-05,
"loss": 0.0181,
"step": 19830
},
{
"epoch": 0.8436808981119238,
"grad_norm": 0.3655035197734833,
"learning_rate": 2.3990428451832524e-05,
"loss": 0.0172,
"step": 19840
},
{
"epoch": 0.844106140500085,
"grad_norm": 0.266615092754364,
"learning_rate": 2.398380925408837e-05,
"loss": 0.0158,
"step": 19850
},
{
"epoch": 0.8445313828882463,
"grad_norm": 0.37141183018684387,
"learning_rate": 2.3977187327300174e-05,
"loss": 0.0189,
"step": 19860
},
{
"epoch": 0.8449566252764076,
"grad_norm": 0.43702635169029236,
"learning_rate": 2.397056267347949e-05,
"loss": 0.0186,
"step": 19870
},
{
"epoch": 0.8453818676645688,
"grad_norm": 0.42261597514152527,
"learning_rate": 2.3963935294638725e-05,
"loss": 0.0177,
"step": 19880
},
{
"epoch": 0.84580711005273,
"grad_norm": 0.3517419993877411,
"learning_rate": 2.395730519279109e-05,
"loss": 0.017,
"step": 19890
},
{
"epoch": 0.8462323524408913,
"grad_norm": 0.303210973739624,
"learning_rate": 2.3950672369950646e-05,
"loss": 0.0177,
"step": 19900
},
{
"epoch": 0.8466575948290526,
"grad_norm": 0.3902907073497772,
"learning_rate": 2.3944036828132266e-05,
"loss": 0.0199,
"step": 19910
},
{
"epoch": 0.8470828372172138,
"grad_norm": 0.28750261664390564,
"learning_rate": 2.3937398569351647e-05,
"loss": 0.0186,
"step": 19920
},
{
"epoch": 0.8475080796053751,
"grad_norm": 0.3395385146141052,
"learning_rate": 2.3930757595625326e-05,
"loss": 0.0221,
"step": 19930
},
{
"epoch": 0.8479333219935363,
"grad_norm": 0.3717360198497772,
"learning_rate": 2.3924113908970645e-05,
"loss": 0.0183,
"step": 19940
},
{
"epoch": 0.8483585643816975,
"grad_norm": 0.39457517862319946,
"learning_rate": 2.391746751140579e-05,
"loss": 0.0175,
"step": 19950
},
{
"epoch": 0.8487838067698589,
"grad_norm": 0.5388570427894592,
"learning_rate": 2.3910818404949756e-05,
"loss": 0.0206,
"step": 19960
},
{
"epoch": 0.8492090491580201,
"grad_norm": 0.36116811633110046,
"learning_rate": 2.3904166591622367e-05,
"loss": 0.0163,
"step": 19970
},
{
"epoch": 0.8496342915461813,
"grad_norm": 0.42637062072753906,
"learning_rate": 2.3897512073444267e-05,
"loss": 0.0177,
"step": 19980
},
{
"epoch": 0.8500595339343425,
"grad_norm": 0.32587724924087524,
"learning_rate": 2.389085485243693e-05,
"loss": 0.0181,
"step": 19990
},
{
"epoch": 0.8504847763225039,
"grad_norm": 0.3714869022369385,
"learning_rate": 2.3884194930622632e-05,
"loss": 0.018,
"step": 20000
}
],
"logging_steps": 10,
"max_steps": 60000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}