1b_distill_width_prune / trainer_state.json
friendshipkim's picture
Model save
9a0f1ca verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08267797093601252,
"eval_steps": 10000,
"global_step": 20001,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.133691862207516e-06,
"grad_norm": 71.12261039322922,
"learning_rate": 8.264462809917355e-09,
"loss": 7.6413,
"step": 1
},
{
"epoch": 4.1336918622075155e-05,
"grad_norm": 67.75271488374239,
"learning_rate": 8.264462809917357e-08,
"loss": 7.6837,
"step": 10
},
{
"epoch": 8.267383724415031e-05,
"grad_norm": 68.70872130093693,
"learning_rate": 1.6528925619834713e-07,
"loss": 7.6852,
"step": 20
},
{
"epoch": 0.00012401075586622546,
"grad_norm": 62.98947446015298,
"learning_rate": 2.4793388429752067e-07,
"loss": 7.6376,
"step": 30
},
{
"epoch": 0.00016534767448830062,
"grad_norm": 54.98248131444704,
"learning_rate": 3.3057851239669426e-07,
"loss": 7.4517,
"step": 40
},
{
"epoch": 0.00020668459311037578,
"grad_norm": 42.7313362524158,
"learning_rate": 4.132231404958678e-07,
"loss": 7.1676,
"step": 50
},
{
"epoch": 0.0002480215117324509,
"grad_norm": 38.42250005234289,
"learning_rate": 4.958677685950413e-07,
"loss": 6.7673,
"step": 60
},
{
"epoch": 0.0002893584303545261,
"grad_norm": 26.17988841617355,
"learning_rate": 5.78512396694215e-07,
"loss": 6.1421,
"step": 70
},
{
"epoch": 0.00033069534897660124,
"grad_norm": 20.709699960482357,
"learning_rate": 6.611570247933885e-07,
"loss": 5.7812,
"step": 80
},
{
"epoch": 0.0003720322675986764,
"grad_norm": 18.83787744791924,
"learning_rate": 7.438016528925621e-07,
"loss": 5.4004,
"step": 90
},
{
"epoch": 0.00041336918622075157,
"grad_norm": 19.12503087164104,
"learning_rate": 8.264462809917356e-07,
"loss": 5.0367,
"step": 100
},
{
"epoch": 0.0004547061048428267,
"grad_norm": 14.34978763505406,
"learning_rate": 9.090909090909091e-07,
"loss": 4.7081,
"step": 110
},
{
"epoch": 0.0004960430234649018,
"grad_norm": 13.492660464865253,
"learning_rate": 9.917355371900827e-07,
"loss": 4.5718,
"step": 120
},
{
"epoch": 0.000537379942086977,
"grad_norm": 14.420928714044905,
"learning_rate": 1.0743801652892562e-06,
"loss": 4.4218,
"step": 130
},
{
"epoch": 0.0005787168607090522,
"grad_norm": 12.495448321891903,
"learning_rate": 1.15702479338843e-06,
"loss": 4.166,
"step": 140
},
{
"epoch": 0.0006200537793311273,
"grad_norm": 9.801331301009261,
"learning_rate": 1.2396694214876035e-06,
"loss": 4.0983,
"step": 150
},
{
"epoch": 0.0006613906979532025,
"grad_norm": 11.207849241885746,
"learning_rate": 1.322314049586777e-06,
"loss": 3.8569,
"step": 160
},
{
"epoch": 0.0007027276165752776,
"grad_norm": 9.679145248918651,
"learning_rate": 1.4049586776859506e-06,
"loss": 3.8265,
"step": 170
},
{
"epoch": 0.0007440645351973528,
"grad_norm": 11.74286989793758,
"learning_rate": 1.4876033057851241e-06,
"loss": 3.7614,
"step": 180
},
{
"epoch": 0.0007854014538194279,
"grad_norm": 12.757634031418831,
"learning_rate": 1.5702479338842977e-06,
"loss": 3.7239,
"step": 190
},
{
"epoch": 0.0008267383724415031,
"grad_norm": 10.441179098409538,
"learning_rate": 1.6528925619834712e-06,
"loss": 3.5734,
"step": 200
},
{
"epoch": 0.0008680752910635782,
"grad_norm": 9.812592859538713,
"learning_rate": 1.7355371900826448e-06,
"loss": 3.567,
"step": 210
},
{
"epoch": 0.0009094122096856533,
"grad_norm": 13.93844427924458,
"learning_rate": 1.8181818181818183e-06,
"loss": 3.4882,
"step": 220
},
{
"epoch": 0.0009507491283077286,
"grad_norm": 10.098507514325146,
"learning_rate": 1.900826446280992e-06,
"loss": 3.4818,
"step": 230
},
{
"epoch": 0.0009920860469298037,
"grad_norm": 9.58855528934084,
"learning_rate": 1.9834710743801654e-06,
"loss": 3.4219,
"step": 240
},
{
"epoch": 0.0010334229655518789,
"grad_norm": 10.682635147410519,
"learning_rate": 2.066115702479339e-06,
"loss": 3.4406,
"step": 250
},
{
"epoch": 0.001074759884173954,
"grad_norm": 8.223333896011393,
"learning_rate": 2.1487603305785124e-06,
"loss": 3.3402,
"step": 260
},
{
"epoch": 0.001116096802796029,
"grad_norm": 8.984512948693363,
"learning_rate": 2.231404958677686e-06,
"loss": 3.3059,
"step": 270
},
{
"epoch": 0.0011574337214181043,
"grad_norm": 9.341193028741104,
"learning_rate": 2.31404958677686e-06,
"loss": 3.1823,
"step": 280
},
{
"epoch": 0.0011987706400401795,
"grad_norm": 10.3056509420508,
"learning_rate": 2.3966942148760335e-06,
"loss": 3.2817,
"step": 290
},
{
"epoch": 0.0012401075586622545,
"grad_norm": 6.768688690699956,
"learning_rate": 2.479338842975207e-06,
"loss": 3.1747,
"step": 300
},
{
"epoch": 0.0012814444772843298,
"grad_norm": 6.809665393368199,
"learning_rate": 2.56198347107438e-06,
"loss": 3.1865,
"step": 310
},
{
"epoch": 0.001322781395906405,
"grad_norm": 6.996784607657393,
"learning_rate": 2.644628099173554e-06,
"loss": 3.1331,
"step": 320
},
{
"epoch": 0.0013641183145284802,
"grad_norm": 9.752811655395744,
"learning_rate": 2.7272727272727272e-06,
"loss": 3.1656,
"step": 330
},
{
"epoch": 0.0014054552331505552,
"grad_norm": 8.381106520516422,
"learning_rate": 2.809917355371901e-06,
"loss": 3.1045,
"step": 340
},
{
"epoch": 0.0014467921517726304,
"grad_norm": 9.529557732528744,
"learning_rate": 2.8925619834710743e-06,
"loss": 3.0388,
"step": 350
},
{
"epoch": 0.0014881290703947056,
"grad_norm": 7.141802881599084,
"learning_rate": 2.9752066115702483e-06,
"loss": 3.1119,
"step": 360
},
{
"epoch": 0.0015294659890167806,
"grad_norm": 8.784456968597924,
"learning_rate": 3.0578512396694214e-06,
"loss": 2.9281,
"step": 370
},
{
"epoch": 0.0015708029076388558,
"grad_norm": 7.19486645716449,
"learning_rate": 3.1404958677685953e-06,
"loss": 2.9817,
"step": 380
},
{
"epoch": 0.001612139826260931,
"grad_norm": 8.694417122337162,
"learning_rate": 3.2231404958677685e-06,
"loss": 3.0107,
"step": 390
},
{
"epoch": 0.0016534767448830063,
"grad_norm": 8.323745687899638,
"learning_rate": 3.3057851239669424e-06,
"loss": 3.0367,
"step": 400
},
{
"epoch": 0.0016948136635050813,
"grad_norm": 7.289283064256704,
"learning_rate": 3.388429752066116e-06,
"loss": 2.9071,
"step": 410
},
{
"epoch": 0.0017361505821271565,
"grad_norm": 8.436424184415285,
"learning_rate": 3.4710743801652895e-06,
"loss": 2.8761,
"step": 420
},
{
"epoch": 0.0017774875007492317,
"grad_norm": 8.284395953073583,
"learning_rate": 3.553719008264463e-06,
"loss": 2.8742,
"step": 430
},
{
"epoch": 0.0018188244193713067,
"grad_norm": 7.765622556565918,
"learning_rate": 3.6363636363636366e-06,
"loss": 2.9028,
"step": 440
},
{
"epoch": 0.001860161337993382,
"grad_norm": 8.06677494412333,
"learning_rate": 3.71900826446281e-06,
"loss": 2.885,
"step": 450
},
{
"epoch": 0.0019014982566154571,
"grad_norm": 6.847325183046608,
"learning_rate": 3.801652892561984e-06,
"loss": 2.8491,
"step": 460
},
{
"epoch": 0.0019428351752375323,
"grad_norm": 6.363354378607742,
"learning_rate": 3.884297520661157e-06,
"loss": 2.8626,
"step": 470
},
{
"epoch": 0.0019841720938596073,
"grad_norm": 6.752545926601506,
"learning_rate": 3.966942148760331e-06,
"loss": 2.8759,
"step": 480
},
{
"epoch": 0.0020255090124816828,
"grad_norm": 6.408269470963144,
"learning_rate": 4.049586776859504e-06,
"loss": 2.8132,
"step": 490
},
{
"epoch": 0.0020668459311037578,
"grad_norm": 10.174350278228932,
"learning_rate": 4.132231404958678e-06,
"loss": 2.7726,
"step": 500
},
{
"epoch": 0.0021081828497258328,
"grad_norm": 7.3668788002558045,
"learning_rate": 4.214876033057851e-06,
"loss": 2.781,
"step": 510
},
{
"epoch": 0.002149519768347908,
"grad_norm": 6.065779568736328,
"learning_rate": 4.297520661157025e-06,
"loss": 2.7566,
"step": 520
},
{
"epoch": 0.002190856686969983,
"grad_norm": 7.249820467027506,
"learning_rate": 4.3801652892561984e-06,
"loss": 2.7745,
"step": 530
},
{
"epoch": 0.002232193605592058,
"grad_norm": 6.6147066822580305,
"learning_rate": 4.462809917355372e-06,
"loss": 2.7287,
"step": 540
},
{
"epoch": 0.0022735305242141336,
"grad_norm": 6.730411628057589,
"learning_rate": 4.5454545454545455e-06,
"loss": 2.7263,
"step": 550
},
{
"epoch": 0.0023148674428362086,
"grad_norm": 5.90842778355055,
"learning_rate": 4.62809917355372e-06,
"loss": 2.7113,
"step": 560
},
{
"epoch": 0.0023562043614582836,
"grad_norm": 5.290690112865889,
"learning_rate": 4.710743801652893e-06,
"loss": 2.7407,
"step": 570
},
{
"epoch": 0.002397541280080359,
"grad_norm": 6.913494491726395,
"learning_rate": 4.793388429752067e-06,
"loss": 2.7089,
"step": 580
},
{
"epoch": 0.002438878198702434,
"grad_norm": 6.999126848562726,
"learning_rate": 4.87603305785124e-06,
"loss": 2.7074,
"step": 590
},
{
"epoch": 0.002480215117324509,
"grad_norm": 7.325247968940623,
"learning_rate": 4.958677685950414e-06,
"loss": 2.6561,
"step": 600
},
{
"epoch": 0.0025215520359465845,
"grad_norm": 5.841708656820878,
"learning_rate": 5.041322314049587e-06,
"loss": 2.5882,
"step": 610
},
{
"epoch": 0.0025628889545686595,
"grad_norm": 6.0353216317971725,
"learning_rate": 5.12396694214876e-06,
"loss": 2.6469,
"step": 620
},
{
"epoch": 0.002604225873190735,
"grad_norm": 7.544181254798358,
"learning_rate": 5.206611570247935e-06,
"loss": 2.6267,
"step": 630
},
{
"epoch": 0.00264556279181281,
"grad_norm": 6.608459353317291,
"learning_rate": 5.289256198347108e-06,
"loss": 2.5662,
"step": 640
},
{
"epoch": 0.002686899710434885,
"grad_norm": 6.839416904552874,
"learning_rate": 5.371900826446281e-06,
"loss": 2.6888,
"step": 650
},
{
"epoch": 0.0027282366290569604,
"grad_norm": 6.668329679339745,
"learning_rate": 5.4545454545454545e-06,
"loss": 2.5642,
"step": 660
},
{
"epoch": 0.0027695735476790354,
"grad_norm": 7.132958283503685,
"learning_rate": 5.537190082644629e-06,
"loss": 2.5651,
"step": 670
},
{
"epoch": 0.0028109104663011104,
"grad_norm": 6.277307177686086,
"learning_rate": 5.619834710743802e-06,
"loss": 2.5659,
"step": 680
},
{
"epoch": 0.002852247384923186,
"grad_norm": 6.128689798291957,
"learning_rate": 5.702479338842976e-06,
"loss": 2.5826,
"step": 690
},
{
"epoch": 0.002893584303545261,
"grad_norm": 6.5950769294424125,
"learning_rate": 5.785123966942149e-06,
"loss": 2.5845,
"step": 700
},
{
"epoch": 0.002934921222167336,
"grad_norm": 6.419190212095196,
"learning_rate": 5.867768595041323e-06,
"loss": 2.5336,
"step": 710
},
{
"epoch": 0.0029762581407894112,
"grad_norm": 8.4242870632546,
"learning_rate": 5.9504132231404965e-06,
"loss": 2.5085,
"step": 720
},
{
"epoch": 0.0030175950594114862,
"grad_norm": 7.690590337257814,
"learning_rate": 6.03305785123967e-06,
"loss": 2.6229,
"step": 730
},
{
"epoch": 0.0030589319780335612,
"grad_norm": 6.501607766316929,
"learning_rate": 6.115702479338843e-06,
"loss": 2.5214,
"step": 740
},
{
"epoch": 0.0031002688966556367,
"grad_norm": 6.318891494759645,
"learning_rate": 6.198347107438017e-06,
"loss": 2.5001,
"step": 750
},
{
"epoch": 0.0031416058152777117,
"grad_norm": 6.549087764929742,
"learning_rate": 6.280991735537191e-06,
"loss": 2.4692,
"step": 760
},
{
"epoch": 0.003182942733899787,
"grad_norm": 6.139353182718512,
"learning_rate": 6.363636363636364e-06,
"loss": 2.4862,
"step": 770
},
{
"epoch": 0.003224279652521862,
"grad_norm": 6.927572304151442,
"learning_rate": 6.446280991735537e-06,
"loss": 2.5114,
"step": 780
},
{
"epoch": 0.003265616571143937,
"grad_norm": 6.193127375797419,
"learning_rate": 6.528925619834712e-06,
"loss": 2.5163,
"step": 790
},
{
"epoch": 0.0033069534897660125,
"grad_norm": 6.624590490134376,
"learning_rate": 6.611570247933885e-06,
"loss": 2.4314,
"step": 800
},
{
"epoch": 0.0033482904083880875,
"grad_norm": 6.821779483779539,
"learning_rate": 6.694214876033058e-06,
"loss": 2.5099,
"step": 810
},
{
"epoch": 0.0033896273270101625,
"grad_norm": 7.545681159342933,
"learning_rate": 6.776859504132232e-06,
"loss": 2.431,
"step": 820
},
{
"epoch": 0.003430964245632238,
"grad_norm": 7.4575512413535785,
"learning_rate": 6.859504132231406e-06,
"loss": 2.5166,
"step": 830
},
{
"epoch": 0.003472301164254313,
"grad_norm": 5.330361729769768,
"learning_rate": 6.942148760330579e-06,
"loss": 2.4147,
"step": 840
},
{
"epoch": 0.003513638082876388,
"grad_norm": 9.642946599111673,
"learning_rate": 7.0247933884297525e-06,
"loss": 2.416,
"step": 850
},
{
"epoch": 0.0035549750014984634,
"grad_norm": 5.81243704909108,
"learning_rate": 7.107438016528926e-06,
"loss": 2.4712,
"step": 860
},
{
"epoch": 0.0035963119201205384,
"grad_norm": 8.181150357128717,
"learning_rate": 7.1900826446281005e-06,
"loss": 2.4275,
"step": 870
},
{
"epoch": 0.0036376488387426134,
"grad_norm": 6.783789830926592,
"learning_rate": 7.272727272727273e-06,
"loss": 2.4283,
"step": 880
},
{
"epoch": 0.003678985757364689,
"grad_norm": 6.537482232422147,
"learning_rate": 7.355371900826447e-06,
"loss": 2.4064,
"step": 890
},
{
"epoch": 0.003720322675986764,
"grad_norm": 5.51502652262759,
"learning_rate": 7.43801652892562e-06,
"loss": 2.3804,
"step": 900
},
{
"epoch": 0.0037616595946088393,
"grad_norm": 5.643663413025215,
"learning_rate": 7.520661157024795e-06,
"loss": 2.4225,
"step": 910
},
{
"epoch": 0.0038029965132309143,
"grad_norm": 5.698077553184173,
"learning_rate": 7.603305785123968e-06,
"loss": 2.3767,
"step": 920
},
{
"epoch": 0.0038443334318529893,
"grad_norm": 7.7844289388382695,
"learning_rate": 7.685950413223142e-06,
"loss": 2.3515,
"step": 930
},
{
"epoch": 0.0038856703504750647,
"grad_norm": 7.037093549799256,
"learning_rate": 7.768595041322314e-06,
"loss": 2.34,
"step": 940
},
{
"epoch": 0.00392700726909714,
"grad_norm": 6.080619495201754,
"learning_rate": 7.851239669421489e-06,
"loss": 2.3174,
"step": 950
},
{
"epoch": 0.003968344187719215,
"grad_norm": 5.880580728396556,
"learning_rate": 7.933884297520661e-06,
"loss": 2.3706,
"step": 960
},
{
"epoch": 0.00400968110634129,
"grad_norm": 5.661072675024262,
"learning_rate": 8.016528925619836e-06,
"loss": 2.3481,
"step": 970
},
{
"epoch": 0.0040510180249633656,
"grad_norm": 7.201611884034939,
"learning_rate": 8.099173553719009e-06,
"loss": 2.3667,
"step": 980
},
{
"epoch": 0.00409235494358544,
"grad_norm": 6.275874199218544,
"learning_rate": 8.181818181818183e-06,
"loss": 2.3169,
"step": 990
},
{
"epoch": 0.0041336918622075156,
"grad_norm": 5.267583094894874,
"learning_rate": 8.264462809917356e-06,
"loss": 2.3757,
"step": 1000
},
{
"epoch": 0.004175028780829591,
"grad_norm": 5.377757889968936,
"learning_rate": 8.34710743801653e-06,
"loss": 2.3631,
"step": 1010
},
{
"epoch": 0.0042163656994516656,
"grad_norm": 6.0201100161095225,
"learning_rate": 8.429752066115703e-06,
"loss": 2.2818,
"step": 1020
},
{
"epoch": 0.004257702618073741,
"grad_norm": 6.579057670565248,
"learning_rate": 8.512396694214877e-06,
"loss": 2.3313,
"step": 1030
},
{
"epoch": 0.004299039536695816,
"grad_norm": 7.27660719754988,
"learning_rate": 8.59504132231405e-06,
"loss": 2.3288,
"step": 1040
},
{
"epoch": 0.004340376455317891,
"grad_norm": 6.144262724026651,
"learning_rate": 8.677685950413224e-06,
"loss": 2.2981,
"step": 1050
},
{
"epoch": 0.004381713373939966,
"grad_norm": 5.672033241927713,
"learning_rate": 8.760330578512397e-06,
"loss": 2.3059,
"step": 1060
},
{
"epoch": 0.004423050292562042,
"grad_norm": 6.00241597585073,
"learning_rate": 8.842975206611571e-06,
"loss": 2.389,
"step": 1070
},
{
"epoch": 0.004464387211184116,
"grad_norm": 5.649027277167286,
"learning_rate": 8.925619834710744e-06,
"loss": 2.3371,
"step": 1080
},
{
"epoch": 0.004505724129806192,
"grad_norm": 5.927371810838215,
"learning_rate": 9.008264462809918e-06,
"loss": 2.3133,
"step": 1090
},
{
"epoch": 0.004547061048428267,
"grad_norm": 5.662885075294936,
"learning_rate": 9.090909090909091e-06,
"loss": 2.2779,
"step": 1100
},
{
"epoch": 0.004588397967050342,
"grad_norm": 5.654705250045512,
"learning_rate": 9.173553719008265e-06,
"loss": 2.2234,
"step": 1110
},
{
"epoch": 0.004629734885672417,
"grad_norm": 6.241923587474726,
"learning_rate": 9.25619834710744e-06,
"loss": 2.2626,
"step": 1120
},
{
"epoch": 0.004671071804294493,
"grad_norm": 5.741893435201511,
"learning_rate": 9.338842975206613e-06,
"loss": 2.3012,
"step": 1130
},
{
"epoch": 0.004712408722916567,
"grad_norm": 6.034507786920065,
"learning_rate": 9.421487603305785e-06,
"loss": 2.2682,
"step": 1140
},
{
"epoch": 0.004753745641538643,
"grad_norm": 7.410349347874194,
"learning_rate": 9.50413223140496e-06,
"loss": 2.2796,
"step": 1150
},
{
"epoch": 0.004795082560160718,
"grad_norm": 5.992102424922784,
"learning_rate": 9.586776859504134e-06,
"loss": 2.2112,
"step": 1160
},
{
"epoch": 0.004836419478782793,
"grad_norm": 5.453311998034154,
"learning_rate": 9.669421487603307e-06,
"loss": 2.1744,
"step": 1170
},
{
"epoch": 0.004877756397404868,
"grad_norm": 5.521988823358632,
"learning_rate": 9.75206611570248e-06,
"loss": 2.2984,
"step": 1180
},
{
"epoch": 0.004919093316026944,
"grad_norm": 6.153893937530345,
"learning_rate": 9.834710743801654e-06,
"loss": 2.2443,
"step": 1190
},
{
"epoch": 0.004960430234649018,
"grad_norm": 6.5507135206490315,
"learning_rate": 9.917355371900828e-06,
"loss": 2.245,
"step": 1200
},
{
"epoch": 0.005001767153271094,
"grad_norm": 8.209761096607327,
"learning_rate": 1e-05,
"loss": 2.1959,
"step": 1210
},
{
"epoch": 0.005043104071893169,
"grad_norm": 4.986264712575914,
"learning_rate": 1.0082644628099174e-05,
"loss": 2.1612,
"step": 1220
},
{
"epoch": 0.0050844409905152444,
"grad_norm": 6.3381969120868895,
"learning_rate": 1.0165289256198348e-05,
"loss": 2.187,
"step": 1230
},
{
"epoch": 0.005125777909137319,
"grad_norm": 5.750067641203542,
"learning_rate": 1.024793388429752e-05,
"loss": 2.2004,
"step": 1240
},
{
"epoch": 0.0051671148277593944,
"grad_norm": 5.826539821613237,
"learning_rate": 1.0330578512396693e-05,
"loss": 2.1668,
"step": 1250
},
{
"epoch": 0.00520845174638147,
"grad_norm": 6.296936925085496,
"learning_rate": 1.041322314049587e-05,
"loss": 2.1807,
"step": 1260
},
{
"epoch": 0.0052497886650035444,
"grad_norm": 5.812866932063289,
"learning_rate": 1.0495867768595042e-05,
"loss": 2.209,
"step": 1270
},
{
"epoch": 0.00529112558362562,
"grad_norm": 5.808144224407848,
"learning_rate": 1.0578512396694216e-05,
"loss": 2.1807,
"step": 1280
},
{
"epoch": 0.005332462502247695,
"grad_norm": 7.460856083590218,
"learning_rate": 1.0661157024793389e-05,
"loss": 2.2229,
"step": 1290
},
{
"epoch": 0.00537379942086977,
"grad_norm": 6.980089322389665,
"learning_rate": 1.0743801652892562e-05,
"loss": 2.175,
"step": 1300
},
{
"epoch": 0.005415136339491845,
"grad_norm": 5.57740557557049,
"learning_rate": 1.0826446280991736e-05,
"loss": 2.159,
"step": 1310
},
{
"epoch": 0.005456473258113921,
"grad_norm": 6.647266714783434,
"learning_rate": 1.0909090909090909e-05,
"loss": 2.1214,
"step": 1320
},
{
"epoch": 0.005497810176735995,
"grad_norm": 6.128334205497799,
"learning_rate": 1.0991735537190083e-05,
"loss": 2.1792,
"step": 1330
},
{
"epoch": 0.005539147095358071,
"grad_norm": 6.483094766646449,
"learning_rate": 1.1074380165289258e-05,
"loss": 2.2472,
"step": 1340
},
{
"epoch": 0.005580484013980146,
"grad_norm": 5.359049945838656,
"learning_rate": 1.1157024793388432e-05,
"loss": 2.182,
"step": 1350
},
{
"epoch": 0.005621820932602221,
"grad_norm": 6.6553587609192695,
"learning_rate": 1.1239669421487605e-05,
"loss": 2.1918,
"step": 1360
},
{
"epoch": 0.005663157851224296,
"grad_norm": 6.105297642757683,
"learning_rate": 1.1322314049586777e-05,
"loss": 2.1221,
"step": 1370
},
{
"epoch": 0.005704494769846372,
"grad_norm": 5.22407946250878,
"learning_rate": 1.1404958677685952e-05,
"loss": 2.0898,
"step": 1380
},
{
"epoch": 0.005745831688468446,
"grad_norm": 5.695260375861287,
"learning_rate": 1.1487603305785125e-05,
"loss": 2.2,
"step": 1390
},
{
"epoch": 0.005787168607090522,
"grad_norm": 5.834677547053157,
"learning_rate": 1.1570247933884297e-05,
"loss": 2.1191,
"step": 1400
},
{
"epoch": 0.005828505525712597,
"grad_norm": 7.863484441598645,
"learning_rate": 1.1652892561983472e-05,
"loss": 2.1255,
"step": 1410
},
{
"epoch": 0.005869842444334672,
"grad_norm": 5.295752440326079,
"learning_rate": 1.1735537190082646e-05,
"loss": 2.1535,
"step": 1420
},
{
"epoch": 0.005911179362956747,
"grad_norm": 6.925687761354192,
"learning_rate": 1.181818181818182e-05,
"loss": 2.0618,
"step": 1430
},
{
"epoch": 0.0059525162815788225,
"grad_norm": 4.89230568395151,
"learning_rate": 1.1900826446280993e-05,
"loss": 2.1745,
"step": 1440
},
{
"epoch": 0.005993853200200897,
"grad_norm": 5.795044632849597,
"learning_rate": 1.1983471074380166e-05,
"loss": 2.1352,
"step": 1450
},
{
"epoch": 0.0060351901188229725,
"grad_norm": 6.43513153980254,
"learning_rate": 1.206611570247934e-05,
"loss": 2.102,
"step": 1460
},
{
"epoch": 0.006076527037445048,
"grad_norm": 6.46737354415826,
"learning_rate": 1.2148760330578513e-05,
"loss": 2.0934,
"step": 1470
},
{
"epoch": 0.0061178639560671225,
"grad_norm": 6.202005592277405,
"learning_rate": 1.2231404958677686e-05,
"loss": 2.1482,
"step": 1480
},
{
"epoch": 0.006159200874689198,
"grad_norm": 5.8071883971926725,
"learning_rate": 1.231404958677686e-05,
"loss": 2.0553,
"step": 1490
},
{
"epoch": 0.006200537793311273,
"grad_norm": 6.2092050955251334,
"learning_rate": 1.2396694214876034e-05,
"loss": 2.0836,
"step": 1500
},
{
"epoch": 0.006241874711933349,
"grad_norm": 4.486772138898485,
"learning_rate": 1.2479338842975209e-05,
"loss": 2.1622,
"step": 1510
},
{
"epoch": 0.006283211630555423,
"grad_norm": 5.229981562060858,
"learning_rate": 1.2561983471074381e-05,
"loss": 2.0854,
"step": 1520
},
{
"epoch": 0.006324548549177499,
"grad_norm": 5.604269574061805,
"learning_rate": 1.2644628099173554e-05,
"loss": 2.1604,
"step": 1530
},
{
"epoch": 0.006365885467799574,
"grad_norm": 5.361170550183367,
"learning_rate": 1.2727272727272728e-05,
"loss": 2.063,
"step": 1540
},
{
"epoch": 0.006407222386421649,
"grad_norm": 7.20018526125173,
"learning_rate": 1.2809917355371901e-05,
"loss": 2.0935,
"step": 1550
},
{
"epoch": 0.006448559305043724,
"grad_norm": 5.379224740428811,
"learning_rate": 1.2892561983471074e-05,
"loss": 2.1291,
"step": 1560
},
{
"epoch": 0.0064898962236658,
"grad_norm": 4.967695885199312,
"learning_rate": 1.2975206611570248e-05,
"loss": 2.102,
"step": 1570
},
{
"epoch": 0.006531233142287874,
"grad_norm": 6.039676530986522,
"learning_rate": 1.3057851239669424e-05,
"loss": 2.1134,
"step": 1580
},
{
"epoch": 0.00657257006090995,
"grad_norm": 5.459189111144024,
"learning_rate": 1.3140495867768597e-05,
"loss": 2.0302,
"step": 1590
},
{
"epoch": 0.006613906979532025,
"grad_norm": 6.143839950859222,
"learning_rate": 1.322314049586777e-05,
"loss": 2.0978,
"step": 1600
},
{
"epoch": 0.0066552438981541,
"grad_norm": 5.6756704061902825,
"learning_rate": 1.3305785123966944e-05,
"loss": 2.1592,
"step": 1610
},
{
"epoch": 0.006696580816776175,
"grad_norm": 4.994981957965064,
"learning_rate": 1.3388429752066117e-05,
"loss": 2.1035,
"step": 1620
},
{
"epoch": 0.0067379177353982505,
"grad_norm": 5.853251459350967,
"learning_rate": 1.347107438016529e-05,
"loss": 2.0701,
"step": 1630
},
{
"epoch": 0.006779254654020325,
"grad_norm": 5.4681573607696965,
"learning_rate": 1.3553719008264464e-05,
"loss": 2.089,
"step": 1640
},
{
"epoch": 0.0068205915726424005,
"grad_norm": 5.848581304068256,
"learning_rate": 1.3636363636363637e-05,
"loss": 2.0819,
"step": 1650
},
{
"epoch": 0.006861928491264476,
"grad_norm": 5.481243900559041,
"learning_rate": 1.3719008264462813e-05,
"loss": 2.0284,
"step": 1660
},
{
"epoch": 0.0069032654098865505,
"grad_norm": 5.699959566604993,
"learning_rate": 1.3801652892561985e-05,
"loss": 2.0622,
"step": 1670
},
{
"epoch": 0.006944602328508626,
"grad_norm": 4.996648526388665,
"learning_rate": 1.3884297520661158e-05,
"loss": 2.0071,
"step": 1680
},
{
"epoch": 0.006985939247130701,
"grad_norm": 5.126923483054136,
"learning_rate": 1.3966942148760332e-05,
"loss": 2.0631,
"step": 1690
},
{
"epoch": 0.007027276165752776,
"grad_norm": 5.883552104251492,
"learning_rate": 1.4049586776859505e-05,
"loss": 2.0791,
"step": 1700
},
{
"epoch": 0.007068613084374851,
"grad_norm": 4.929514966855435,
"learning_rate": 1.4132231404958678e-05,
"loss": 2.0364,
"step": 1710
},
{
"epoch": 0.007109950002996927,
"grad_norm": 5.301129760644346,
"learning_rate": 1.4214876033057852e-05,
"loss": 2.0373,
"step": 1720
},
{
"epoch": 0.007151286921619001,
"grad_norm": 5.523739748516145,
"learning_rate": 1.4297520661157025e-05,
"loss": 2.1079,
"step": 1730
},
{
"epoch": 0.007192623840241077,
"grad_norm": 5.7887756838227755,
"learning_rate": 1.4380165289256201e-05,
"loss": 2.049,
"step": 1740
},
{
"epoch": 0.007233960758863152,
"grad_norm": 5.2452853088604865,
"learning_rate": 1.4462809917355374e-05,
"loss": 2.0686,
"step": 1750
},
{
"epoch": 0.007275297677485227,
"grad_norm": 4.454384214370969,
"learning_rate": 1.4545454545454546e-05,
"loss": 2.0124,
"step": 1760
},
{
"epoch": 0.007316634596107302,
"grad_norm": 6.397338503442304,
"learning_rate": 1.462809917355372e-05,
"loss": 2.0152,
"step": 1770
},
{
"epoch": 0.007357971514729378,
"grad_norm": 6.554144037150873,
"learning_rate": 1.4710743801652893e-05,
"loss": 1.9976,
"step": 1780
},
{
"epoch": 0.007399308433351453,
"grad_norm": 4.973940426595748,
"learning_rate": 1.4793388429752066e-05,
"loss": 2.0085,
"step": 1790
},
{
"epoch": 0.007440645351973528,
"grad_norm": 5.776375204519037,
"learning_rate": 1.487603305785124e-05,
"loss": 2.0339,
"step": 1800
},
{
"epoch": 0.007481982270595603,
"grad_norm": 5.472367097758556,
"learning_rate": 1.4958677685950413e-05,
"loss": 2.0016,
"step": 1810
},
{
"epoch": 0.0075233191892176785,
"grad_norm": 4.850880114898939,
"learning_rate": 1.504132231404959e-05,
"loss": 1.9952,
"step": 1820
},
{
"epoch": 0.007564656107839753,
"grad_norm": 4.825492061262016,
"learning_rate": 1.5123966942148762e-05,
"loss": 2.0149,
"step": 1830
},
{
"epoch": 0.0076059930264618285,
"grad_norm": 6.317700924322252,
"learning_rate": 1.5206611570247936e-05,
"loss": 1.9765,
"step": 1840
},
{
"epoch": 0.007647329945083904,
"grad_norm": 5.831048263887902,
"learning_rate": 1.528925619834711e-05,
"loss": 1.9669,
"step": 1850
},
{
"epoch": 0.0076886668637059785,
"grad_norm": 5.190457786334756,
"learning_rate": 1.5371900826446283e-05,
"loss": 2.0342,
"step": 1860
},
{
"epoch": 0.007730003782328054,
"grad_norm": 5.752029895196757,
"learning_rate": 1.5454545454545454e-05,
"loss": 2.0606,
"step": 1870
},
{
"epoch": 0.007771340700950129,
"grad_norm": 5.005855197604682,
"learning_rate": 1.553719008264463e-05,
"loss": 2.0764,
"step": 1880
},
{
"epoch": 0.007812677619572205,
"grad_norm": 5.362161895494138,
"learning_rate": 1.5619834710743803e-05,
"loss": 2.0373,
"step": 1890
},
{
"epoch": 0.00785401453819428,
"grad_norm": 5.589239650428267,
"learning_rate": 1.5702479338842978e-05,
"loss": 2.0536,
"step": 1900
},
{
"epoch": 0.007895351456816354,
"grad_norm": 5.38085836484136,
"learning_rate": 1.5785123966942152e-05,
"loss": 2.0357,
"step": 1910
},
{
"epoch": 0.00793668837543843,
"grad_norm": 6.4123494555744065,
"learning_rate": 1.5867768595041323e-05,
"loss": 1.9992,
"step": 1920
},
{
"epoch": 0.007978025294060505,
"grad_norm": 5.369699763052158,
"learning_rate": 1.5950413223140497e-05,
"loss": 1.9645,
"step": 1930
},
{
"epoch": 0.00801936221268258,
"grad_norm": 6.655726980448801,
"learning_rate": 1.6033057851239672e-05,
"loss": 2.0023,
"step": 1940
},
{
"epoch": 0.008060699131304656,
"grad_norm": 5.150395460216213,
"learning_rate": 1.6115702479338843e-05,
"loss": 1.9953,
"step": 1950
},
{
"epoch": 0.008102036049926731,
"grad_norm": 5.534796132616727,
"learning_rate": 1.6198347107438017e-05,
"loss": 1.964,
"step": 1960
},
{
"epoch": 0.008143372968548805,
"grad_norm": 5.0714233165065075,
"learning_rate": 1.628099173553719e-05,
"loss": 1.9942,
"step": 1970
},
{
"epoch": 0.00818470988717088,
"grad_norm": 5.370096628339807,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.9938,
"step": 1980
},
{
"epoch": 0.008226046805792956,
"grad_norm": 4.816680798680657,
"learning_rate": 1.644628099173554e-05,
"loss": 1.9023,
"step": 1990
},
{
"epoch": 0.008267383724415031,
"grad_norm": 5.910326143029371,
"learning_rate": 1.652892561983471e-05,
"loss": 1.9366,
"step": 2000
},
{
"epoch": 0.008308720643037107,
"grad_norm": 5.364682793090204,
"learning_rate": 1.6611570247933886e-05,
"loss": 1.9835,
"step": 2010
},
{
"epoch": 0.008350057561659182,
"grad_norm": 6.171717096992393,
"learning_rate": 1.669421487603306e-05,
"loss": 1.9641,
"step": 2020
},
{
"epoch": 0.008391394480281256,
"grad_norm": 4.794750763380389,
"learning_rate": 1.677685950413223e-05,
"loss": 1.9405,
"step": 2030
},
{
"epoch": 0.008432731398903331,
"grad_norm": 6.3363242673070745,
"learning_rate": 1.6859504132231405e-05,
"loss": 1.9717,
"step": 2040
},
{
"epoch": 0.008474068317525407,
"grad_norm": 5.10756576497978,
"learning_rate": 1.694214876033058e-05,
"loss": 1.9312,
"step": 2050
},
{
"epoch": 0.008515405236147482,
"grad_norm": 5.5429121513722945,
"learning_rate": 1.7024793388429754e-05,
"loss": 1.9692,
"step": 2060
},
{
"epoch": 0.008556742154769557,
"grad_norm": 5.053921879606705,
"learning_rate": 1.710743801652893e-05,
"loss": 1.9187,
"step": 2070
},
{
"epoch": 0.008598079073391633,
"grad_norm": 5.246682645264326,
"learning_rate": 1.71900826446281e-05,
"loss": 2.0086,
"step": 2080
},
{
"epoch": 0.008639415992013707,
"grad_norm": 4.651358563124329,
"learning_rate": 1.7272727272727274e-05,
"loss": 1.9676,
"step": 2090
},
{
"epoch": 0.008680752910635782,
"grad_norm": 5.254574557184252,
"learning_rate": 1.735537190082645e-05,
"loss": 1.9193,
"step": 2100
},
{
"epoch": 0.008722089829257857,
"grad_norm": 5.5559516380514316,
"learning_rate": 1.743801652892562e-05,
"loss": 1.9123,
"step": 2110
},
{
"epoch": 0.008763426747879933,
"grad_norm": 5.714609535718523,
"learning_rate": 1.7520661157024794e-05,
"loss": 1.9831,
"step": 2120
},
{
"epoch": 0.008804763666502008,
"grad_norm": 4.664121459414757,
"learning_rate": 1.7603305785123968e-05,
"loss": 1.9606,
"step": 2130
},
{
"epoch": 0.008846100585124084,
"grad_norm": 4.9060858638182685,
"learning_rate": 1.7685950413223143e-05,
"loss": 1.9535,
"step": 2140
},
{
"epoch": 0.008887437503746157,
"grad_norm": 4.997171967559315,
"learning_rate": 1.7768595041322317e-05,
"loss": 1.9243,
"step": 2150
},
{
"epoch": 0.008928774422368233,
"grad_norm": 4.60645777188567,
"learning_rate": 1.7851239669421488e-05,
"loss": 1.9291,
"step": 2160
},
{
"epoch": 0.008970111340990308,
"grad_norm": 4.2131519608354004,
"learning_rate": 1.7933884297520662e-05,
"loss": 1.9105,
"step": 2170
},
{
"epoch": 0.009011448259612384,
"grad_norm": 5.51444703850531,
"learning_rate": 1.8016528925619837e-05,
"loss": 1.9502,
"step": 2180
},
{
"epoch": 0.00905278517823446,
"grad_norm": 5.2003808089855825,
"learning_rate": 1.809917355371901e-05,
"loss": 1.9436,
"step": 2190
},
{
"epoch": 0.009094122096856535,
"grad_norm": 4.240179682087964,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.9194,
"step": 2200
},
{
"epoch": 0.00913545901547861,
"grad_norm": 4.582501074244312,
"learning_rate": 1.8264462809917356e-05,
"loss": 1.9145,
"step": 2210
},
{
"epoch": 0.009176795934100684,
"grad_norm": 5.362083861786352,
"learning_rate": 1.834710743801653e-05,
"loss": 1.9165,
"step": 2220
},
{
"epoch": 0.009218132852722759,
"grad_norm": 5.06281875114174,
"learning_rate": 1.8429752066115705e-05,
"loss": 1.977,
"step": 2230
},
{
"epoch": 0.009259469771344835,
"grad_norm": 4.661496047656461,
"learning_rate": 1.851239669421488e-05,
"loss": 1.8892,
"step": 2240
},
{
"epoch": 0.00930080668996691,
"grad_norm": 4.735532406310298,
"learning_rate": 1.859504132231405e-05,
"loss": 1.8689,
"step": 2250
},
{
"epoch": 0.009342143608588985,
"grad_norm": 4.445771479719063,
"learning_rate": 1.8677685950413225e-05,
"loss": 1.9301,
"step": 2260
},
{
"epoch": 0.00938348052721106,
"grad_norm": 5.100109904664726,
"learning_rate": 1.87603305785124e-05,
"loss": 1.9309,
"step": 2270
},
{
"epoch": 0.009424817445833135,
"grad_norm": 6.115469535323335,
"learning_rate": 1.884297520661157e-05,
"loss": 1.918,
"step": 2280
},
{
"epoch": 0.00946615436445521,
"grad_norm": 4.890019742506766,
"learning_rate": 1.8925619834710745e-05,
"loss": 1.945,
"step": 2290
},
{
"epoch": 0.009507491283077285,
"grad_norm": 5.023798525711054,
"learning_rate": 1.900826446280992e-05,
"loss": 1.9004,
"step": 2300
},
{
"epoch": 0.00954882820169936,
"grad_norm": 5.652810624249754,
"learning_rate": 1.9090909090909094e-05,
"loss": 1.9339,
"step": 2310
},
{
"epoch": 0.009590165120321436,
"grad_norm": 6.04847384963266,
"learning_rate": 1.9173553719008268e-05,
"loss": 1.8687,
"step": 2320
},
{
"epoch": 0.009631502038943512,
"grad_norm": 5.332733823425359,
"learning_rate": 1.925619834710744e-05,
"loss": 1.8938,
"step": 2330
},
{
"epoch": 0.009672838957565585,
"grad_norm": 4.814331448709232,
"learning_rate": 1.9338842975206613e-05,
"loss": 1.8752,
"step": 2340
},
{
"epoch": 0.00971417587618766,
"grad_norm": 5.558977499560294,
"learning_rate": 1.9421487603305788e-05,
"loss": 1.8835,
"step": 2350
},
{
"epoch": 0.009755512794809736,
"grad_norm": 5.554940177569548,
"learning_rate": 1.950413223140496e-05,
"loss": 1.857,
"step": 2360
},
{
"epoch": 0.009796849713431812,
"grad_norm": 4.970502489086558,
"learning_rate": 1.9586776859504133e-05,
"loss": 1.8553,
"step": 2370
},
{
"epoch": 0.009838186632053887,
"grad_norm": 4.044099915606779,
"learning_rate": 1.9669421487603307e-05,
"loss": 1.924,
"step": 2380
},
{
"epoch": 0.009879523550675963,
"grad_norm": 4.880726953238654,
"learning_rate": 1.9752066115702482e-05,
"loss": 1.9785,
"step": 2390
},
{
"epoch": 0.009920860469298036,
"grad_norm": 5.457077789861094,
"learning_rate": 1.9834710743801656e-05,
"loss": 1.8585,
"step": 2400
},
{
"epoch": 0.009962197387920112,
"grad_norm": 4.608586390817221,
"learning_rate": 1.9917355371900827e-05,
"loss": 1.8861,
"step": 2410
},
{
"epoch": 0.010003534306542187,
"grad_norm": 4.5178969512670335,
"learning_rate": 2e-05,
"loss": 1.8936,
"step": 2420
},
{
"epoch": 0.010044871225164263,
"grad_norm": 5.722004352525454,
"learning_rate": 1.999999991396395e-05,
"loss": 1.8616,
"step": 2430
},
{
"epoch": 0.010086208143786338,
"grad_norm": 4.99862696301366,
"learning_rate": 1.9999999655855794e-05,
"loss": 1.8865,
"step": 2440
},
{
"epoch": 0.010127545062408413,
"grad_norm": 5.204994732642035,
"learning_rate": 1.9999999225675543e-05,
"loss": 1.8602,
"step": 2450
},
{
"epoch": 0.010168881981030489,
"grad_norm": 4.1143956012846505,
"learning_rate": 1.9999998623423198e-05,
"loss": 1.9101,
"step": 2460
},
{
"epoch": 0.010210218899652563,
"grad_norm": 5.535771463118041,
"learning_rate": 1.9999997849098773e-05,
"loss": 1.8596,
"step": 2470
},
{
"epoch": 0.010251555818274638,
"grad_norm": 5.020430211409416,
"learning_rate": 1.999999690270228e-05,
"loss": 1.8393,
"step": 2480
},
{
"epoch": 0.010292892736896713,
"grad_norm": 5.571737674116448,
"learning_rate": 1.999999578423374e-05,
"loss": 1.8987,
"step": 2490
},
{
"epoch": 0.010334229655518789,
"grad_norm": 4.336614412280944,
"learning_rate": 1.9999994493693165e-05,
"loss": 1.9194,
"step": 2500
},
{
"epoch": 0.010375566574140864,
"grad_norm": 4.815853586635344,
"learning_rate": 1.999999303108058e-05,
"loss": 1.8332,
"step": 2510
},
{
"epoch": 0.01041690349276294,
"grad_norm": 4.559920874208704,
"learning_rate": 1.9999991396396014e-05,
"loss": 1.8818,
"step": 2520
},
{
"epoch": 0.010458240411385013,
"grad_norm": 5.23388810362715,
"learning_rate": 1.9999989589639487e-05,
"loss": 1.8302,
"step": 2530
},
{
"epoch": 0.010499577330007089,
"grad_norm": 5.000301577463503,
"learning_rate": 1.999998761081104e-05,
"loss": 1.8657,
"step": 2540
},
{
"epoch": 0.010540914248629164,
"grad_norm": 4.448533864801871,
"learning_rate": 1.9999985459910698e-05,
"loss": 1.8762,
"step": 2550
},
{
"epoch": 0.01058225116725124,
"grad_norm": 4.715035035883112,
"learning_rate": 1.9999983136938504e-05,
"loss": 1.8511,
"step": 2560
},
{
"epoch": 0.010623588085873315,
"grad_norm": 4.025529484549816,
"learning_rate": 1.9999980641894497e-05,
"loss": 1.8458,
"step": 2570
},
{
"epoch": 0.01066492500449539,
"grad_norm": 4.754000032727581,
"learning_rate": 1.9999977974778715e-05,
"loss": 1.8714,
"step": 2580
},
{
"epoch": 0.010706261923117464,
"grad_norm": 4.930978688660729,
"learning_rate": 1.999997513559121e-05,
"loss": 1.8656,
"step": 2590
},
{
"epoch": 0.01074759884173954,
"grad_norm": 4.4132729377261475,
"learning_rate": 1.9999972124332028e-05,
"loss": 1.9383,
"step": 2600
},
{
"epoch": 0.010788935760361615,
"grad_norm": 4.4540551253199325,
"learning_rate": 1.9999968941001225e-05,
"loss": 1.8426,
"step": 2610
},
{
"epoch": 0.01083027267898369,
"grad_norm": 4.797250042059473,
"learning_rate": 1.999996558559885e-05,
"loss": 1.8634,
"step": 2620
},
{
"epoch": 0.010871609597605766,
"grad_norm": 5.456931710111963,
"learning_rate": 1.999996205812496e-05,
"loss": 1.825,
"step": 2630
},
{
"epoch": 0.010912946516227841,
"grad_norm": 4.377649523138056,
"learning_rate": 1.999995835857962e-05,
"loss": 1.8545,
"step": 2640
},
{
"epoch": 0.010954283434849915,
"grad_norm": 4.5317328844732145,
"learning_rate": 1.9999954486962893e-05,
"loss": 1.8774,
"step": 2650
},
{
"epoch": 0.01099562035347199,
"grad_norm": 4.709498283906347,
"learning_rate": 1.9999950443274847e-05,
"loss": 1.8083,
"step": 2660
},
{
"epoch": 0.011036957272094066,
"grad_norm": 4.592327748002219,
"learning_rate": 1.9999946227515547e-05,
"loss": 1.792,
"step": 2670
},
{
"epoch": 0.011078294190716141,
"grad_norm": 5.036724935294618,
"learning_rate": 1.999994183968507e-05,
"loss": 1.8779,
"step": 2680
},
{
"epoch": 0.011119631109338217,
"grad_norm": 6.694409030503598,
"learning_rate": 1.999993727978349e-05,
"loss": 1.8573,
"step": 2690
},
{
"epoch": 0.011160968027960292,
"grad_norm": 5.018981836140064,
"learning_rate": 1.9999932547810883e-05,
"loss": 1.8726,
"step": 2700
},
{
"epoch": 0.011202304946582366,
"grad_norm": 4.451864521641474,
"learning_rate": 1.9999927643767332e-05,
"loss": 1.8193,
"step": 2710
},
{
"epoch": 0.011243641865204441,
"grad_norm": 5.036890897058994,
"learning_rate": 1.999992256765292e-05,
"loss": 1.8594,
"step": 2720
},
{
"epoch": 0.011284978783826517,
"grad_norm": 4.7545987502372755,
"learning_rate": 1.999991731946774e-05,
"loss": 1.9158,
"step": 2730
},
{
"epoch": 0.011326315702448592,
"grad_norm": 3.9156550800432783,
"learning_rate": 1.999991189921188e-05,
"loss": 1.8166,
"step": 2740
},
{
"epoch": 0.011367652621070668,
"grad_norm": 4.622686377530181,
"learning_rate": 1.999990630688543e-05,
"loss": 1.8426,
"step": 2750
},
{
"epoch": 0.011408989539692743,
"grad_norm": 4.176720366120709,
"learning_rate": 1.9999900542488487e-05,
"loss": 1.8701,
"step": 2760
},
{
"epoch": 0.011450326458314819,
"grad_norm": 4.588055146989058,
"learning_rate": 1.999989460602115e-05,
"loss": 1.8474,
"step": 2770
},
{
"epoch": 0.011491663376936892,
"grad_norm": 4.7632605353618604,
"learning_rate": 1.9999888497483523e-05,
"loss": 1.7611,
"step": 2780
},
{
"epoch": 0.011533000295558968,
"grad_norm": 5.168047411415939,
"learning_rate": 1.9999882216875714e-05,
"loss": 1.8297,
"step": 2790
},
{
"epoch": 0.011574337214181043,
"grad_norm": 5.6032261833368215,
"learning_rate": 1.9999875764197824e-05,
"loss": 1.8273,
"step": 2800
},
{
"epoch": 0.011615674132803119,
"grad_norm": 4.836606306201456,
"learning_rate": 1.9999869139449965e-05,
"loss": 1.8067,
"step": 2810
},
{
"epoch": 0.011657011051425194,
"grad_norm": 5.371156408385522,
"learning_rate": 1.9999862342632258e-05,
"loss": 1.7726,
"step": 2820
},
{
"epoch": 0.01169834797004727,
"grad_norm": 4.715562111195242,
"learning_rate": 1.9999855373744813e-05,
"loss": 1.8257,
"step": 2830
},
{
"epoch": 0.011739684888669343,
"grad_norm": 4.672226047314074,
"learning_rate": 1.9999848232787753e-05,
"loss": 1.807,
"step": 2840
},
{
"epoch": 0.011781021807291419,
"grad_norm": 5.305211095175868,
"learning_rate": 1.9999840919761202e-05,
"loss": 1.8398,
"step": 2850
},
{
"epoch": 0.011822358725913494,
"grad_norm": 4.973800529744849,
"learning_rate": 1.9999833434665282e-05,
"loss": 1.8028,
"step": 2860
},
{
"epoch": 0.01186369564453557,
"grad_norm": 4.580336749750151,
"learning_rate": 1.9999825777500127e-05,
"loss": 1.7559,
"step": 2870
},
{
"epoch": 0.011905032563157645,
"grad_norm": 5.203176556084249,
"learning_rate": 1.999981794826586e-05,
"loss": 1.8375,
"step": 2880
},
{
"epoch": 0.01194636948177972,
"grad_norm": 5.810430258629928,
"learning_rate": 1.9999809946962627e-05,
"loss": 1.8126,
"step": 2890
},
{
"epoch": 0.011987706400401794,
"grad_norm": 5.7480488342439955,
"learning_rate": 1.9999801773590556e-05,
"loss": 1.8228,
"step": 2900
},
{
"epoch": 0.01202904331902387,
"grad_norm": 4.946108636349945,
"learning_rate": 1.9999793428149793e-05,
"loss": 1.7801,
"step": 2910
},
{
"epoch": 0.012070380237645945,
"grad_norm": 4.686375909907021,
"learning_rate": 1.9999784910640484e-05,
"loss": 1.7595,
"step": 2920
},
{
"epoch": 0.01211171715626802,
"grad_norm": 5.2352360374303135,
"learning_rate": 1.9999776221062767e-05,
"loss": 1.8413,
"step": 2930
},
{
"epoch": 0.012153054074890096,
"grad_norm": 4.509401680547479,
"learning_rate": 1.99997673594168e-05,
"loss": 1.7973,
"step": 2940
},
{
"epoch": 0.012194390993512171,
"grad_norm": 4.614511294927466,
"learning_rate": 1.9999758325702728e-05,
"loss": 1.8206,
"step": 2950
},
{
"epoch": 0.012235727912134245,
"grad_norm": 6.185921445660834,
"learning_rate": 1.9999749119920714e-05,
"loss": 1.8462,
"step": 2960
},
{
"epoch": 0.01227706483075632,
"grad_norm": 4.174924494562577,
"learning_rate": 1.999973974207091e-05,
"loss": 1.8036,
"step": 2970
},
{
"epoch": 0.012318401749378396,
"grad_norm": 4.836665186633954,
"learning_rate": 1.9999730192153483e-05,
"loss": 1.8517,
"step": 2980
},
{
"epoch": 0.012359738668000471,
"grad_norm": 5.384960643252126,
"learning_rate": 1.999972047016859e-05,
"loss": 1.8159,
"step": 2990
},
{
"epoch": 0.012401075586622547,
"grad_norm": 5.021462883098841,
"learning_rate": 1.9999710576116403e-05,
"loss": 1.7985,
"step": 3000
},
{
"epoch": 0.012442412505244622,
"grad_norm": 5.427243361920921,
"learning_rate": 1.99997005099971e-05,
"loss": 1.7765,
"step": 3010
},
{
"epoch": 0.012483749423866698,
"grad_norm": 4.491526906719712,
"learning_rate": 1.999969027181084e-05,
"loss": 1.8183,
"step": 3020
},
{
"epoch": 0.012525086342488771,
"grad_norm": 4.559161527860771,
"learning_rate": 1.9999679861557804e-05,
"loss": 1.724,
"step": 3030
},
{
"epoch": 0.012566423261110847,
"grad_norm": 4.161439649907769,
"learning_rate": 1.9999669279238173e-05,
"loss": 1.7683,
"step": 3040
},
{
"epoch": 0.012607760179732922,
"grad_norm": 4.56366674854018,
"learning_rate": 1.999965852485213e-05,
"loss": 1.786,
"step": 3050
},
{
"epoch": 0.012649097098354998,
"grad_norm": 5.245429404102266,
"learning_rate": 1.999964759839986e-05,
"loss": 1.7822,
"step": 3060
},
{
"epoch": 0.012690434016977073,
"grad_norm": 4.6630532976211825,
"learning_rate": 1.9999636499881548e-05,
"loss": 1.7466,
"step": 3070
},
{
"epoch": 0.012731770935599148,
"grad_norm": 4.060133282127772,
"learning_rate": 1.9999625229297385e-05,
"loss": 1.795,
"step": 3080
},
{
"epoch": 0.012773107854221222,
"grad_norm": 3.9035653210065644,
"learning_rate": 1.9999613786647568e-05,
"loss": 1.7644,
"step": 3090
},
{
"epoch": 0.012814444772843298,
"grad_norm": 4.309261522039182,
"learning_rate": 1.9999602171932292e-05,
"loss": 1.7843,
"step": 3100
},
{
"epoch": 0.012855781691465373,
"grad_norm": 4.849350440475919,
"learning_rate": 1.999959038515176e-05,
"loss": 1.7703,
"step": 3110
},
{
"epoch": 0.012897118610087448,
"grad_norm": 4.275025549538348,
"learning_rate": 1.999957842630617e-05,
"loss": 1.7714,
"step": 3120
},
{
"epoch": 0.012938455528709524,
"grad_norm": 5.173686081149218,
"learning_rate": 1.9999566295395728e-05,
"loss": 1.7638,
"step": 3130
},
{
"epoch": 0.0129797924473316,
"grad_norm": 4.908518222034618,
"learning_rate": 1.999955399242065e-05,
"loss": 1.8008,
"step": 3140
},
{
"epoch": 0.013021129365953673,
"grad_norm": 4.14921313541257,
"learning_rate": 1.9999541517381137e-05,
"loss": 1.7741,
"step": 3150
},
{
"epoch": 0.013062466284575748,
"grad_norm": 4.543722393877187,
"learning_rate": 1.9999528870277412e-05,
"loss": 1.7949,
"step": 3160
},
{
"epoch": 0.013103803203197824,
"grad_norm": 4.410976439510873,
"learning_rate": 1.9999516051109688e-05,
"loss": 1.7547,
"step": 3170
},
{
"epoch": 0.0131451401218199,
"grad_norm": 5.705194883861194,
"learning_rate": 1.9999503059878188e-05,
"loss": 1.7513,
"step": 3180
},
{
"epoch": 0.013186477040441975,
"grad_norm": 4.65186813408292,
"learning_rate": 1.999948989658313e-05,
"loss": 1.7664,
"step": 3190
},
{
"epoch": 0.01322781395906405,
"grad_norm": 5.20074413082596,
"learning_rate": 1.9999476561224754e-05,
"loss": 1.7545,
"step": 3200
},
{
"epoch": 0.013269150877686124,
"grad_norm": 3.9975643391331745,
"learning_rate": 1.9999463053803275e-05,
"loss": 1.7175,
"step": 3210
},
{
"epoch": 0.0133104877963082,
"grad_norm": 4.798261065065511,
"learning_rate": 1.9999449374318934e-05,
"loss": 1.7464,
"step": 3220
},
{
"epoch": 0.013351824714930275,
"grad_norm": 4.858469902498838,
"learning_rate": 1.9999435522771963e-05,
"loss": 1.7568,
"step": 3230
},
{
"epoch": 0.01339316163355235,
"grad_norm": 4.236662694907985,
"learning_rate": 1.99994214991626e-05,
"loss": 1.7335,
"step": 3240
},
{
"epoch": 0.013434498552174426,
"grad_norm": 4.6188062645939585,
"learning_rate": 1.9999407303491085e-05,
"loss": 1.7529,
"step": 3250
},
{
"epoch": 0.013475835470796501,
"grad_norm": 4.77521754475989,
"learning_rate": 1.9999392935757668e-05,
"loss": 1.7734,
"step": 3260
},
{
"epoch": 0.013517172389418576,
"grad_norm": 6.027108769658543,
"learning_rate": 1.999937839596259e-05,
"loss": 1.8136,
"step": 3270
},
{
"epoch": 0.01355850930804065,
"grad_norm": 4.163761649197771,
"learning_rate": 1.9999363684106105e-05,
"loss": 1.7085,
"step": 3280
},
{
"epoch": 0.013599846226662726,
"grad_norm": 3.916493440655603,
"learning_rate": 1.9999348800188466e-05,
"loss": 1.7815,
"step": 3290
},
{
"epoch": 0.013641183145284801,
"grad_norm": 4.397530066361572,
"learning_rate": 1.9999333744209924e-05,
"loss": 1.7759,
"step": 3300
},
{
"epoch": 0.013682520063906876,
"grad_norm": 4.839462185241853,
"learning_rate": 1.9999318516170747e-05,
"loss": 1.7548,
"step": 3310
},
{
"epoch": 0.013723856982528952,
"grad_norm": 4.759012044819632,
"learning_rate": 1.999930311607119e-05,
"loss": 1.7815,
"step": 3320
},
{
"epoch": 0.013765193901151027,
"grad_norm": 4.1799473470272295,
"learning_rate": 1.9999287543911522e-05,
"loss": 1.7907,
"step": 3330
},
{
"epoch": 0.013806530819773101,
"grad_norm": 4.454633377063746,
"learning_rate": 1.9999271799692006e-05,
"loss": 1.7579,
"step": 3340
},
{
"epoch": 0.013847867738395176,
"grad_norm": 4.997867503776301,
"learning_rate": 1.999925588341292e-05,
"loss": 1.7335,
"step": 3350
},
{
"epoch": 0.013889204657017252,
"grad_norm": 4.345433706332678,
"learning_rate": 1.999923979507453e-05,
"loss": 1.7124,
"step": 3360
},
{
"epoch": 0.013930541575639327,
"grad_norm": 4.531985231521044,
"learning_rate": 1.999922353467712e-05,
"loss": 1.758,
"step": 3370
},
{
"epoch": 0.013971878494261403,
"grad_norm": 4.399801080955952,
"learning_rate": 1.9999207102220962e-05,
"loss": 1.7065,
"step": 3380
},
{
"epoch": 0.014013215412883478,
"grad_norm": 5.059800651377326,
"learning_rate": 1.999919049770635e-05,
"loss": 1.693,
"step": 3390
},
{
"epoch": 0.014054552331505552,
"grad_norm": 4.260000291303237,
"learning_rate": 1.9999173721133557e-05,
"loss": 1.7488,
"step": 3400
},
{
"epoch": 0.014095889250127627,
"grad_norm": 4.5331171056345605,
"learning_rate": 1.999915677250288e-05,
"loss": 1.7046,
"step": 3410
},
{
"epoch": 0.014137226168749703,
"grad_norm": 4.187185061547482,
"learning_rate": 1.999913965181461e-05,
"loss": 1.7013,
"step": 3420
},
{
"epoch": 0.014178563087371778,
"grad_norm": 4.429191188349303,
"learning_rate": 1.999912235906904e-05,
"loss": 1.7127,
"step": 3430
},
{
"epoch": 0.014219900005993854,
"grad_norm": 4.083346883074332,
"learning_rate": 1.9999104894266466e-05,
"loss": 1.7571,
"step": 3440
},
{
"epoch": 0.014261236924615929,
"grad_norm": 4.424685185794806,
"learning_rate": 1.999908725740719e-05,
"loss": 1.7563,
"step": 3450
},
{
"epoch": 0.014302573843238003,
"grad_norm": 4.285454928033791,
"learning_rate": 1.9999069448491516e-05,
"loss": 1.7547,
"step": 3460
},
{
"epoch": 0.014343910761860078,
"grad_norm": 4.424718664433471,
"learning_rate": 1.999905146751975e-05,
"loss": 1.7374,
"step": 3470
},
{
"epoch": 0.014385247680482154,
"grad_norm": 4.075077717271962,
"learning_rate": 1.99990333144922e-05,
"loss": 1.7661,
"step": 3480
},
{
"epoch": 0.014426584599104229,
"grad_norm": 4.538474712661468,
"learning_rate": 1.999901498940918e-05,
"loss": 1.7118,
"step": 3490
},
{
"epoch": 0.014467921517726304,
"grad_norm": 4.435775318213515,
"learning_rate": 1.9998996492271007e-05,
"loss": 1.7368,
"step": 3500
},
{
"epoch": 0.01450925843634838,
"grad_norm": 4.545629750120307,
"learning_rate": 1.9998977823077998e-05,
"loss": 1.7335,
"step": 3510
},
{
"epoch": 0.014550595354970454,
"grad_norm": 4.359608762821868,
"learning_rate": 1.9998958981830473e-05,
"loss": 1.7318,
"step": 3520
},
{
"epoch": 0.014591932273592529,
"grad_norm": 4.453842525389737,
"learning_rate": 1.9998939968528754e-05,
"loss": 1.7499,
"step": 3530
},
{
"epoch": 0.014633269192214604,
"grad_norm": 5.088846901725583,
"learning_rate": 1.9998920783173172e-05,
"loss": 1.7555,
"step": 3540
},
{
"epoch": 0.01467460611083668,
"grad_norm": 4.1590343693668395,
"learning_rate": 1.9998901425764057e-05,
"loss": 1.7386,
"step": 3550
},
{
"epoch": 0.014715943029458755,
"grad_norm": 4.181140524329235,
"learning_rate": 1.9998881896301744e-05,
"loss": 1.6455,
"step": 3560
},
{
"epoch": 0.01475727994808083,
"grad_norm": 4.228896471972448,
"learning_rate": 1.999886219478656e-05,
"loss": 1.7282,
"step": 3570
},
{
"epoch": 0.014798616866702906,
"grad_norm": 3.9899831004408526,
"learning_rate": 1.9998842321218855e-05,
"loss": 1.7201,
"step": 3580
},
{
"epoch": 0.01483995378532498,
"grad_norm": 3.9178031007246408,
"learning_rate": 1.9998822275598964e-05,
"loss": 1.6812,
"step": 3590
},
{
"epoch": 0.014881290703947055,
"grad_norm": 4.3808976089497484,
"learning_rate": 1.9998802057927236e-05,
"loss": 1.7175,
"step": 3600
},
{
"epoch": 0.01492262762256913,
"grad_norm": 3.9780395209303197,
"learning_rate": 1.9998781668204015e-05,
"loss": 1.7351,
"step": 3610
},
{
"epoch": 0.014963964541191206,
"grad_norm": 5.378812354806347,
"learning_rate": 1.9998761106429655e-05,
"loss": 1.7092,
"step": 3620
},
{
"epoch": 0.015005301459813282,
"grad_norm": 3.9422939515447246,
"learning_rate": 1.999874037260451e-05,
"loss": 1.7261,
"step": 3630
},
{
"epoch": 0.015046638378435357,
"grad_norm": 4.442422033504748,
"learning_rate": 1.9998719466728934e-05,
"loss": 1.7027,
"step": 3640
},
{
"epoch": 0.01508797529705743,
"grad_norm": 4.102984271689072,
"learning_rate": 1.9998698388803288e-05,
"loss": 1.6741,
"step": 3650
},
{
"epoch": 0.015129312215679506,
"grad_norm": 3.9608491048290615,
"learning_rate": 1.9998677138827934e-05,
"loss": 1.7542,
"step": 3660
},
{
"epoch": 0.015170649134301582,
"grad_norm": 4.561629575046756,
"learning_rate": 1.999865571680324e-05,
"loss": 1.6785,
"step": 3670
},
{
"epoch": 0.015211986052923657,
"grad_norm": 4.4640127715057885,
"learning_rate": 1.9998634122729573e-05,
"loss": 1.7,
"step": 3680
},
{
"epoch": 0.015253322971545732,
"grad_norm": 3.8935864417828188,
"learning_rate": 1.9998612356607303e-05,
"loss": 1.6939,
"step": 3690
},
{
"epoch": 0.015294659890167808,
"grad_norm": 5.011277234521909,
"learning_rate": 1.9998590418436808e-05,
"loss": 1.7019,
"step": 3700
},
{
"epoch": 0.015335996808789882,
"grad_norm": 4.107354033282244,
"learning_rate": 1.9998568308218465e-05,
"loss": 1.6637,
"step": 3710
},
{
"epoch": 0.015377333727411957,
"grad_norm": 5.3480918453617905,
"learning_rate": 1.999854602595265e-05,
"loss": 1.7322,
"step": 3720
},
{
"epoch": 0.015418670646034032,
"grad_norm": 4.443648241332512,
"learning_rate": 1.9998523571639752e-05,
"loss": 1.6794,
"step": 3730
},
{
"epoch": 0.015460007564656108,
"grad_norm": 3.4677507775480025,
"learning_rate": 1.999850094528015e-05,
"loss": 1.6943,
"step": 3740
},
{
"epoch": 0.015501344483278183,
"grad_norm": 4.306434811794374,
"learning_rate": 1.9998478146874244e-05,
"loss": 1.6996,
"step": 3750
},
{
"epoch": 0.015542681401900259,
"grad_norm": 5.783322294479809,
"learning_rate": 1.9998455176422423e-05,
"loss": 1.7071,
"step": 3760
},
{
"epoch": 0.015584018320522332,
"grad_norm": 5.907422561947855,
"learning_rate": 1.999843203392507e-05,
"loss": 1.7736,
"step": 3770
},
{
"epoch": 0.01562535523914441,
"grad_norm": 4.114299347318918,
"learning_rate": 1.9998408719382602e-05,
"loss": 1.7068,
"step": 3780
},
{
"epoch": 0.015666692157766483,
"grad_norm": 4.897826252389082,
"learning_rate": 1.999838523279541e-05,
"loss": 1.6542,
"step": 3790
},
{
"epoch": 0.01570802907638856,
"grad_norm": 4.387711122090114,
"learning_rate": 1.9998361574163897e-05,
"loss": 1.7202,
"step": 3800
},
{
"epoch": 0.015749365995010634,
"grad_norm": 4.249935651772863,
"learning_rate": 1.999833774348847e-05,
"loss": 1.6871,
"step": 3810
},
{
"epoch": 0.015790702913632708,
"grad_norm": 4.961734747800958,
"learning_rate": 1.9998313740769547e-05,
"loss": 1.7012,
"step": 3820
},
{
"epoch": 0.015832039832254785,
"grad_norm": 4.247988360660198,
"learning_rate": 1.9998289566007535e-05,
"loss": 1.684,
"step": 3830
},
{
"epoch": 0.01587337675087686,
"grad_norm": 5.434305269506113,
"learning_rate": 1.999826521920285e-05,
"loss": 1.7673,
"step": 3840
},
{
"epoch": 0.015914713669498936,
"grad_norm": 4.617133742171007,
"learning_rate": 1.999824070035591e-05,
"loss": 1.6622,
"step": 3850
},
{
"epoch": 0.01595605058812101,
"grad_norm": 3.70746479523289,
"learning_rate": 1.9998216009467136e-05,
"loss": 1.6647,
"step": 3860
},
{
"epoch": 0.015997387506743083,
"grad_norm": 4.604026510578186,
"learning_rate": 1.999819114653696e-05,
"loss": 1.6772,
"step": 3870
},
{
"epoch": 0.01603872442536516,
"grad_norm": 3.8606213125382642,
"learning_rate": 1.9998166111565804e-05,
"loss": 1.694,
"step": 3880
},
{
"epoch": 0.016080061343987234,
"grad_norm": 5.0244652420608045,
"learning_rate": 1.99981409045541e-05,
"loss": 1.7797,
"step": 3890
},
{
"epoch": 0.01612139826260931,
"grad_norm": 4.707739461519922,
"learning_rate": 1.999811552550228e-05,
"loss": 1.7159,
"step": 3900
},
{
"epoch": 0.016162735181231385,
"grad_norm": 3.9677147576335043,
"learning_rate": 1.9998089974410782e-05,
"loss": 1.6708,
"step": 3910
},
{
"epoch": 0.016204072099853462,
"grad_norm": 4.311084704937728,
"learning_rate": 1.9998064251280048e-05,
"loss": 1.7109,
"step": 3920
},
{
"epoch": 0.016245409018475536,
"grad_norm": 3.9457174661249534,
"learning_rate": 1.999803835611052e-05,
"loss": 1.6713,
"step": 3930
},
{
"epoch": 0.01628674593709761,
"grad_norm": 3.947531059176682,
"learning_rate": 1.999801228890264e-05,
"loss": 1.6796,
"step": 3940
},
{
"epoch": 0.016328082855719687,
"grad_norm": 4.14663907999712,
"learning_rate": 1.9997986049656858e-05,
"loss": 1.6452,
"step": 3950
},
{
"epoch": 0.01636941977434176,
"grad_norm": 3.897276226226099,
"learning_rate": 1.9997959638373626e-05,
"loss": 1.6507,
"step": 3960
},
{
"epoch": 0.016410756692963838,
"grad_norm": 3.778326978683171,
"learning_rate": 1.9997933055053402e-05,
"loss": 1.7378,
"step": 3970
},
{
"epoch": 0.01645209361158591,
"grad_norm": 4.014730222130603,
"learning_rate": 1.9997906299696635e-05,
"loss": 1.6651,
"step": 3980
},
{
"epoch": 0.016493430530207985,
"grad_norm": 3.8164751076978223,
"learning_rate": 1.9997879372303797e-05,
"loss": 1.7007,
"step": 3990
},
{
"epoch": 0.016534767448830062,
"grad_norm": 3.922371704332535,
"learning_rate": 1.999785227287534e-05,
"loss": 1.7161,
"step": 4000
},
{
"epoch": 0.016576104367452136,
"grad_norm": 3.934785675300376,
"learning_rate": 1.9997825001411738e-05,
"loss": 1.6704,
"step": 4010
},
{
"epoch": 0.016617441286074213,
"grad_norm": 4.564033996587743,
"learning_rate": 1.9997797557913455e-05,
"loss": 1.6918,
"step": 4020
},
{
"epoch": 0.016658778204696287,
"grad_norm": 4.4245567390274365,
"learning_rate": 1.9997769942380968e-05,
"loss": 1.7143,
"step": 4030
},
{
"epoch": 0.016700115123318364,
"grad_norm": 3.8624198473379874,
"learning_rate": 1.9997742154814744e-05,
"loss": 1.7298,
"step": 4040
},
{
"epoch": 0.016741452041940438,
"grad_norm": 4.010446146589402,
"learning_rate": 1.9997714195215275e-05,
"loss": 1.6851,
"step": 4050
},
{
"epoch": 0.01678278896056251,
"grad_norm": 4.139527737935189,
"learning_rate": 1.9997686063583028e-05,
"loss": 1.6597,
"step": 4060
},
{
"epoch": 0.01682412587918459,
"grad_norm": 3.617422879629344,
"learning_rate": 1.9997657759918498e-05,
"loss": 1.7078,
"step": 4070
},
{
"epoch": 0.016865462797806662,
"grad_norm": 4.492323213426353,
"learning_rate": 1.9997629284222165e-05,
"loss": 1.6521,
"step": 4080
},
{
"epoch": 0.01690679971642874,
"grad_norm": 5.007903819964739,
"learning_rate": 1.999760063649452e-05,
"loss": 1.6694,
"step": 4090
},
{
"epoch": 0.016948136635050813,
"grad_norm": 4.960862868620129,
"learning_rate": 1.999757181673606e-05,
"loss": 1.68,
"step": 4100
},
{
"epoch": 0.01698947355367289,
"grad_norm": 5.878432740559922,
"learning_rate": 1.9997542824947276e-05,
"loss": 1.6736,
"step": 4110
},
{
"epoch": 0.017030810472294964,
"grad_norm": 4.440326929426054,
"learning_rate": 1.999751366112867e-05,
"loss": 1.6335,
"step": 4120
},
{
"epoch": 0.017072147390917038,
"grad_norm": 4.263618522816504,
"learning_rate": 1.999748432528074e-05,
"loss": 1.7186,
"step": 4130
},
{
"epoch": 0.017113484309539115,
"grad_norm": 4.292363992231819,
"learning_rate": 1.9997454817403996e-05,
"loss": 1.6416,
"step": 4140
},
{
"epoch": 0.01715482122816119,
"grad_norm": 4.013314862106662,
"learning_rate": 1.9997425137498944e-05,
"loss": 1.723,
"step": 4150
},
{
"epoch": 0.017196158146783266,
"grad_norm": 4.07382683143937,
"learning_rate": 1.999739528556609e-05,
"loss": 1.6604,
"step": 4160
},
{
"epoch": 0.01723749506540534,
"grad_norm": 4.533516304139438,
"learning_rate": 1.9997365261605957e-05,
"loss": 1.6683,
"step": 4170
},
{
"epoch": 0.017278831984027413,
"grad_norm": 5.114666733039835,
"learning_rate": 1.999733506561905e-05,
"loss": 1.6925,
"step": 4180
},
{
"epoch": 0.01732016890264949,
"grad_norm": 3.895641699630939,
"learning_rate": 1.99973046976059e-05,
"loss": 1.6743,
"step": 4190
},
{
"epoch": 0.017361505821271564,
"grad_norm": 3.9125805892169465,
"learning_rate": 1.9997274157567025e-05,
"loss": 1.6823,
"step": 4200
},
{
"epoch": 0.01740284273989364,
"grad_norm": 4.530982763817902,
"learning_rate": 1.999724344550295e-05,
"loss": 1.666,
"step": 4210
},
{
"epoch": 0.017444179658515715,
"grad_norm": 4.806928145874966,
"learning_rate": 1.9997212561414198e-05,
"loss": 1.7254,
"step": 4220
},
{
"epoch": 0.017485516577137792,
"grad_norm": 3.9697720655534483,
"learning_rate": 1.999718150530131e-05,
"loss": 1.6241,
"step": 4230
},
{
"epoch": 0.017526853495759866,
"grad_norm": 4.257480914158059,
"learning_rate": 1.9997150277164815e-05,
"loss": 1.6346,
"step": 4240
},
{
"epoch": 0.01756819041438194,
"grad_norm": 3.799531767116148,
"learning_rate": 1.999711887700525e-05,
"loss": 1.6296,
"step": 4250
},
{
"epoch": 0.017609527333004017,
"grad_norm": 3.802902072405634,
"learning_rate": 1.999708730482316e-05,
"loss": 1.6296,
"step": 4260
},
{
"epoch": 0.01765086425162609,
"grad_norm": 5.118064089629252,
"learning_rate": 1.9997055560619082e-05,
"loss": 1.643,
"step": 4270
},
{
"epoch": 0.017692201170248167,
"grad_norm": 4.227158901611068,
"learning_rate": 1.9997023644393567e-05,
"loss": 1.6698,
"step": 4280
},
{
"epoch": 0.01773353808887024,
"grad_norm": 4.238927562799819,
"learning_rate": 1.9996991556147166e-05,
"loss": 1.653,
"step": 4290
},
{
"epoch": 0.017774875007492315,
"grad_norm": 4.204830304370112,
"learning_rate": 1.9996959295880423e-05,
"loss": 1.6844,
"step": 4300
},
{
"epoch": 0.017816211926114392,
"grad_norm": 4.097133417277415,
"learning_rate": 1.99969268635939e-05,
"loss": 1.6212,
"step": 4310
},
{
"epoch": 0.017857548844736466,
"grad_norm": 4.65335395814053,
"learning_rate": 1.999689425928815e-05,
"loss": 1.6882,
"step": 4320
},
{
"epoch": 0.017898885763358543,
"grad_norm": 4.112571966210029,
"learning_rate": 1.999686148296374e-05,
"loss": 1.6929,
"step": 4330
},
{
"epoch": 0.017940222681980617,
"grad_norm": 5.088602258322444,
"learning_rate": 1.999682853462123e-05,
"loss": 1.6648,
"step": 4340
},
{
"epoch": 0.017981559600602694,
"grad_norm": 3.9480572889086147,
"learning_rate": 1.9996795414261186e-05,
"loss": 1.5896,
"step": 4350
},
{
"epoch": 0.018022896519224767,
"grad_norm": 4.8104711694243,
"learning_rate": 1.9996762121884186e-05,
"loss": 1.6709,
"step": 4360
},
{
"epoch": 0.01806423343784684,
"grad_norm": 5.388396623467715,
"learning_rate": 1.999672865749079e-05,
"loss": 1.6716,
"step": 4370
},
{
"epoch": 0.01810557035646892,
"grad_norm": 4.279793170082693,
"learning_rate": 1.9996695021081584e-05,
"loss": 1.632,
"step": 4380
},
{
"epoch": 0.018146907275090992,
"grad_norm": 4.624743597271427,
"learning_rate": 1.999666121265714e-05,
"loss": 1.6054,
"step": 4390
},
{
"epoch": 0.01818824419371307,
"grad_norm": 4.133320200289432,
"learning_rate": 1.9996627232218048e-05,
"loss": 1.6418,
"step": 4400
},
{
"epoch": 0.018229581112335143,
"grad_norm": 4.0963463496824986,
"learning_rate": 1.9996593079764884e-05,
"loss": 1.6683,
"step": 4410
},
{
"epoch": 0.01827091803095722,
"grad_norm": 4.03547359932741,
"learning_rate": 1.9996558755298238e-05,
"loss": 1.5996,
"step": 4420
},
{
"epoch": 0.018312254949579294,
"grad_norm": 4.156363997210419,
"learning_rate": 1.9996524258818706e-05,
"loss": 1.6471,
"step": 4430
},
{
"epoch": 0.018353591868201367,
"grad_norm": 4.075479637959615,
"learning_rate": 1.9996489590326874e-05,
"loss": 1.5989,
"step": 4440
},
{
"epoch": 0.018394928786823445,
"grad_norm": 4.63601174765512,
"learning_rate": 1.9996454749823345e-05,
"loss": 1.6642,
"step": 4450
},
{
"epoch": 0.018436265705445518,
"grad_norm": 3.760851338042477,
"learning_rate": 1.9996419737308715e-05,
"loss": 1.6579,
"step": 4460
},
{
"epoch": 0.018477602624067595,
"grad_norm": 3.979536768168784,
"learning_rate": 1.9996384552783588e-05,
"loss": 1.6006,
"step": 4470
},
{
"epoch": 0.01851893954268967,
"grad_norm": 4.246767902971398,
"learning_rate": 1.9996349196248563e-05,
"loss": 1.6715,
"step": 4480
},
{
"epoch": 0.018560276461311743,
"grad_norm": 4.26779353731614,
"learning_rate": 1.999631366770426e-05,
"loss": 1.6859,
"step": 4490
},
{
"epoch": 0.01860161337993382,
"grad_norm": 4.049582440808523,
"learning_rate": 1.9996277967151283e-05,
"loss": 1.6882,
"step": 4500
},
{
"epoch": 0.018642950298555894,
"grad_norm": 4.066185344313316,
"learning_rate": 1.9996242094590248e-05,
"loss": 1.6601,
"step": 4510
},
{
"epoch": 0.01868428721717797,
"grad_norm": 3.7309702600230494,
"learning_rate": 1.9996206050021768e-05,
"loss": 1.6453,
"step": 4520
},
{
"epoch": 0.018725624135800045,
"grad_norm": 4.307728435051617,
"learning_rate": 1.9996169833446473e-05,
"loss": 1.6728,
"step": 4530
},
{
"epoch": 0.01876696105442212,
"grad_norm": 3.892468749865279,
"learning_rate": 1.9996133444864974e-05,
"loss": 1.6996,
"step": 4540
},
{
"epoch": 0.018808297973044195,
"grad_norm": 4.172694653615993,
"learning_rate": 1.999609688427791e-05,
"loss": 1.6519,
"step": 4550
},
{
"epoch": 0.01884963489166627,
"grad_norm": 4.211392128772361,
"learning_rate": 1.9996060151685895e-05,
"loss": 1.6096,
"step": 4560
},
{
"epoch": 0.018890971810288346,
"grad_norm": 4.728429773380645,
"learning_rate": 1.9996023247089576e-05,
"loss": 1.6217,
"step": 4570
},
{
"epoch": 0.01893230872891042,
"grad_norm": 3.7603074265755745,
"learning_rate": 1.999598617048958e-05,
"loss": 1.617,
"step": 4580
},
{
"epoch": 0.018973645647532497,
"grad_norm": 4.5264911357846165,
"learning_rate": 1.9995948921886547e-05,
"loss": 1.6009,
"step": 4590
},
{
"epoch": 0.01901498256615457,
"grad_norm": 4.285402551531064,
"learning_rate": 1.999591150128112e-05,
"loss": 1.6666,
"step": 4600
},
{
"epoch": 0.019056319484776648,
"grad_norm": 4.528562163332608,
"learning_rate": 1.9995873908673936e-05,
"loss": 1.6967,
"step": 4610
},
{
"epoch": 0.01909765640339872,
"grad_norm": 4.331142545150304,
"learning_rate": 1.999583614406565e-05,
"loss": 1.6387,
"step": 4620
},
{
"epoch": 0.019138993322020795,
"grad_norm": 4.277497333006759,
"learning_rate": 1.9995798207456906e-05,
"loss": 1.6407,
"step": 4630
},
{
"epoch": 0.019180330240642873,
"grad_norm": 4.236531733677237,
"learning_rate": 1.999576009884836e-05,
"loss": 1.6528,
"step": 4640
},
{
"epoch": 0.019221667159264946,
"grad_norm": 4.1527404087837825,
"learning_rate": 1.9995721818240664e-05,
"loss": 1.6386,
"step": 4650
},
{
"epoch": 0.019263004077887023,
"grad_norm": 4.21734134516066,
"learning_rate": 1.999568336563448e-05,
"loss": 1.6531,
"step": 4660
},
{
"epoch": 0.019304340996509097,
"grad_norm": 4.010277949791672,
"learning_rate": 1.999564474103047e-05,
"loss": 1.6125,
"step": 4670
},
{
"epoch": 0.01934567791513117,
"grad_norm": 4.974363400314765,
"learning_rate": 1.99956059444293e-05,
"loss": 1.6562,
"step": 4680
},
{
"epoch": 0.019387014833753248,
"grad_norm": 3.461845715262989,
"learning_rate": 1.999556697583163e-05,
"loss": 1.6715,
"step": 4690
},
{
"epoch": 0.01942835175237532,
"grad_norm": 4.501289760535044,
"learning_rate": 1.999552783523814e-05,
"loss": 1.6276,
"step": 4700
},
{
"epoch": 0.0194696886709974,
"grad_norm": 3.980526992455661,
"learning_rate": 1.99954885226495e-05,
"loss": 1.6512,
"step": 4710
},
{
"epoch": 0.019511025589619473,
"grad_norm": 4.754361998561602,
"learning_rate": 1.9995449038066385e-05,
"loss": 1.6563,
"step": 4720
},
{
"epoch": 0.01955236250824155,
"grad_norm": 3.962924389993788,
"learning_rate": 1.9995409381489473e-05,
"loss": 1.5921,
"step": 4730
},
{
"epoch": 0.019593699426863623,
"grad_norm": 4.230038259640959,
"learning_rate": 1.999536955291945e-05,
"loss": 1.6266,
"step": 4740
},
{
"epoch": 0.019635036345485697,
"grad_norm": 3.4637303252434863,
"learning_rate": 1.9995329552356996e-05,
"loss": 1.5613,
"step": 4750
},
{
"epoch": 0.019676373264107774,
"grad_norm": 4.180047059414082,
"learning_rate": 1.999528937980281e-05,
"loss": 1.6358,
"step": 4760
},
{
"epoch": 0.019717710182729848,
"grad_norm": 4.407688478601427,
"learning_rate": 1.9995249035257572e-05,
"loss": 1.6276,
"step": 4770
},
{
"epoch": 0.019759047101351925,
"grad_norm": 5.682179019619738,
"learning_rate": 1.999520851872198e-05,
"loss": 1.6339,
"step": 4780
},
{
"epoch": 0.019800384019974,
"grad_norm": 5.80296950656401,
"learning_rate": 1.9995167830196732e-05,
"loss": 1.6735,
"step": 4790
},
{
"epoch": 0.019841720938596073,
"grad_norm": 4.788010741660107,
"learning_rate": 1.999512696968253e-05,
"loss": 1.6183,
"step": 4800
},
{
"epoch": 0.01988305785721815,
"grad_norm": 3.2823877198029683,
"learning_rate": 1.9995085937180075e-05,
"loss": 1.6314,
"step": 4810
},
{
"epoch": 0.019924394775840223,
"grad_norm": 4.513204723991569,
"learning_rate": 1.9995044732690074e-05,
"loss": 1.6558,
"step": 4820
},
{
"epoch": 0.0199657316944623,
"grad_norm": 3.710887033971277,
"learning_rate": 1.999500335621323e-05,
"loss": 1.6339,
"step": 4830
},
{
"epoch": 0.020007068613084374,
"grad_norm": 3.914180149814728,
"learning_rate": 1.9994961807750264e-05,
"loss": 1.6263,
"step": 4840
},
{
"epoch": 0.02004840553170645,
"grad_norm": 4.149254446951243,
"learning_rate": 1.999492008730189e-05,
"loss": 1.6276,
"step": 4850
},
{
"epoch": 0.020089742450328525,
"grad_norm": 3.8520876610172756,
"learning_rate": 1.9994878194868817e-05,
"loss": 1.6168,
"step": 4860
},
{
"epoch": 0.0201310793689506,
"grad_norm": 4.315135033151227,
"learning_rate": 1.9994836130451777e-05,
"loss": 1.6799,
"step": 4870
},
{
"epoch": 0.020172416287572676,
"grad_norm": 4.299172694880712,
"learning_rate": 1.9994793894051483e-05,
"loss": 1.6094,
"step": 4880
},
{
"epoch": 0.02021375320619475,
"grad_norm": 3.9099719074716974,
"learning_rate": 1.999475148566867e-05,
"loss": 1.6002,
"step": 4890
},
{
"epoch": 0.020255090124816827,
"grad_norm": 3.621204913700773,
"learning_rate": 1.9994708905304066e-05,
"loss": 1.627,
"step": 4900
},
{
"epoch": 0.0202964270434389,
"grad_norm": 4.002608239997497,
"learning_rate": 1.9994666152958403e-05,
"loss": 1.6377,
"step": 4910
},
{
"epoch": 0.020337763962060978,
"grad_norm": 3.509839578650558,
"learning_rate": 1.9994623228632413e-05,
"loss": 1.6498,
"step": 4920
},
{
"epoch": 0.02037910088068305,
"grad_norm": 3.948041169756955,
"learning_rate": 1.9994580132326843e-05,
"loss": 1.6605,
"step": 4930
},
{
"epoch": 0.020420437799305125,
"grad_norm": 3.7588684802290713,
"learning_rate": 1.9994536864042428e-05,
"loss": 1.6845,
"step": 4940
},
{
"epoch": 0.020461774717927202,
"grad_norm": 4.867688920782023,
"learning_rate": 1.999449342377991e-05,
"loss": 1.5575,
"step": 4950
},
{
"epoch": 0.020503111636549276,
"grad_norm": 4.235921275935457,
"learning_rate": 1.9994449811540044e-05,
"loss": 1.6329,
"step": 4960
},
{
"epoch": 0.020544448555171353,
"grad_norm": 5.353004787701509,
"learning_rate": 1.9994406027323578e-05,
"loss": 1.5961,
"step": 4970
},
{
"epoch": 0.020585785473793427,
"grad_norm": 4.49092979482084,
"learning_rate": 1.999436207113126e-05,
"loss": 1.6152,
"step": 4980
},
{
"epoch": 0.0206271223924155,
"grad_norm": 4.786632872232947,
"learning_rate": 1.9994317942963856e-05,
"loss": 1.5889,
"step": 4990
},
{
"epoch": 0.020668459311037578,
"grad_norm": 3.7616100197105324,
"learning_rate": 1.999427364282212e-05,
"loss": 1.6428,
"step": 5000
},
{
"epoch": 0.02070979622965965,
"grad_norm": 4.922026489251745,
"learning_rate": 1.999422917070681e-05,
"loss": 1.6404,
"step": 5010
},
{
"epoch": 0.02075113314828173,
"grad_norm": 4.51143708824428,
"learning_rate": 1.9994184526618698e-05,
"loss": 1.6532,
"step": 5020
},
{
"epoch": 0.020792470066903802,
"grad_norm": 4.104589032058005,
"learning_rate": 1.999413971055855e-05,
"loss": 1.6071,
"step": 5030
},
{
"epoch": 0.02083380698552588,
"grad_norm": 4.89262784656072,
"learning_rate": 1.999409472252714e-05,
"loss": 1.6516,
"step": 5040
},
{
"epoch": 0.020875143904147953,
"grad_norm": 3.6347037714340122,
"learning_rate": 1.9994049562525235e-05,
"loss": 1.5681,
"step": 5050
},
{
"epoch": 0.020916480822770027,
"grad_norm": 3.986687295644655,
"learning_rate": 1.9994004230553616e-05,
"loss": 1.6061,
"step": 5060
},
{
"epoch": 0.020957817741392104,
"grad_norm": 5.1196884550128825,
"learning_rate": 1.999395872661307e-05,
"loss": 1.646,
"step": 5070
},
{
"epoch": 0.020999154660014178,
"grad_norm": 4.073313251564883,
"learning_rate": 1.9993913050704362e-05,
"loss": 1.5632,
"step": 5080
},
{
"epoch": 0.021040491578636255,
"grad_norm": 3.773829349198683,
"learning_rate": 1.99938672028283e-05,
"loss": 1.596,
"step": 5090
},
{
"epoch": 0.02108182849725833,
"grad_norm": 5.707286361857388,
"learning_rate": 1.9993821182985655e-05,
"loss": 1.587,
"step": 5100
},
{
"epoch": 0.021123165415880402,
"grad_norm": 4.135913165404502,
"learning_rate": 1.9993774991177227e-05,
"loss": 1.6229,
"step": 5110
},
{
"epoch": 0.02116450233450248,
"grad_norm": 4.538213401615244,
"learning_rate": 1.9993728627403814e-05,
"loss": 1.5913,
"step": 5120
},
{
"epoch": 0.021205839253124553,
"grad_norm": 4.103580788767663,
"learning_rate": 1.9993682091666206e-05,
"loss": 1.6532,
"step": 5130
},
{
"epoch": 0.02124717617174663,
"grad_norm": 3.6711472807654064,
"learning_rate": 1.9993635383965205e-05,
"loss": 1.5746,
"step": 5140
},
{
"epoch": 0.021288513090368704,
"grad_norm": 5.277279072305559,
"learning_rate": 1.9993588504301623e-05,
"loss": 1.597,
"step": 5150
},
{
"epoch": 0.02132985000899078,
"grad_norm": 3.646653216373581,
"learning_rate": 1.9993541452676257e-05,
"loss": 1.6045,
"step": 5160
},
{
"epoch": 0.021371186927612855,
"grad_norm": 4.454553625669168,
"learning_rate": 1.999349422908992e-05,
"loss": 1.6168,
"step": 5170
},
{
"epoch": 0.02141252384623493,
"grad_norm": 4.408940295701244,
"learning_rate": 1.999344683354343e-05,
"loss": 1.5688,
"step": 5180
},
{
"epoch": 0.021453860764857006,
"grad_norm": 4.30626191840598,
"learning_rate": 1.9993399266037593e-05,
"loss": 1.5743,
"step": 5190
},
{
"epoch": 0.02149519768347908,
"grad_norm": 3.674456985901954,
"learning_rate": 1.999335152657323e-05,
"loss": 1.5872,
"step": 5200
},
{
"epoch": 0.021536534602101157,
"grad_norm": 3.641790233464658,
"learning_rate": 1.9993303615151168e-05,
"loss": 1.5612,
"step": 5210
},
{
"epoch": 0.02157787152072323,
"grad_norm": 4.165728119210956,
"learning_rate": 1.9993255531772225e-05,
"loss": 1.59,
"step": 5220
},
{
"epoch": 0.021619208439345308,
"grad_norm": 3.8319777859342246,
"learning_rate": 1.9993207276437235e-05,
"loss": 1.5912,
"step": 5230
},
{
"epoch": 0.02166054535796738,
"grad_norm": 3.9855756729463168,
"learning_rate": 1.999315884914702e-05,
"loss": 1.58,
"step": 5240
},
{
"epoch": 0.021701882276589455,
"grad_norm": 3.8011477722676807,
"learning_rate": 1.999311024990242e-05,
"loss": 1.6003,
"step": 5250
},
{
"epoch": 0.021743219195211532,
"grad_norm": 3.985198206647649,
"learning_rate": 1.9993061478704275e-05,
"loss": 1.5986,
"step": 5260
},
{
"epoch": 0.021784556113833606,
"grad_norm": 3.9838081605823636,
"learning_rate": 1.9993012535553412e-05,
"loss": 1.6166,
"step": 5270
},
{
"epoch": 0.021825893032455683,
"grad_norm": 3.9996617755784043,
"learning_rate": 1.999296342045068e-05,
"loss": 1.5792,
"step": 5280
},
{
"epoch": 0.021867229951077757,
"grad_norm": 5.892962480457768,
"learning_rate": 1.9992914133396926e-05,
"loss": 1.6053,
"step": 5290
},
{
"epoch": 0.02190856686969983,
"grad_norm": 4.427789486632826,
"learning_rate": 1.9992864674392994e-05,
"loss": 1.6374,
"step": 5300
},
{
"epoch": 0.021949903788321908,
"grad_norm": 4.488482688049845,
"learning_rate": 1.9992815043439736e-05,
"loss": 1.6198,
"step": 5310
},
{
"epoch": 0.02199124070694398,
"grad_norm": 3.9697984164903035,
"learning_rate": 1.999276524053801e-05,
"loss": 1.6112,
"step": 5320
},
{
"epoch": 0.02203257762556606,
"grad_norm": 4.708237856178089,
"learning_rate": 1.9992715265688666e-05,
"loss": 1.569,
"step": 5330
},
{
"epoch": 0.022073914544188132,
"grad_norm": 4.180089792931872,
"learning_rate": 1.999266511889257e-05,
"loss": 1.564,
"step": 5340
},
{
"epoch": 0.02211525146281021,
"grad_norm": 4.540705844431402,
"learning_rate": 1.9992614800150582e-05,
"loss": 1.6062,
"step": 5350
},
{
"epoch": 0.022156588381432283,
"grad_norm": 3.6164199548569256,
"learning_rate": 1.999256430946357e-05,
"loss": 1.614,
"step": 5360
},
{
"epoch": 0.022197925300054357,
"grad_norm": 3.815681996528154,
"learning_rate": 1.9992513646832398e-05,
"loss": 1.5836,
"step": 5370
},
{
"epoch": 0.022239262218676434,
"grad_norm": 4.806439757203068,
"learning_rate": 1.9992462812257943e-05,
"loss": 1.6162,
"step": 5380
},
{
"epoch": 0.022280599137298508,
"grad_norm": 4.354139965343947,
"learning_rate": 1.999241180574108e-05,
"loss": 1.5888,
"step": 5390
},
{
"epoch": 0.022321936055920585,
"grad_norm": 4.126817858976234,
"learning_rate": 1.999236062728268e-05,
"loss": 1.5879,
"step": 5400
},
{
"epoch": 0.02236327297454266,
"grad_norm": 4.47607737943672,
"learning_rate": 1.9992309276883632e-05,
"loss": 1.6099,
"step": 5410
},
{
"epoch": 0.022404609893164732,
"grad_norm": 5.610066619695038,
"learning_rate": 1.9992257754544814e-05,
"loss": 1.593,
"step": 5420
},
{
"epoch": 0.02244594681178681,
"grad_norm": 4.2928973652861675,
"learning_rate": 1.9992206060267114e-05,
"loss": 1.5793,
"step": 5430
},
{
"epoch": 0.022487283730408883,
"grad_norm": 3.8921859700664325,
"learning_rate": 1.9992154194051422e-05,
"loss": 1.608,
"step": 5440
},
{
"epoch": 0.02252862064903096,
"grad_norm": 3.677731550454947,
"learning_rate": 1.999210215589863e-05,
"loss": 1.6151,
"step": 5450
},
{
"epoch": 0.022569957567653034,
"grad_norm": 4.200629201423265,
"learning_rate": 1.9992049945809632e-05,
"loss": 1.6246,
"step": 5460
},
{
"epoch": 0.02261129448627511,
"grad_norm": 4.064480908765512,
"learning_rate": 1.9991997563785332e-05,
"loss": 1.5607,
"step": 5470
},
{
"epoch": 0.022652631404897185,
"grad_norm": 3.5486537855524176,
"learning_rate": 1.9991945009826623e-05,
"loss": 1.5906,
"step": 5480
},
{
"epoch": 0.02269396832351926,
"grad_norm": 4.0698465101707,
"learning_rate": 1.9991892283934415e-05,
"loss": 1.5864,
"step": 5490
},
{
"epoch": 0.022735305242141336,
"grad_norm": 3.698399389749536,
"learning_rate": 1.9991839386109615e-05,
"loss": 1.593,
"step": 5500
},
{
"epoch": 0.02277664216076341,
"grad_norm": 4.854255782396672,
"learning_rate": 1.9991786316353134e-05,
"loss": 1.5961,
"step": 5510
},
{
"epoch": 0.022817979079385486,
"grad_norm": 3.5841353274799244,
"learning_rate": 1.9991733074665884e-05,
"loss": 1.5638,
"step": 5520
},
{
"epoch": 0.02285931599800756,
"grad_norm": 4.188646894537988,
"learning_rate": 1.9991679661048774e-05,
"loss": 1.5605,
"step": 5530
},
{
"epoch": 0.022900652916629637,
"grad_norm": 3.646293980881599,
"learning_rate": 1.9991626075502736e-05,
"loss": 1.5672,
"step": 5540
},
{
"epoch": 0.02294198983525171,
"grad_norm": 3.513345408175718,
"learning_rate": 1.999157231802868e-05,
"loss": 1.5228,
"step": 5550
},
{
"epoch": 0.022983326753873785,
"grad_norm": 4.22409759900443,
"learning_rate": 1.999151838862754e-05,
"loss": 1.5742,
"step": 5560
},
{
"epoch": 0.023024663672495862,
"grad_norm": 3.9606510772786674,
"learning_rate": 1.999146428730024e-05,
"loss": 1.5898,
"step": 5570
},
{
"epoch": 0.023066000591117936,
"grad_norm": 4.723833314885466,
"learning_rate": 1.9991410014047713e-05,
"loss": 1.6293,
"step": 5580
},
{
"epoch": 0.023107337509740013,
"grad_norm": 3.79738622812003,
"learning_rate": 1.999135556887089e-05,
"loss": 1.5347,
"step": 5590
},
{
"epoch": 0.023148674428362086,
"grad_norm": 3.5876021705924277,
"learning_rate": 1.9991300951770712e-05,
"loss": 1.5639,
"step": 5600
},
{
"epoch": 0.02319001134698416,
"grad_norm": 4.466727344043237,
"learning_rate": 1.9991246162748116e-05,
"loss": 1.5821,
"step": 5610
},
{
"epoch": 0.023231348265606237,
"grad_norm": 4.027485882579859,
"learning_rate": 1.999119120180404e-05,
"loss": 1.5641,
"step": 5620
},
{
"epoch": 0.02327268518422831,
"grad_norm": 4.698907728867797,
"learning_rate": 1.9991136068939436e-05,
"loss": 1.5717,
"step": 5630
},
{
"epoch": 0.023314022102850388,
"grad_norm": 3.9675562129009534,
"learning_rate": 1.9991080764155254e-05,
"loss": 1.5984,
"step": 5640
},
{
"epoch": 0.023355359021472462,
"grad_norm": 4.469330328433558,
"learning_rate": 1.9991025287452442e-05,
"loss": 1.5836,
"step": 5650
},
{
"epoch": 0.02339669594009454,
"grad_norm": 4.315359559691392,
"learning_rate": 1.9990969638831955e-05,
"loss": 1.5456,
"step": 5660
},
{
"epoch": 0.023438032858716613,
"grad_norm": 3.67958327218992,
"learning_rate": 1.9990913818294753e-05,
"loss": 1.6191,
"step": 5670
},
{
"epoch": 0.023479369777338686,
"grad_norm": 4.491956126894857,
"learning_rate": 1.9990857825841793e-05,
"loss": 1.5808,
"step": 5680
},
{
"epoch": 0.023520706695960764,
"grad_norm": 3.79674314266457,
"learning_rate": 1.999080166147404e-05,
"loss": 1.5183,
"step": 5690
},
{
"epoch": 0.023562043614582837,
"grad_norm": 4.5968252548890245,
"learning_rate": 1.999074532519246e-05,
"loss": 1.5757,
"step": 5700
},
{
"epoch": 0.023603380533204914,
"grad_norm": 3.5990834672231284,
"learning_rate": 1.9990688816998025e-05,
"loss": 1.6086,
"step": 5710
},
{
"epoch": 0.023644717451826988,
"grad_norm": 4.344410017466151,
"learning_rate": 1.99906321368917e-05,
"loss": 1.6113,
"step": 5720
},
{
"epoch": 0.023686054370449065,
"grad_norm": 3.7938891603257603,
"learning_rate": 1.9990575284874473e-05,
"loss": 1.6365,
"step": 5730
},
{
"epoch": 0.02372739128907114,
"grad_norm": 3.562057149121525,
"learning_rate": 1.999051826094731e-05,
"loss": 1.5485,
"step": 5740
},
{
"epoch": 0.023768728207693213,
"grad_norm": 4.081479989742111,
"learning_rate": 1.99904610651112e-05,
"loss": 1.5689,
"step": 5750
},
{
"epoch": 0.02381006512631529,
"grad_norm": 3.759485760858795,
"learning_rate": 1.999040369736712e-05,
"loss": 1.564,
"step": 5760
},
{
"epoch": 0.023851402044937364,
"grad_norm": 4.032363621919849,
"learning_rate": 1.9990346157716064e-05,
"loss": 1.6025,
"step": 5770
},
{
"epoch": 0.02389273896355944,
"grad_norm": 3.6432323322843403,
"learning_rate": 1.999028844615902e-05,
"loss": 1.5271,
"step": 5780
},
{
"epoch": 0.023934075882181514,
"grad_norm": 3.802770545609017,
"learning_rate": 1.9990230562696983e-05,
"loss": 1.5967,
"step": 5790
},
{
"epoch": 0.023975412800803588,
"grad_norm": 3.795072573463222,
"learning_rate": 1.9990172507330943e-05,
"loss": 1.5247,
"step": 5800
},
{
"epoch": 0.024016749719425665,
"grad_norm": 4.366382080210575,
"learning_rate": 1.99901142800619e-05,
"loss": 1.5781,
"step": 5810
},
{
"epoch": 0.02405808663804774,
"grad_norm": 3.9097914526353605,
"learning_rate": 1.9990055880890864e-05,
"loss": 1.6034,
"step": 5820
},
{
"epoch": 0.024099423556669816,
"grad_norm": 4.123926255872013,
"learning_rate": 1.9989997309818833e-05,
"loss": 1.5464,
"step": 5830
},
{
"epoch": 0.02414076047529189,
"grad_norm": 4.446493191993532,
"learning_rate": 1.9989938566846812e-05,
"loss": 1.5586,
"step": 5840
},
{
"epoch": 0.024182097393913967,
"grad_norm": 3.7337639849714233,
"learning_rate": 1.998987965197582e-05,
"loss": 1.5479,
"step": 5850
},
{
"epoch": 0.02422343431253604,
"grad_norm": 4.7444952313768525,
"learning_rate": 1.9989820565206865e-05,
"loss": 1.5808,
"step": 5860
},
{
"epoch": 0.024264771231158114,
"grad_norm": 4.247725775065283,
"learning_rate": 1.9989761306540966e-05,
"loss": 1.523,
"step": 5870
},
{
"epoch": 0.02430610814978019,
"grad_norm": 3.995186643530754,
"learning_rate": 1.998970187597914e-05,
"loss": 1.5785,
"step": 5880
},
{
"epoch": 0.024347445068402265,
"grad_norm": 4.816092056889684,
"learning_rate": 1.9989642273522416e-05,
"loss": 1.5746,
"step": 5890
},
{
"epoch": 0.024388781987024342,
"grad_norm": 4.290367502884436,
"learning_rate": 1.9989582499171813e-05,
"loss": 1.6119,
"step": 5900
},
{
"epoch": 0.024430118905646416,
"grad_norm": 3.5513668937922236,
"learning_rate": 1.9989522552928365e-05,
"loss": 1.5162,
"step": 5910
},
{
"epoch": 0.02447145582426849,
"grad_norm": 3.6198772954827665,
"learning_rate": 1.9989462434793096e-05,
"loss": 1.5323,
"step": 5920
},
{
"epoch": 0.024512792742890567,
"grad_norm": 3.852832040089439,
"learning_rate": 1.9989402144767046e-05,
"loss": 1.5311,
"step": 5930
},
{
"epoch": 0.02455412966151264,
"grad_norm": 3.9519174433535325,
"learning_rate": 1.9989341682851254e-05,
"loss": 1.5429,
"step": 5940
},
{
"epoch": 0.024595466580134718,
"grad_norm": 4.87353052847372,
"learning_rate": 1.9989281049046755e-05,
"loss": 1.6002,
"step": 5950
},
{
"epoch": 0.02463680349875679,
"grad_norm": 3.3803857370087225,
"learning_rate": 1.9989220243354595e-05,
"loss": 1.5793,
"step": 5960
},
{
"epoch": 0.02467814041737887,
"grad_norm": 3.8766963938819075,
"learning_rate": 1.998915926577582e-05,
"loss": 1.533,
"step": 5970
},
{
"epoch": 0.024719477336000942,
"grad_norm": 4.09044180490663,
"learning_rate": 1.998909811631148e-05,
"loss": 1.565,
"step": 5980
},
{
"epoch": 0.024760814254623016,
"grad_norm": 4.537575506546124,
"learning_rate": 1.998903679496263e-05,
"loss": 1.5523,
"step": 5990
},
{
"epoch": 0.024802151173245093,
"grad_norm": 3.4086201638465803,
"learning_rate": 1.9988975301730317e-05,
"loss": 1.5467,
"step": 6000
},
{
"epoch": 0.024843488091867167,
"grad_norm": 3.7512592579174244,
"learning_rate": 1.9988913636615608e-05,
"loss": 1.6148,
"step": 6010
},
{
"epoch": 0.024884825010489244,
"grad_norm": 3.9606857815229035,
"learning_rate": 1.9988851799619557e-05,
"loss": 1.5529,
"step": 6020
},
{
"epoch": 0.024926161929111318,
"grad_norm": 4.87271131926588,
"learning_rate": 1.9988789790743235e-05,
"loss": 1.624,
"step": 6030
},
{
"epoch": 0.024967498847733395,
"grad_norm": 4.562111872082575,
"learning_rate": 1.9988727609987705e-05,
"loss": 1.5954,
"step": 6040
},
{
"epoch": 0.02500883576635547,
"grad_norm": 4.160920766227917,
"learning_rate": 1.9988665257354035e-05,
"loss": 1.5745,
"step": 6050
},
{
"epoch": 0.025050172684977542,
"grad_norm": 3.7976329240225284,
"learning_rate": 1.9988602732843296e-05,
"loss": 1.539,
"step": 6060
},
{
"epoch": 0.02509150960359962,
"grad_norm": 3.9348324977710347,
"learning_rate": 1.9988540036456575e-05,
"loss": 1.5802,
"step": 6070
},
{
"epoch": 0.025132846522221693,
"grad_norm": 3.3649859713246313,
"learning_rate": 1.998847716819494e-05,
"loss": 1.5349,
"step": 6080
},
{
"epoch": 0.02517418344084377,
"grad_norm": 5.035730829505278,
"learning_rate": 1.998841412805948e-05,
"loss": 1.5522,
"step": 6090
},
{
"epoch": 0.025215520359465844,
"grad_norm": 4.38089533529463,
"learning_rate": 1.9988350916051272e-05,
"loss": 1.5696,
"step": 6100
},
{
"epoch": 0.025256857278087918,
"grad_norm": 4.0458062619048185,
"learning_rate": 1.9988287532171408e-05,
"loss": 1.582,
"step": 6110
},
{
"epoch": 0.025298194196709995,
"grad_norm": 5.197316936196237,
"learning_rate": 1.9988223976420983e-05,
"loss": 1.5685,
"step": 6120
},
{
"epoch": 0.02533953111533207,
"grad_norm": 3.701848060366763,
"learning_rate": 1.998816024880108e-05,
"loss": 1.568,
"step": 6130
},
{
"epoch": 0.025380868033954146,
"grad_norm": 4.576812131388496,
"learning_rate": 1.9988096349312808e-05,
"loss": 1.5925,
"step": 6140
},
{
"epoch": 0.02542220495257622,
"grad_norm": 3.626416937979281,
"learning_rate": 1.998803227795726e-05,
"loss": 1.6026,
"step": 6150
},
{
"epoch": 0.025463541871198297,
"grad_norm": 3.7415000301009016,
"learning_rate": 1.9987968034735535e-05,
"loss": 1.5632,
"step": 6160
},
{
"epoch": 0.02550487878982037,
"grad_norm": 4.093809033114078,
"learning_rate": 1.9987903619648745e-05,
"loss": 1.5442,
"step": 6170
},
{
"epoch": 0.025546215708442444,
"grad_norm": 3.782350165490308,
"learning_rate": 1.9987839032697995e-05,
"loss": 1.5423,
"step": 6180
},
{
"epoch": 0.02558755262706452,
"grad_norm": 3.1897173529667695,
"learning_rate": 1.9987774273884398e-05,
"loss": 1.5332,
"step": 6190
},
{
"epoch": 0.025628889545686595,
"grad_norm": 3.9224918301369276,
"learning_rate": 1.9987709343209066e-05,
"loss": 1.5133,
"step": 6200
},
{
"epoch": 0.025670226464308672,
"grad_norm": 3.830850927059349,
"learning_rate": 1.9987644240673118e-05,
"loss": 1.555,
"step": 6210
},
{
"epoch": 0.025711563382930746,
"grad_norm": 4.0209145103807575,
"learning_rate": 1.9987578966277678e-05,
"loss": 1.5114,
"step": 6220
},
{
"epoch": 0.02575290030155282,
"grad_norm": 3.936778500441379,
"learning_rate": 1.998751352002386e-05,
"loss": 1.5257,
"step": 6230
},
{
"epoch": 0.025794237220174897,
"grad_norm": 3.7293623909553313,
"learning_rate": 1.9987447901912794e-05,
"loss": 1.5694,
"step": 6240
},
{
"epoch": 0.02583557413879697,
"grad_norm": 3.5818490617695575,
"learning_rate": 1.9987382111945614e-05,
"loss": 1.5531,
"step": 6250
},
{
"epoch": 0.025876911057419048,
"grad_norm": 3.6430367540575928,
"learning_rate": 1.998731615012345e-05,
"loss": 1.5434,
"step": 6260
},
{
"epoch": 0.02591824797604112,
"grad_norm": 4.589782090699277,
"learning_rate": 1.998725001644743e-05,
"loss": 1.557,
"step": 6270
},
{
"epoch": 0.0259595848946632,
"grad_norm": 4.283265289643292,
"learning_rate": 1.99871837109187e-05,
"loss": 1.5624,
"step": 6280
},
{
"epoch": 0.026000921813285272,
"grad_norm": 4.946779268912546,
"learning_rate": 1.99871172335384e-05,
"loss": 1.5124,
"step": 6290
},
{
"epoch": 0.026042258731907346,
"grad_norm": 3.5038967204373694,
"learning_rate": 1.998705058430767e-05,
"loss": 1.5683,
"step": 6300
},
{
"epoch": 0.026083595650529423,
"grad_norm": 3.8770191196485886,
"learning_rate": 1.998698376322766e-05,
"loss": 1.5435,
"step": 6310
},
{
"epoch": 0.026124932569151497,
"grad_norm": 4.439160488934939,
"learning_rate": 1.998691677029952e-05,
"loss": 1.5295,
"step": 6320
},
{
"epoch": 0.026166269487773574,
"grad_norm": 4.092904098781107,
"learning_rate": 1.99868496055244e-05,
"loss": 1.55,
"step": 6330
},
{
"epoch": 0.026207606406395648,
"grad_norm": 4.303539009198583,
"learning_rate": 1.9986782268903457e-05,
"loss": 1.5484,
"step": 6340
},
{
"epoch": 0.026248943325017725,
"grad_norm": 3.9078194955949916,
"learning_rate": 1.9986714760437853e-05,
"loss": 1.5827,
"step": 6350
},
{
"epoch": 0.0262902802436398,
"grad_norm": 4.342380780259694,
"learning_rate": 1.9986647080128746e-05,
"loss": 1.557,
"step": 6360
},
{
"epoch": 0.026331617162261872,
"grad_norm": 3.985596918279314,
"learning_rate": 1.99865792279773e-05,
"loss": 1.578,
"step": 6370
},
{
"epoch": 0.02637295408088395,
"grad_norm": 3.653720676962278,
"learning_rate": 1.9986511203984683e-05,
"loss": 1.5668,
"step": 6380
},
{
"epoch": 0.026414290999506023,
"grad_norm": 3.751487721843964,
"learning_rate": 1.998644300815207e-05,
"loss": 1.5305,
"step": 6390
},
{
"epoch": 0.0264556279181281,
"grad_norm": 4.009883655861525,
"learning_rate": 1.9986374640480627e-05,
"loss": 1.5495,
"step": 6400
},
{
"epoch": 0.026496964836750174,
"grad_norm": 3.8266120037819396,
"learning_rate": 1.9986306100971533e-05,
"loss": 1.5255,
"step": 6410
},
{
"epoch": 0.026538301755372248,
"grad_norm": 3.6427176903376384,
"learning_rate": 1.9986237389625974e-05,
"loss": 1.5525,
"step": 6420
},
{
"epoch": 0.026579638673994325,
"grad_norm": 4.099974786255079,
"learning_rate": 1.998616850644512e-05,
"loss": 1.5424,
"step": 6430
},
{
"epoch": 0.0266209755926164,
"grad_norm": 3.8198383445190793,
"learning_rate": 1.998609945143017e-05,
"loss": 1.567,
"step": 6440
},
{
"epoch": 0.026662312511238476,
"grad_norm": 4.473728987789235,
"learning_rate": 1.9986030224582302e-05,
"loss": 1.4823,
"step": 6450
},
{
"epoch": 0.02670364942986055,
"grad_norm": 4.534257895704366,
"learning_rate": 1.998596082590271e-05,
"loss": 1.5712,
"step": 6460
},
{
"epoch": 0.026744986348482627,
"grad_norm": 3.644766820996961,
"learning_rate": 1.998589125539259e-05,
"loss": 1.4863,
"step": 6470
},
{
"epoch": 0.0267863232671047,
"grad_norm": 3.5999094373664526,
"learning_rate": 1.9985821513053137e-05,
"loss": 1.5326,
"step": 6480
},
{
"epoch": 0.026827660185726774,
"grad_norm": 4.1425167904001565,
"learning_rate": 1.9985751598885552e-05,
"loss": 1.5378,
"step": 6490
},
{
"epoch": 0.02686899710434885,
"grad_norm": 4.493755521920297,
"learning_rate": 1.998568151289104e-05,
"loss": 1.559,
"step": 6500
},
{
"epoch": 0.026910334022970925,
"grad_norm": 3.8756107746153896,
"learning_rate": 1.9985611255070806e-05,
"loss": 1.556,
"step": 6510
},
{
"epoch": 0.026951670941593002,
"grad_norm": 4.038084156630909,
"learning_rate": 1.9985540825426055e-05,
"loss": 1.5645,
"step": 6520
},
{
"epoch": 0.026993007860215076,
"grad_norm": 3.5809330138239392,
"learning_rate": 1.9985470223958e-05,
"loss": 1.5548,
"step": 6530
},
{
"epoch": 0.027034344778837153,
"grad_norm": 3.8676960332502963,
"learning_rate": 1.998539945066786e-05,
"loss": 1.5276,
"step": 6540
},
{
"epoch": 0.027075681697459227,
"grad_norm": 3.4449297103173593,
"learning_rate": 1.9985328505556852e-05,
"loss": 1.5651,
"step": 6550
},
{
"epoch": 0.0271170186160813,
"grad_norm": 4.288962654330007,
"learning_rate": 1.9985257388626196e-05,
"loss": 1.4996,
"step": 6560
},
{
"epoch": 0.027158355534703377,
"grad_norm": 4.128501380292175,
"learning_rate": 1.9985186099877112e-05,
"loss": 1.5419,
"step": 6570
},
{
"epoch": 0.02719969245332545,
"grad_norm": 4.575002926290647,
"learning_rate": 1.998511463931083e-05,
"loss": 1.5515,
"step": 6580
},
{
"epoch": 0.02724102937194753,
"grad_norm": 3.821068699345836,
"learning_rate": 1.998504300692858e-05,
"loss": 1.5233,
"step": 6590
},
{
"epoch": 0.027282366290569602,
"grad_norm": 3.2424897263365016,
"learning_rate": 1.9984971202731596e-05,
"loss": 1.5479,
"step": 6600
},
{
"epoch": 0.027323703209191676,
"grad_norm": 4.834170577555974,
"learning_rate": 1.9984899226721107e-05,
"loss": 1.5502,
"step": 6610
},
{
"epoch": 0.027365040127813753,
"grad_norm": 3.764169873278093,
"learning_rate": 1.998482707889836e-05,
"loss": 1.5891,
"step": 6620
},
{
"epoch": 0.027406377046435827,
"grad_norm": 3.414782018354158,
"learning_rate": 1.998475475926459e-05,
"loss": 1.5159,
"step": 6630
},
{
"epoch": 0.027447713965057904,
"grad_norm": 4.666184759190313,
"learning_rate": 1.9984682267821046e-05,
"loss": 1.5628,
"step": 6640
},
{
"epoch": 0.027489050883679977,
"grad_norm": 3.5574578914802943,
"learning_rate": 1.998460960456897e-05,
"loss": 1.5315,
"step": 6650
},
{
"epoch": 0.027530387802302055,
"grad_norm": 4.817382615755327,
"learning_rate": 1.9984536769509615e-05,
"loss": 1.5081,
"step": 6660
},
{
"epoch": 0.02757172472092413,
"grad_norm": 4.458867619168575,
"learning_rate": 1.998446376264424e-05,
"loss": 1.5099,
"step": 6670
},
{
"epoch": 0.027613061639546202,
"grad_norm": 4.929644851290023,
"learning_rate": 1.9984390583974093e-05,
"loss": 1.5122,
"step": 6680
},
{
"epoch": 0.02765439855816828,
"grad_norm": 4.625479741961043,
"learning_rate": 1.9984317233500435e-05,
"loss": 1.5516,
"step": 6690
},
{
"epoch": 0.027695735476790353,
"grad_norm": 4.116997057727521,
"learning_rate": 1.9984243711224535e-05,
"loss": 1.5376,
"step": 6700
},
{
"epoch": 0.02773707239541243,
"grad_norm": 3.5829728345047314,
"learning_rate": 1.998417001714765e-05,
"loss": 1.5175,
"step": 6710
},
{
"epoch": 0.027778409314034504,
"grad_norm": 3.9734979789101996,
"learning_rate": 1.9984096151271048e-05,
"loss": 1.4871,
"step": 6720
},
{
"epoch": 0.027819746232656577,
"grad_norm": 3.430905345503222,
"learning_rate": 1.9984022113596003e-05,
"loss": 1.5413,
"step": 6730
},
{
"epoch": 0.027861083151278655,
"grad_norm": 4.2373775185116465,
"learning_rate": 1.998394790412379e-05,
"loss": 1.508,
"step": 6740
},
{
"epoch": 0.027902420069900728,
"grad_norm": 3.973156898260741,
"learning_rate": 1.9983873522855684e-05,
"loss": 1.5283,
"step": 6750
},
{
"epoch": 0.027943756988522805,
"grad_norm": 3.714285928818181,
"learning_rate": 1.9983798969792966e-05,
"loss": 1.5362,
"step": 6760
},
{
"epoch": 0.02798509390714488,
"grad_norm": 4.251009623472971,
"learning_rate": 1.9983724244936916e-05,
"loss": 1.5282,
"step": 6770
},
{
"epoch": 0.028026430825766956,
"grad_norm": 3.886447728722872,
"learning_rate": 1.9983649348288825e-05,
"loss": 1.5719,
"step": 6780
},
{
"epoch": 0.02806776774438903,
"grad_norm": 3.5572049346515757,
"learning_rate": 1.9983574279849977e-05,
"loss": 1.5302,
"step": 6790
},
{
"epoch": 0.028109104663011104,
"grad_norm": 3.688224659646708,
"learning_rate": 1.9983499039621667e-05,
"loss": 1.5132,
"step": 6800
},
{
"epoch": 0.02815044158163318,
"grad_norm": 3.986436014630721,
"learning_rate": 1.998342362760519e-05,
"loss": 1.5045,
"step": 6810
},
{
"epoch": 0.028191778500255255,
"grad_norm": 4.042321521286428,
"learning_rate": 1.998334804380184e-05,
"loss": 1.52,
"step": 6820
},
{
"epoch": 0.02823311541887733,
"grad_norm": 3.936708692378881,
"learning_rate": 1.9983272288212917e-05,
"loss": 1.5208,
"step": 6830
},
{
"epoch": 0.028274452337499405,
"grad_norm": 4.692602969518199,
"learning_rate": 1.998319636083973e-05,
"loss": 1.5667,
"step": 6840
},
{
"epoch": 0.028315789256121483,
"grad_norm": 3.9313878236010598,
"learning_rate": 1.9983120261683582e-05,
"loss": 1.5831,
"step": 6850
},
{
"epoch": 0.028357126174743556,
"grad_norm": 3.551615629888668,
"learning_rate": 1.9983043990745784e-05,
"loss": 1.5308,
"step": 6860
},
{
"epoch": 0.02839846309336563,
"grad_norm": 4.6846872186437905,
"learning_rate": 1.9982967548027645e-05,
"loss": 1.4921,
"step": 6870
},
{
"epoch": 0.028439800011987707,
"grad_norm": 4.130277420309701,
"learning_rate": 1.9982890933530482e-05,
"loss": 1.4943,
"step": 6880
},
{
"epoch": 0.02848113693060978,
"grad_norm": 4.84212625045545,
"learning_rate": 1.9982814147255617e-05,
"loss": 1.5353,
"step": 6890
},
{
"epoch": 0.028522473849231858,
"grad_norm": 4.484005829649726,
"learning_rate": 1.9982737189204367e-05,
"loss": 1.5051,
"step": 6900
},
{
"epoch": 0.02856381076785393,
"grad_norm": 4.60797848842926,
"learning_rate": 1.998266005937806e-05,
"loss": 1.5816,
"step": 6910
},
{
"epoch": 0.028605147686476005,
"grad_norm": 4.6416638061065685,
"learning_rate": 1.998258275777802e-05,
"loss": 1.4904,
"step": 6920
},
{
"epoch": 0.028646484605098083,
"grad_norm": 3.477343383643086,
"learning_rate": 1.9982505284405574e-05,
"loss": 1.4904,
"step": 6930
},
{
"epoch": 0.028687821523720156,
"grad_norm": 3.6088868644511427,
"learning_rate": 1.9982427639262065e-05,
"loss": 1.5314,
"step": 6940
},
{
"epoch": 0.028729158442342233,
"grad_norm": 3.5925096362200333,
"learning_rate": 1.9982349822348816e-05,
"loss": 1.5714,
"step": 6950
},
{
"epoch": 0.028770495360964307,
"grad_norm": 3.2379164637124855,
"learning_rate": 1.9982271833667178e-05,
"loss": 1.538,
"step": 6960
},
{
"epoch": 0.028811832279586384,
"grad_norm": 4.70836437147594,
"learning_rate": 1.9982193673218487e-05,
"loss": 1.5221,
"step": 6970
},
{
"epoch": 0.028853169198208458,
"grad_norm": 4.481706228912118,
"learning_rate": 1.9982115341004088e-05,
"loss": 1.5313,
"step": 6980
},
{
"epoch": 0.02889450611683053,
"grad_norm": 4.687318727733607,
"learning_rate": 1.9982036837025332e-05,
"loss": 1.5051,
"step": 6990
},
{
"epoch": 0.02893584303545261,
"grad_norm": 4.598444922768698,
"learning_rate": 1.998195816128357e-05,
"loss": 1.4934,
"step": 7000
},
{
"epoch": 0.028977179954074683,
"grad_norm": 3.9891444926266555,
"learning_rate": 1.9981879313780145e-05,
"loss": 1.5511,
"step": 7010
},
{
"epoch": 0.02901851687269676,
"grad_norm": 4.416153022094333,
"learning_rate": 1.998180029451643e-05,
"loss": 1.5097,
"step": 7020
},
{
"epoch": 0.029059853791318833,
"grad_norm": 3.798723984502823,
"learning_rate": 1.9981721103493775e-05,
"loss": 1.4997,
"step": 7030
},
{
"epoch": 0.029101190709940907,
"grad_norm": 3.8640966990883223,
"learning_rate": 1.9981641740713545e-05,
"loss": 1.54,
"step": 7040
},
{
"epoch": 0.029142527628562984,
"grad_norm": 3.9923946800397436,
"learning_rate": 1.9981562206177104e-05,
"loss": 1.5511,
"step": 7050
},
{
"epoch": 0.029183864547185058,
"grad_norm": 3.451062899533455,
"learning_rate": 1.998148249988582e-05,
"loss": 1.5094,
"step": 7060
},
{
"epoch": 0.029225201465807135,
"grad_norm": 3.608872589804411,
"learning_rate": 1.998140262184107e-05,
"loss": 1.5162,
"step": 7070
},
{
"epoch": 0.02926653838442921,
"grad_norm": 4.033124276978576,
"learning_rate": 1.998132257204422e-05,
"loss": 1.4959,
"step": 7080
},
{
"epoch": 0.029307875303051286,
"grad_norm": 3.6841192145575397,
"learning_rate": 1.9981242350496656e-05,
"loss": 1.5223,
"step": 7090
},
{
"epoch": 0.02934921222167336,
"grad_norm": 3.5915932028439226,
"learning_rate": 1.9981161957199754e-05,
"loss": 1.5257,
"step": 7100
},
{
"epoch": 0.029390549140295433,
"grad_norm": 3.870756947521686,
"learning_rate": 1.9981081392154898e-05,
"loss": 1.4904,
"step": 7110
},
{
"epoch": 0.02943188605891751,
"grad_norm": 4.22816037060308,
"learning_rate": 1.9981000655363473e-05,
"loss": 1.4982,
"step": 7120
},
{
"epoch": 0.029473222977539584,
"grad_norm": 3.808092113157194,
"learning_rate": 1.9980919746826872e-05,
"loss": 1.519,
"step": 7130
},
{
"epoch": 0.02951455989616166,
"grad_norm": 3.709300764256846,
"learning_rate": 1.9980838666546483e-05,
"loss": 1.5533,
"step": 7140
},
{
"epoch": 0.029555896814783735,
"grad_norm": 3.4732849314384127,
"learning_rate": 1.9980757414523704e-05,
"loss": 1.5633,
"step": 7150
},
{
"epoch": 0.029597233733405812,
"grad_norm": 3.713987413842907,
"learning_rate": 1.998067599075993e-05,
"loss": 1.5201,
"step": 7160
},
{
"epoch": 0.029638570652027886,
"grad_norm": 4.113428203135297,
"learning_rate": 1.9980594395256564e-05,
"loss": 1.4594,
"step": 7170
},
{
"epoch": 0.02967990757064996,
"grad_norm": 2.9935199155014285,
"learning_rate": 1.9980512628015014e-05,
"loss": 1.4986,
"step": 7180
},
{
"epoch": 0.029721244489272037,
"grad_norm": 4.691195443226011,
"learning_rate": 1.998043068903668e-05,
"loss": 1.5357,
"step": 7190
},
{
"epoch": 0.02976258140789411,
"grad_norm": 3.4891344421625647,
"learning_rate": 1.9980348578322973e-05,
"loss": 1.5306,
"step": 7200
},
{
"epoch": 0.029803918326516188,
"grad_norm": 5.006688866087417,
"learning_rate": 1.9980266295875313e-05,
"loss": 1.512,
"step": 7210
},
{
"epoch": 0.02984525524513826,
"grad_norm": 4.053771198338998,
"learning_rate": 1.9980183841695107e-05,
"loss": 1.4794,
"step": 7220
},
{
"epoch": 0.029886592163760335,
"grad_norm": 3.352224471660883,
"learning_rate": 1.998010121578378e-05,
"loss": 1.4914,
"step": 7230
},
{
"epoch": 0.029927929082382412,
"grad_norm": 3.864235630776693,
"learning_rate": 1.998001841814275e-05,
"loss": 1.5253,
"step": 7240
},
{
"epoch": 0.029969266001004486,
"grad_norm": 3.706315048662037,
"learning_rate": 1.997993544877344e-05,
"loss": 1.5509,
"step": 7250
},
{
"epoch": 0.030010602919626563,
"grad_norm": 3.5465275464245676,
"learning_rate": 1.9979852307677285e-05,
"loss": 1.5605,
"step": 7260
},
{
"epoch": 0.030051939838248637,
"grad_norm": 4.129168530718907,
"learning_rate": 1.997976899485571e-05,
"loss": 1.5204,
"step": 7270
},
{
"epoch": 0.030093276756870714,
"grad_norm": 3.500344444775805,
"learning_rate": 1.997968551031015e-05,
"loss": 1.4853,
"step": 7280
},
{
"epoch": 0.030134613675492788,
"grad_norm": 3.930695708227359,
"learning_rate": 1.9979601854042044e-05,
"loss": 1.5186,
"step": 7290
},
{
"epoch": 0.03017595059411486,
"grad_norm": 3.8784171251524846,
"learning_rate": 1.9979518026052826e-05,
"loss": 1.5031,
"step": 7300
},
{
"epoch": 0.03021728751273694,
"grad_norm": 4.264401345696811,
"learning_rate": 1.997943402634394e-05,
"loss": 1.5175,
"step": 7310
},
{
"epoch": 0.030258624431359012,
"grad_norm": 3.924430485404916,
"learning_rate": 1.9979349854916836e-05,
"loss": 1.5415,
"step": 7320
},
{
"epoch": 0.03029996134998109,
"grad_norm": 5.108018679538745,
"learning_rate": 1.9979265511772958e-05,
"loss": 1.4635,
"step": 7330
},
{
"epoch": 0.030341298268603163,
"grad_norm": 3.5308579618903253,
"learning_rate": 1.997918099691376e-05,
"loss": 1.5076,
"step": 7340
},
{
"epoch": 0.03038263518722524,
"grad_norm": 5.35750207475576,
"learning_rate": 1.997909631034069e-05,
"loss": 1.5426,
"step": 7350
},
{
"epoch": 0.030423972105847314,
"grad_norm": 3.4906620060732645,
"learning_rate": 1.9979011452055216e-05,
"loss": 1.5012,
"step": 7360
},
{
"epoch": 0.030465309024469388,
"grad_norm": 3.059087960421475,
"learning_rate": 1.9978926422058788e-05,
"loss": 1.5542,
"step": 7370
},
{
"epoch": 0.030506645943091465,
"grad_norm": 3.9845949063681227,
"learning_rate": 1.9978841220352875e-05,
"loss": 1.546,
"step": 7380
},
{
"epoch": 0.03054798286171354,
"grad_norm": 3.8610381846741815,
"learning_rate": 1.9978755846938943e-05,
"loss": 1.5437,
"step": 7390
},
{
"epoch": 0.030589319780335616,
"grad_norm": 3.2351748039531154,
"learning_rate": 1.9978670301818456e-05,
"loss": 1.4819,
"step": 7400
},
{
"epoch": 0.03063065669895769,
"grad_norm": 3.3661408607769823,
"learning_rate": 1.997858458499289e-05,
"loss": 1.4698,
"step": 7410
},
{
"epoch": 0.030671993617579763,
"grad_norm": 3.882697263150638,
"learning_rate": 1.997849869646372e-05,
"loss": 1.5216,
"step": 7420
},
{
"epoch": 0.03071333053620184,
"grad_norm": 3.574753801928485,
"learning_rate": 1.9978412636232425e-05,
"loss": 1.4803,
"step": 7430
},
{
"epoch": 0.030754667454823914,
"grad_norm": 4.443698443398163,
"learning_rate": 1.997832640430048e-05,
"loss": 1.5006,
"step": 7440
},
{
"epoch": 0.03079600437344599,
"grad_norm": 4.180214183413532,
"learning_rate": 1.9978240000669377e-05,
"loss": 1.4823,
"step": 7450
},
{
"epoch": 0.030837341292068065,
"grad_norm": 4.271849788381411,
"learning_rate": 1.9978153425340596e-05,
"loss": 1.4888,
"step": 7460
},
{
"epoch": 0.030878678210690142,
"grad_norm": 3.638691493855987,
"learning_rate": 1.9978066678315634e-05,
"loss": 1.5171,
"step": 7470
},
{
"epoch": 0.030920015129312216,
"grad_norm": 4.31218852799806,
"learning_rate": 1.9977979759595972e-05,
"loss": 1.5313,
"step": 7480
},
{
"epoch": 0.03096135204793429,
"grad_norm": 4.188985873406339,
"learning_rate": 1.9977892669183115e-05,
"loss": 1.5333,
"step": 7490
},
{
"epoch": 0.031002688966556367,
"grad_norm": 4.091422365669455,
"learning_rate": 1.9977805407078563e-05,
"loss": 1.5104,
"step": 7500
},
{
"epoch": 0.03104402588517844,
"grad_norm": 3.7904969438772995,
"learning_rate": 1.997771797328381e-05,
"loss": 1.4961,
"step": 7510
},
{
"epoch": 0.031085362803800518,
"grad_norm": 4.393169555680857,
"learning_rate": 1.9977630367800366e-05,
"loss": 1.4876,
"step": 7520
},
{
"epoch": 0.03112669972242259,
"grad_norm": 3.3377888267921163,
"learning_rate": 1.9977542590629736e-05,
"loss": 1.5107,
"step": 7530
},
{
"epoch": 0.031168036641044665,
"grad_norm": 3.6979315229218512,
"learning_rate": 1.9977454641773432e-05,
"loss": 1.4984,
"step": 7540
},
{
"epoch": 0.031209373559666742,
"grad_norm": 4.422235985099495,
"learning_rate": 1.9977366521232966e-05,
"loss": 1.5166,
"step": 7550
},
{
"epoch": 0.03125071047828882,
"grad_norm": 4.357975123788466,
"learning_rate": 1.9977278229009854e-05,
"loss": 1.5133,
"step": 7560
},
{
"epoch": 0.03129204739691089,
"grad_norm": 3.618227619392518,
"learning_rate": 1.997718976510562e-05,
"loss": 1.4872,
"step": 7570
},
{
"epoch": 0.03133338431553297,
"grad_norm": 3.6903691829248175,
"learning_rate": 1.9977101129521778e-05,
"loss": 1.4968,
"step": 7580
},
{
"epoch": 0.031374721234155044,
"grad_norm": 3.325924769769221,
"learning_rate": 1.997701232225986e-05,
"loss": 1.484,
"step": 7590
},
{
"epoch": 0.03141605815277712,
"grad_norm": 3.244599139580768,
"learning_rate": 1.9976923343321388e-05,
"loss": 1.501,
"step": 7600
},
{
"epoch": 0.03145739507139919,
"grad_norm": 3.9512458600612224,
"learning_rate": 1.9976834192707898e-05,
"loss": 1.5146,
"step": 7610
},
{
"epoch": 0.03149873199002127,
"grad_norm": 6.319637912227645,
"learning_rate": 1.9976744870420925e-05,
"loss": 1.5232,
"step": 7620
},
{
"epoch": 0.031540068908643346,
"grad_norm": 3.385728813673924,
"learning_rate": 1.9976655376462003e-05,
"loss": 1.4964,
"step": 7630
},
{
"epoch": 0.031581405827265416,
"grad_norm": 3.5621843155206365,
"learning_rate": 1.997656571083267e-05,
"loss": 1.4948,
"step": 7640
},
{
"epoch": 0.03162274274588749,
"grad_norm": 3.5127469264785334,
"learning_rate": 1.9976475873534476e-05,
"loss": 1.4788,
"step": 7650
},
{
"epoch": 0.03166407966450957,
"grad_norm": 5.123261236833159,
"learning_rate": 1.9976385864568958e-05,
"loss": 1.4906,
"step": 7660
},
{
"epoch": 0.03170541658313164,
"grad_norm": 3.0872059709849378,
"learning_rate": 1.997629568393767e-05,
"loss": 1.5013,
"step": 7670
},
{
"epoch": 0.03174675350175372,
"grad_norm": 3.516937874543627,
"learning_rate": 1.9976205331642165e-05,
"loss": 1.4802,
"step": 7680
},
{
"epoch": 0.031788090420375795,
"grad_norm": 3.7194230277933684,
"learning_rate": 1.9976114807683996e-05,
"loss": 1.4776,
"step": 7690
},
{
"epoch": 0.03182942733899787,
"grad_norm": 3.6171028748264016,
"learning_rate": 1.9976024112064718e-05,
"loss": 1.4867,
"step": 7700
},
{
"epoch": 0.03187076425761994,
"grad_norm": 4.74734081777917,
"learning_rate": 1.9975933244785894e-05,
"loss": 1.5321,
"step": 7710
},
{
"epoch": 0.03191210117624202,
"grad_norm": 3.4998159594245912,
"learning_rate": 1.997584220584909e-05,
"loss": 1.4555,
"step": 7720
},
{
"epoch": 0.031953438094864096,
"grad_norm": 4.0664157482197245,
"learning_rate": 1.9975750995255865e-05,
"loss": 1.4982,
"step": 7730
},
{
"epoch": 0.03199477501348617,
"grad_norm": 4.448956391697098,
"learning_rate": 1.9975659613007797e-05,
"loss": 1.4877,
"step": 7740
},
{
"epoch": 0.032036111932108244,
"grad_norm": 3.9116050665163056,
"learning_rate": 1.9975568059106455e-05,
"loss": 1.51,
"step": 7750
},
{
"epoch": 0.03207744885073032,
"grad_norm": 3.8835007718258874,
"learning_rate": 1.9975476333553416e-05,
"loss": 1.5245,
"step": 7760
},
{
"epoch": 0.0321187857693524,
"grad_norm": 3.8364373709477215,
"learning_rate": 1.9975384436350254e-05,
"loss": 1.467,
"step": 7770
},
{
"epoch": 0.03216012268797447,
"grad_norm": 3.848935245597496,
"learning_rate": 1.9975292367498556e-05,
"loss": 1.4999,
"step": 7780
},
{
"epoch": 0.032201459606596546,
"grad_norm": 3.309302500010883,
"learning_rate": 1.99752001269999e-05,
"loss": 1.513,
"step": 7790
},
{
"epoch": 0.03224279652521862,
"grad_norm": 3.417545265677783,
"learning_rate": 1.9975107714855875e-05,
"loss": 1.5138,
"step": 7800
},
{
"epoch": 0.03228413344384069,
"grad_norm": 4.218881911775881,
"learning_rate": 1.9975015131068078e-05,
"loss": 1.4763,
"step": 7810
},
{
"epoch": 0.03232547036246277,
"grad_norm": 3.581778505125333,
"learning_rate": 1.997492237563809e-05,
"loss": 1.5195,
"step": 7820
},
{
"epoch": 0.03236680728108485,
"grad_norm": 3.7604938020140226,
"learning_rate": 1.997482944856752e-05,
"loss": 1.4933,
"step": 7830
},
{
"epoch": 0.032408144199706924,
"grad_norm": 4.37155325952897,
"learning_rate": 1.997473634985796e-05,
"loss": 1.4557,
"step": 7840
},
{
"epoch": 0.032449481118328995,
"grad_norm": 3.3738873014096153,
"learning_rate": 1.9974643079511008e-05,
"loss": 1.5323,
"step": 7850
},
{
"epoch": 0.03249081803695107,
"grad_norm": 4.587428592575524,
"learning_rate": 1.9974549637528276e-05,
"loss": 1.5129,
"step": 7860
},
{
"epoch": 0.03253215495557315,
"grad_norm": 3.7141451702858252,
"learning_rate": 1.997445602391137e-05,
"loss": 1.5176,
"step": 7870
},
{
"epoch": 0.03257349187419522,
"grad_norm": 3.585481088159076,
"learning_rate": 1.9974362238661903e-05,
"loss": 1.5109,
"step": 7880
},
{
"epoch": 0.032614828792817296,
"grad_norm": 3.4290626560881936,
"learning_rate": 1.9974268281781484e-05,
"loss": 1.4477,
"step": 7890
},
{
"epoch": 0.032656165711439374,
"grad_norm": 3.612891113663602,
"learning_rate": 1.9974174153271728e-05,
"loss": 1.4229,
"step": 7900
},
{
"epoch": 0.03269750263006145,
"grad_norm": 3.487845093010647,
"learning_rate": 1.9974079853134266e-05,
"loss": 1.5321,
"step": 7910
},
{
"epoch": 0.03273883954868352,
"grad_norm": 5.587764895917656,
"learning_rate": 1.9973985381370707e-05,
"loss": 1.4645,
"step": 7920
},
{
"epoch": 0.0327801764673056,
"grad_norm": 3.5490448514590494,
"learning_rate": 1.9973890737982684e-05,
"loss": 1.5374,
"step": 7930
},
{
"epoch": 0.032821513385927675,
"grad_norm": 3.94930169954439,
"learning_rate": 1.9973795922971827e-05,
"loss": 1.4661,
"step": 7940
},
{
"epoch": 0.032862850304549746,
"grad_norm": 3.494031781745132,
"learning_rate": 1.9973700936339763e-05,
"loss": 1.4291,
"step": 7950
},
{
"epoch": 0.03290418722317182,
"grad_norm": 3.0746272722997414,
"learning_rate": 1.9973605778088126e-05,
"loss": 1.5493,
"step": 7960
},
{
"epoch": 0.0329455241417939,
"grad_norm": 3.6416998354027736,
"learning_rate": 1.9973510448218558e-05,
"loss": 1.4471,
"step": 7970
},
{
"epoch": 0.03298686106041597,
"grad_norm": 3.8385152763033883,
"learning_rate": 1.99734149467327e-05,
"loss": 1.5182,
"step": 7980
},
{
"epoch": 0.03302819797903805,
"grad_norm": 3.5463908529630985,
"learning_rate": 1.9973319273632187e-05,
"loss": 1.4848,
"step": 7990
},
{
"epoch": 0.033069534897660124,
"grad_norm": 4.494540529736681,
"learning_rate": 1.9973223428918677e-05,
"loss": 1.4656,
"step": 8000
},
{
"epoch": 0.0331108718162822,
"grad_norm": 4.138487839617815,
"learning_rate": 1.997312741259381e-05,
"loss": 1.4533,
"step": 8010
},
{
"epoch": 0.03315220873490427,
"grad_norm": 3.2836571982439957,
"learning_rate": 1.9973031224659238e-05,
"loss": 1.4637,
"step": 8020
},
{
"epoch": 0.03319354565352635,
"grad_norm": 4.101602166042781,
"learning_rate": 1.9972934865116622e-05,
"loss": 1.4656,
"step": 8030
},
{
"epoch": 0.033234882572148426,
"grad_norm": 4.372499340047587,
"learning_rate": 1.9972838333967615e-05,
"loss": 1.5377,
"step": 8040
},
{
"epoch": 0.033276219490770496,
"grad_norm": 3.984477694550147,
"learning_rate": 1.997274163121388e-05,
"loss": 1.5216,
"step": 8050
},
{
"epoch": 0.033317556409392574,
"grad_norm": 3.484892612102319,
"learning_rate": 1.9972644756857087e-05,
"loss": 1.459,
"step": 8060
},
{
"epoch": 0.03335889332801465,
"grad_norm": 3.473953984247227,
"learning_rate": 1.9972547710898894e-05,
"loss": 1.4889,
"step": 8070
},
{
"epoch": 0.03340023024663673,
"grad_norm": 3.6982560350258384,
"learning_rate": 1.9972450493340973e-05,
"loss": 1.4529,
"step": 8080
},
{
"epoch": 0.0334415671652588,
"grad_norm": 4.338255434805353,
"learning_rate": 1.9972353104185e-05,
"loss": 1.4906,
"step": 8090
},
{
"epoch": 0.033482904083880875,
"grad_norm": 3.2105273217876897,
"learning_rate": 1.9972255543432644e-05,
"loss": 1.4846,
"step": 8100
},
{
"epoch": 0.03352424100250295,
"grad_norm": 3.7063825752894037,
"learning_rate": 1.997215781108559e-05,
"loss": 1.4354,
"step": 8110
},
{
"epoch": 0.03356557792112502,
"grad_norm": 3.606082322653538,
"learning_rate": 1.997205990714552e-05,
"loss": 1.5067,
"step": 8120
},
{
"epoch": 0.0336069148397471,
"grad_norm": 3.8291971667898643,
"learning_rate": 1.9971961831614116e-05,
"loss": 1.4619,
"step": 8130
},
{
"epoch": 0.03364825175836918,
"grad_norm": 3.331721665794406,
"learning_rate": 1.997186358449307e-05,
"loss": 1.4484,
"step": 8140
},
{
"epoch": 0.033689588676991254,
"grad_norm": 4.417828154129124,
"learning_rate": 1.9971765165784065e-05,
"loss": 1.508,
"step": 8150
},
{
"epoch": 0.033730925595613324,
"grad_norm": 3.782991542711988,
"learning_rate": 1.9971666575488798e-05,
"loss": 1.3925,
"step": 8160
},
{
"epoch": 0.0337722625142354,
"grad_norm": 4.079542042860949,
"learning_rate": 1.997156781360897e-05,
"loss": 1.4996,
"step": 8170
},
{
"epoch": 0.03381359943285748,
"grad_norm": 4.1626484536651995,
"learning_rate": 1.9971468880146273e-05,
"loss": 1.5178,
"step": 8180
},
{
"epoch": 0.03385493635147955,
"grad_norm": 4.122058968257828,
"learning_rate": 1.9971369775102417e-05,
"loss": 1.4267,
"step": 8190
},
{
"epoch": 0.033896273270101626,
"grad_norm": 4.017697912034419,
"learning_rate": 1.9971270498479097e-05,
"loss": 1.5129,
"step": 8200
},
{
"epoch": 0.0339376101887237,
"grad_norm": 3.3575071407879977,
"learning_rate": 1.997117105027803e-05,
"loss": 1.4796,
"step": 8210
},
{
"epoch": 0.03397894710734578,
"grad_norm": 3.9246891678069598,
"learning_rate": 1.9971071430500924e-05,
"loss": 1.5052,
"step": 8220
},
{
"epoch": 0.03402028402596785,
"grad_norm": 4.075111279351767,
"learning_rate": 1.9970971639149493e-05,
"loss": 1.4606,
"step": 8230
},
{
"epoch": 0.03406162094458993,
"grad_norm": 4.564662754708322,
"learning_rate": 1.997087167622546e-05,
"loss": 1.5111,
"step": 8240
},
{
"epoch": 0.034102957863212005,
"grad_norm": 3.753297984489193,
"learning_rate": 1.9970771541730536e-05,
"loss": 1.4899,
"step": 8250
},
{
"epoch": 0.034144294781834075,
"grad_norm": 3.913489388979073,
"learning_rate": 1.997067123566645e-05,
"loss": 1.4796,
"step": 8260
},
{
"epoch": 0.03418563170045615,
"grad_norm": 3.440711906522703,
"learning_rate": 1.9970570758034924e-05,
"loss": 1.5184,
"step": 8270
},
{
"epoch": 0.03422696861907823,
"grad_norm": 3.7595940155205363,
"learning_rate": 1.997047010883769e-05,
"loss": 1.4901,
"step": 8280
},
{
"epoch": 0.0342683055377003,
"grad_norm": 4.387403962431217,
"learning_rate": 1.9970369288076478e-05,
"loss": 1.4553,
"step": 8290
},
{
"epoch": 0.03430964245632238,
"grad_norm": 3.7034201439850594,
"learning_rate": 1.9970268295753022e-05,
"loss": 1.4534,
"step": 8300
},
{
"epoch": 0.034350979374944454,
"grad_norm": 4.0052763549673225,
"learning_rate": 1.9970167131869064e-05,
"loss": 1.4539,
"step": 8310
},
{
"epoch": 0.03439231629356653,
"grad_norm": 4.079349801665228,
"learning_rate": 1.9970065796426342e-05,
"loss": 1.4698,
"step": 8320
},
{
"epoch": 0.0344336532121886,
"grad_norm": 3.9322937150085875,
"learning_rate": 1.99699642894266e-05,
"loss": 1.4318,
"step": 8330
},
{
"epoch": 0.03447499013081068,
"grad_norm": 3.834658918473933,
"learning_rate": 1.9969862610871586e-05,
"loss": 1.4687,
"step": 8340
},
{
"epoch": 0.034516327049432756,
"grad_norm": 3.3637972502778912,
"learning_rate": 1.9969760760763045e-05,
"loss": 1.4661,
"step": 8350
},
{
"epoch": 0.034557663968054826,
"grad_norm": 3.5654841117273026,
"learning_rate": 1.9969658739102733e-05,
"loss": 1.4302,
"step": 8360
},
{
"epoch": 0.0345990008866769,
"grad_norm": 3.7270409908212194,
"learning_rate": 1.9969556545892405e-05,
"loss": 1.4447,
"step": 8370
},
{
"epoch": 0.03464033780529898,
"grad_norm": 3.914859995823126,
"learning_rate": 1.996945418113382e-05,
"loss": 1.4519,
"step": 8380
},
{
"epoch": 0.03468167472392106,
"grad_norm": 4.5791406660012095,
"learning_rate": 1.9969351644828742e-05,
"loss": 1.5204,
"step": 8390
},
{
"epoch": 0.03472301164254313,
"grad_norm": 3.6418617205879817,
"learning_rate": 1.9969248936978932e-05,
"loss": 1.4943,
"step": 8400
},
{
"epoch": 0.034764348561165205,
"grad_norm": 3.4609359361113534,
"learning_rate": 1.9969146057586156e-05,
"loss": 1.4799,
"step": 8410
},
{
"epoch": 0.03480568547978728,
"grad_norm": 3.9925889106780987,
"learning_rate": 1.9969043006652186e-05,
"loss": 1.4687,
"step": 8420
},
{
"epoch": 0.03484702239840935,
"grad_norm": 3.2054804086428548,
"learning_rate": 1.9968939784178794e-05,
"loss": 1.4816,
"step": 8430
},
{
"epoch": 0.03488835931703143,
"grad_norm": 3.349889124617882,
"learning_rate": 1.996883639016776e-05,
"loss": 1.4577,
"step": 8440
},
{
"epoch": 0.03492969623565351,
"grad_norm": 3.5789857070884086,
"learning_rate": 1.996873282462086e-05,
"loss": 1.5172,
"step": 8450
},
{
"epoch": 0.034971033154275584,
"grad_norm": 3.4989039985130272,
"learning_rate": 1.9968629087539876e-05,
"loss": 1.4852,
"step": 8460
},
{
"epoch": 0.035012370072897654,
"grad_norm": 3.3811833684889154,
"learning_rate": 1.9968525178926595e-05,
"loss": 1.4594,
"step": 8470
},
{
"epoch": 0.03505370699151973,
"grad_norm": 3.4385931194022223,
"learning_rate": 1.9968421098782803e-05,
"loss": 1.4595,
"step": 8480
},
{
"epoch": 0.03509504391014181,
"grad_norm": 4.16413923561225,
"learning_rate": 1.9968316847110292e-05,
"loss": 1.4963,
"step": 8490
},
{
"epoch": 0.03513638082876388,
"grad_norm": 4.289305042774894,
"learning_rate": 1.9968212423910855e-05,
"loss": 1.4551,
"step": 8500
},
{
"epoch": 0.035177717747385956,
"grad_norm": 5.453654756696216,
"learning_rate": 1.9968107829186287e-05,
"loss": 1.4885,
"step": 8510
},
{
"epoch": 0.03521905466600803,
"grad_norm": 3.6797118668666795,
"learning_rate": 1.996800306293839e-05,
"loss": 1.4984,
"step": 8520
},
{
"epoch": 0.03526039158463011,
"grad_norm": 3.2371854818646995,
"learning_rate": 1.9967898125168973e-05,
"loss": 1.4481,
"step": 8530
},
{
"epoch": 0.03530172850325218,
"grad_norm": 3.238508861502653,
"learning_rate": 1.9967793015879828e-05,
"loss": 1.4562,
"step": 8540
},
{
"epoch": 0.03534306542187426,
"grad_norm": 3.5415115005606177,
"learning_rate": 1.9967687735072776e-05,
"loss": 1.476,
"step": 8550
},
{
"epoch": 0.035384402340496335,
"grad_norm": 3.843042698193225,
"learning_rate": 1.9967582282749622e-05,
"loss": 1.4751,
"step": 8560
},
{
"epoch": 0.035425739259118405,
"grad_norm": 3.5779391668735006,
"learning_rate": 1.9967476658912184e-05,
"loss": 1.4804,
"step": 8570
},
{
"epoch": 0.03546707617774048,
"grad_norm": 4.949952686769368,
"learning_rate": 1.9967370863562276e-05,
"loss": 1.4245,
"step": 8580
},
{
"epoch": 0.03550841309636256,
"grad_norm": 3.8134978579481924,
"learning_rate": 1.996726489670172e-05,
"loss": 1.494,
"step": 8590
},
{
"epoch": 0.03554975001498463,
"grad_norm": 4.098567290916666,
"learning_rate": 1.996715875833234e-05,
"loss": 1.4339,
"step": 8600
},
{
"epoch": 0.03559108693360671,
"grad_norm": 3.4443466301897074,
"learning_rate": 1.9967052448455962e-05,
"loss": 1.4808,
"step": 8610
},
{
"epoch": 0.035632423852228784,
"grad_norm": 3.939242075931307,
"learning_rate": 1.9966945967074416e-05,
"loss": 1.4884,
"step": 8620
},
{
"epoch": 0.03567376077085086,
"grad_norm": 3.3941498280577975,
"learning_rate": 1.996683931418953e-05,
"loss": 1.4635,
"step": 8630
},
{
"epoch": 0.03571509768947293,
"grad_norm": 3.911248054251368,
"learning_rate": 1.996673248980315e-05,
"loss": 1.4785,
"step": 8640
},
{
"epoch": 0.03575643460809501,
"grad_norm": 4.0383619484944155,
"learning_rate": 1.99666254939171e-05,
"loss": 1.4334,
"step": 8650
},
{
"epoch": 0.035797771526717086,
"grad_norm": 3.21818266356431,
"learning_rate": 1.996651832653323e-05,
"loss": 1.5279,
"step": 8660
},
{
"epoch": 0.035839108445339156,
"grad_norm": 4.068360221073268,
"learning_rate": 1.9966410987653383e-05,
"loss": 1.5073,
"step": 8670
},
{
"epoch": 0.03588044536396123,
"grad_norm": 5.64416307388456,
"learning_rate": 1.9966303477279404e-05,
"loss": 1.4595,
"step": 8680
},
{
"epoch": 0.03592178228258331,
"grad_norm": 4.456991706091006,
"learning_rate": 1.9966195795413145e-05,
"loss": 1.5152,
"step": 8690
},
{
"epoch": 0.03596311920120539,
"grad_norm": 3.541237309488241,
"learning_rate": 1.9966087942056457e-05,
"loss": 1.4773,
"step": 8700
},
{
"epoch": 0.03600445611982746,
"grad_norm": 3.5306668816992186,
"learning_rate": 1.9965979917211196e-05,
"loss": 1.4838,
"step": 8710
},
{
"epoch": 0.036045793038449535,
"grad_norm": 4.051613339547623,
"learning_rate": 1.9965871720879223e-05,
"loss": 1.463,
"step": 8720
},
{
"epoch": 0.03608712995707161,
"grad_norm": 3.6129954404785,
"learning_rate": 1.9965763353062394e-05,
"loss": 1.4479,
"step": 8730
},
{
"epoch": 0.03612846687569368,
"grad_norm": 4.2384349637413825,
"learning_rate": 1.9965654813762582e-05,
"loss": 1.4928,
"step": 8740
},
{
"epoch": 0.03616980379431576,
"grad_norm": 4.343148391315392,
"learning_rate": 1.9965546102981652e-05,
"loss": 1.4418,
"step": 8750
},
{
"epoch": 0.03621114071293784,
"grad_norm": 3.9477945474327276,
"learning_rate": 1.996543722072147e-05,
"loss": 1.4417,
"step": 8760
},
{
"epoch": 0.036252477631559914,
"grad_norm": 3.912481869555381,
"learning_rate": 1.9965328166983916e-05,
"loss": 1.4877,
"step": 8770
},
{
"epoch": 0.036293814550181984,
"grad_norm": 4.391935734682612,
"learning_rate": 1.9965218941770857e-05,
"loss": 1.4335,
"step": 8780
},
{
"epoch": 0.03633515146880406,
"grad_norm": 4.493537291846412,
"learning_rate": 1.9965109545084185e-05,
"loss": 1.4919,
"step": 8790
},
{
"epoch": 0.03637648838742614,
"grad_norm": 2.93026955700472,
"learning_rate": 1.9964999976925775e-05,
"loss": 1.4304,
"step": 8800
},
{
"epoch": 0.03641782530604821,
"grad_norm": 3.6053506467813032,
"learning_rate": 1.9964890237297512e-05,
"loss": 1.4635,
"step": 8810
},
{
"epoch": 0.036459162224670286,
"grad_norm": 3.5234834011018648,
"learning_rate": 1.9964780326201286e-05,
"loss": 1.4981,
"step": 8820
},
{
"epoch": 0.03650049914329236,
"grad_norm": 3.750450253620856,
"learning_rate": 1.996467024363899e-05,
"loss": 1.4627,
"step": 8830
},
{
"epoch": 0.03654183606191444,
"grad_norm": 3.666723051780572,
"learning_rate": 1.9964559989612516e-05,
"loss": 1.4514,
"step": 8840
},
{
"epoch": 0.03658317298053651,
"grad_norm": 3.3239044375214633,
"learning_rate": 1.996444956412376e-05,
"loss": 1.4972,
"step": 8850
},
{
"epoch": 0.03662450989915859,
"grad_norm": 3.8599698199624064,
"learning_rate": 1.9964338967174625e-05,
"loss": 1.5057,
"step": 8860
},
{
"epoch": 0.036665846817780665,
"grad_norm": 4.132699231086706,
"learning_rate": 1.9964228198767012e-05,
"loss": 1.4519,
"step": 8870
},
{
"epoch": 0.036707183736402735,
"grad_norm": 3.0714085451165745,
"learning_rate": 1.9964117258902828e-05,
"loss": 1.434,
"step": 8880
},
{
"epoch": 0.03674852065502481,
"grad_norm": 3.8796486291954904,
"learning_rate": 1.9964006147583982e-05,
"loss": 1.4505,
"step": 8890
},
{
"epoch": 0.03678985757364689,
"grad_norm": 3.832002897416075,
"learning_rate": 1.9963894864812383e-05,
"loss": 1.4526,
"step": 8900
},
{
"epoch": 0.03683119449226896,
"grad_norm": 4.887224283091199,
"learning_rate": 1.9963783410589948e-05,
"loss": 1.4644,
"step": 8910
},
{
"epoch": 0.036872531410891037,
"grad_norm": 4.158724114940273,
"learning_rate": 1.99636717849186e-05,
"loss": 1.4417,
"step": 8920
},
{
"epoch": 0.036913868329513114,
"grad_norm": 3.81771878130769,
"learning_rate": 1.9963559987800253e-05,
"loss": 1.508,
"step": 8930
},
{
"epoch": 0.03695520524813519,
"grad_norm": 3.5553407292065207,
"learning_rate": 1.9963448019236834e-05,
"loss": 1.383,
"step": 8940
},
{
"epoch": 0.03699654216675726,
"grad_norm": 5.03061141095772,
"learning_rate": 1.9963335879230264e-05,
"loss": 1.4293,
"step": 8950
},
{
"epoch": 0.03703787908537934,
"grad_norm": 3.183233332739406,
"learning_rate": 1.996322356778248e-05,
"loss": 1.4355,
"step": 8960
},
{
"epoch": 0.037079216004001415,
"grad_norm": 3.555732688914675,
"learning_rate": 1.996311108489541e-05,
"loss": 1.4338,
"step": 8970
},
{
"epoch": 0.037120552922623486,
"grad_norm": 3.696220192021282,
"learning_rate": 1.9962998430570994e-05,
"loss": 1.4883,
"step": 8980
},
{
"epoch": 0.03716188984124556,
"grad_norm": 4.796096029475931,
"learning_rate": 1.9962885604811168e-05,
"loss": 1.4901,
"step": 8990
},
{
"epoch": 0.03720322675986764,
"grad_norm": 5.814203236754815,
"learning_rate": 1.996277260761787e-05,
"loss": 1.4053,
"step": 9000
},
{
"epoch": 0.03724456367848972,
"grad_norm": 3.3287110492970795,
"learning_rate": 1.996265943899305e-05,
"loss": 1.4202,
"step": 9010
},
{
"epoch": 0.03728590059711179,
"grad_norm": 3.877230681858091,
"learning_rate": 1.996254609893865e-05,
"loss": 1.4297,
"step": 9020
},
{
"epoch": 0.037327237515733865,
"grad_norm": 3.48844533397734,
"learning_rate": 1.9962432587456622e-05,
"loss": 1.4652,
"step": 9030
},
{
"epoch": 0.03736857443435594,
"grad_norm": 3.5520610987897943,
"learning_rate": 1.9962318904548923e-05,
"loss": 1.4807,
"step": 9040
},
{
"epoch": 0.03740991135297801,
"grad_norm": 3.181838391240591,
"learning_rate": 1.9962205050217504e-05,
"loss": 1.4757,
"step": 9050
},
{
"epoch": 0.03745124827160009,
"grad_norm": 3.7425531387998907,
"learning_rate": 1.996209102446433e-05,
"loss": 1.4331,
"step": 9060
},
{
"epoch": 0.037492585190222166,
"grad_norm": 3.663633392520708,
"learning_rate": 1.9961976827291358e-05,
"loss": 1.4718,
"step": 9070
},
{
"epoch": 0.03753392210884424,
"grad_norm": 4.833995454604731,
"learning_rate": 1.9961862458700554e-05,
"loss": 1.4217,
"step": 9080
},
{
"epoch": 0.037575259027466314,
"grad_norm": 3.6290459016542216,
"learning_rate": 1.9961747918693887e-05,
"loss": 1.4848,
"step": 9090
},
{
"epoch": 0.03761659594608839,
"grad_norm": 3.585806885070931,
"learning_rate": 1.9961633207273325e-05,
"loss": 1.4358,
"step": 9100
},
{
"epoch": 0.03765793286471047,
"grad_norm": 3.4952003665857134,
"learning_rate": 1.9961518324440847e-05,
"loss": 1.3939,
"step": 9110
},
{
"epoch": 0.03769926978333254,
"grad_norm": 3.279719203181294,
"learning_rate": 1.9961403270198424e-05,
"loss": 1.4808,
"step": 9120
},
{
"epoch": 0.037740606701954615,
"grad_norm": 3.2692766545796528,
"learning_rate": 1.9961288044548043e-05,
"loss": 1.3822,
"step": 9130
},
{
"epoch": 0.03778194362057669,
"grad_norm": 3.6490123739235623,
"learning_rate": 1.996117264749168e-05,
"loss": 1.4485,
"step": 9140
},
{
"epoch": 0.03782328053919877,
"grad_norm": 4.464763724322134,
"learning_rate": 1.996105707903132e-05,
"loss": 1.4795,
"step": 9150
},
{
"epoch": 0.03786461745782084,
"grad_norm": 3.529618572994803,
"learning_rate": 1.9960941339168963e-05,
"loss": 1.4452,
"step": 9160
},
{
"epoch": 0.03790595437644292,
"grad_norm": 3.949852891089842,
"learning_rate": 1.9960825427906587e-05,
"loss": 1.4866,
"step": 9170
},
{
"epoch": 0.037947291295064994,
"grad_norm": 6.3198129841396735,
"learning_rate": 1.9960709345246192e-05,
"loss": 1.4661,
"step": 9180
},
{
"epoch": 0.037988628213687065,
"grad_norm": 4.3016466998403775,
"learning_rate": 1.9960593091189776e-05,
"loss": 1.4575,
"step": 9190
},
{
"epoch": 0.03802996513230914,
"grad_norm": 3.218809542898574,
"learning_rate": 1.996047666573934e-05,
"loss": 1.4385,
"step": 9200
},
{
"epoch": 0.03807130205093122,
"grad_norm": 3.507546904929844,
"learning_rate": 1.9960360068896884e-05,
"loss": 1.456,
"step": 9210
},
{
"epoch": 0.038112638969553296,
"grad_norm": 3.2658287561416866,
"learning_rate": 1.9960243300664418e-05,
"loss": 1.4937,
"step": 9220
},
{
"epoch": 0.038153975888175366,
"grad_norm": 3.9657257849748078,
"learning_rate": 1.996012636104395e-05,
"loss": 1.4743,
"step": 9230
},
{
"epoch": 0.03819531280679744,
"grad_norm": 3.7419945345055865,
"learning_rate": 1.996000925003749e-05,
"loss": 1.4645,
"step": 9240
},
{
"epoch": 0.03823664972541952,
"grad_norm": 3.717998186266208,
"learning_rate": 1.9959891967647055e-05,
"loss": 1.4304,
"step": 9250
},
{
"epoch": 0.03827798664404159,
"grad_norm": 4.122611974230224,
"learning_rate": 1.9959774513874666e-05,
"loss": 1.4396,
"step": 9260
},
{
"epoch": 0.03831932356266367,
"grad_norm": 4.081766829152903,
"learning_rate": 1.9959656888722338e-05,
"loss": 1.4296,
"step": 9270
},
{
"epoch": 0.038360660481285745,
"grad_norm": 3.4327932618086807,
"learning_rate": 1.99595390921921e-05,
"loss": 1.479,
"step": 9280
},
{
"epoch": 0.038401997399907815,
"grad_norm": 4.251866528393302,
"learning_rate": 1.9959421124285976e-05,
"loss": 1.4399,
"step": 9290
},
{
"epoch": 0.03844333431852989,
"grad_norm": 4.132921022210262,
"learning_rate": 1.9959302985006e-05,
"loss": 1.4366,
"step": 9300
},
{
"epoch": 0.03848467123715197,
"grad_norm": 4.791211851168452,
"learning_rate": 1.9959184674354198e-05,
"loss": 1.4838,
"step": 9310
},
{
"epoch": 0.03852600815577405,
"grad_norm": 3.000007579210258,
"learning_rate": 1.995906619233261e-05,
"loss": 1.4541,
"step": 9320
},
{
"epoch": 0.03856734507439612,
"grad_norm": 5.059959210643911,
"learning_rate": 1.9958947538943278e-05,
"loss": 1.5233,
"step": 9330
},
{
"epoch": 0.038608681993018194,
"grad_norm": 3.20842711732194,
"learning_rate": 1.9958828714188236e-05,
"loss": 1.4718,
"step": 9340
},
{
"epoch": 0.03865001891164027,
"grad_norm": 3.796018357701994,
"learning_rate": 1.9958709718069532e-05,
"loss": 1.4522,
"step": 9350
},
{
"epoch": 0.03869135583026234,
"grad_norm": 3.9321479256125347,
"learning_rate": 1.995859055058922e-05,
"loss": 1.5065,
"step": 9360
},
{
"epoch": 0.03873269274888442,
"grad_norm": 3.254019632954085,
"learning_rate": 1.9958471211749342e-05,
"loss": 1.4114,
"step": 9370
},
{
"epoch": 0.038774029667506496,
"grad_norm": 3.3308294037697896,
"learning_rate": 1.9958351701551953e-05,
"loss": 1.4285,
"step": 9380
},
{
"epoch": 0.03881536658612857,
"grad_norm": 4.08834871777043,
"learning_rate": 1.9958232019999114e-05,
"loss": 1.4295,
"step": 9390
},
{
"epoch": 0.03885670350475064,
"grad_norm": 3.441069579264666,
"learning_rate": 1.995811216709288e-05,
"loss": 1.4472,
"step": 9400
},
{
"epoch": 0.03889804042337272,
"grad_norm": 3.426532775633606,
"learning_rate": 1.995799214283531e-05,
"loss": 1.4566,
"step": 9410
},
{
"epoch": 0.0389393773419948,
"grad_norm": 3.399689817601649,
"learning_rate": 1.9957871947228476e-05,
"loss": 1.4642,
"step": 9420
},
{
"epoch": 0.03898071426061687,
"grad_norm": 3.388140389856613,
"learning_rate": 1.995775158027445e-05,
"loss": 1.4593,
"step": 9430
},
{
"epoch": 0.039022051179238945,
"grad_norm": 3.325888034679041,
"learning_rate": 1.9957631041975292e-05,
"loss": 1.473,
"step": 9440
},
{
"epoch": 0.03906338809786102,
"grad_norm": 5.25058506424265,
"learning_rate": 1.995751033233308e-05,
"loss": 1.4085,
"step": 9450
},
{
"epoch": 0.0391047250164831,
"grad_norm": 3.8257776442726135,
"learning_rate": 1.9957389451349898e-05,
"loss": 1.4926,
"step": 9460
},
{
"epoch": 0.03914606193510517,
"grad_norm": 3.9914355755037514,
"learning_rate": 1.9957268399027815e-05,
"loss": 1.4433,
"step": 9470
},
{
"epoch": 0.03918739885372725,
"grad_norm": 4.259453650516103,
"learning_rate": 1.9957147175368923e-05,
"loss": 1.4435,
"step": 9480
},
{
"epoch": 0.039228735772349324,
"grad_norm": 3.4057039561381974,
"learning_rate": 1.99570257803753e-05,
"loss": 1.4021,
"step": 9490
},
{
"epoch": 0.039270072690971394,
"grad_norm": 3.9702568689341735,
"learning_rate": 1.9956904214049044e-05,
"loss": 1.3975,
"step": 9500
},
{
"epoch": 0.03931140960959347,
"grad_norm": 4.162984306124767,
"learning_rate": 1.995678247639224e-05,
"loss": 1.4269,
"step": 9510
},
{
"epoch": 0.03935274652821555,
"grad_norm": 3.4623660466543216,
"learning_rate": 1.9956660567406984e-05,
"loss": 1.4812,
"step": 9520
},
{
"epoch": 0.039394083446837626,
"grad_norm": 3.9487634862208663,
"learning_rate": 1.9956538487095375e-05,
"loss": 1.3904,
"step": 9530
},
{
"epoch": 0.039435420365459696,
"grad_norm": 3.940768474272943,
"learning_rate": 1.9956416235459514e-05,
"loss": 1.4627,
"step": 9540
},
{
"epoch": 0.03947675728408177,
"grad_norm": 3.7240510688214488,
"learning_rate": 1.9956293812501503e-05,
"loss": 1.4714,
"step": 9550
},
{
"epoch": 0.03951809420270385,
"grad_norm": 3.544248199002313,
"learning_rate": 1.995617121822345e-05,
"loss": 1.4418,
"step": 9560
},
{
"epoch": 0.03955943112132592,
"grad_norm": 3.7941720521427453,
"learning_rate": 1.9956048452627463e-05,
"loss": 1.398,
"step": 9570
},
{
"epoch": 0.039600768039948,
"grad_norm": 3.231769382614049,
"learning_rate": 1.9955925515715656e-05,
"loss": 1.4323,
"step": 9580
},
{
"epoch": 0.039642104958570075,
"grad_norm": 3.4504677343753585,
"learning_rate": 1.9955802407490144e-05,
"loss": 1.4508,
"step": 9590
},
{
"epoch": 0.039683441877192145,
"grad_norm": 4.608743387499926,
"learning_rate": 1.9955679127953046e-05,
"loss": 1.4849,
"step": 9600
},
{
"epoch": 0.03972477879581422,
"grad_norm": 3.2583619571782223,
"learning_rate": 1.995555567710648e-05,
"loss": 1.4528,
"step": 9610
},
{
"epoch": 0.0397661157144363,
"grad_norm": 3.592600847545303,
"learning_rate": 1.9955432054952573e-05,
"loss": 1.4222,
"step": 9620
},
{
"epoch": 0.03980745263305838,
"grad_norm": 3.935340478064598,
"learning_rate": 1.9955308261493457e-05,
"loss": 1.4243,
"step": 9630
},
{
"epoch": 0.03984878955168045,
"grad_norm": 3.7051161334020075,
"learning_rate": 1.995518429673125e-05,
"loss": 1.4487,
"step": 9640
},
{
"epoch": 0.039890126470302524,
"grad_norm": 3.6647142900939977,
"learning_rate": 1.9955060160668095e-05,
"loss": 1.4458,
"step": 9650
},
{
"epoch": 0.0399314633889246,
"grad_norm": 4.428497991354939,
"learning_rate": 1.9954935853306124e-05,
"loss": 1.4721,
"step": 9660
},
{
"epoch": 0.03997280030754667,
"grad_norm": 3.2958564393103056,
"learning_rate": 1.9954811374647474e-05,
"loss": 1.4394,
"step": 9670
},
{
"epoch": 0.04001413722616875,
"grad_norm": 3.10104196973718,
"learning_rate": 1.9954686724694297e-05,
"loss": 1.4361,
"step": 9680
},
{
"epoch": 0.040055474144790826,
"grad_norm": 3.957938872776804,
"learning_rate": 1.9954561903448727e-05,
"loss": 1.4602,
"step": 9690
},
{
"epoch": 0.0400968110634129,
"grad_norm": 3.760794840185392,
"learning_rate": 1.9954436910912914e-05,
"loss": 1.4285,
"step": 9700
},
{
"epoch": 0.04013814798203497,
"grad_norm": 3.421396807117046,
"learning_rate": 1.9954311747089012e-05,
"loss": 1.4774,
"step": 9710
},
{
"epoch": 0.04017948490065705,
"grad_norm": 3.91789094802535,
"learning_rate": 1.9954186411979175e-05,
"loss": 1.4021,
"step": 9720
},
{
"epoch": 0.04022082181927913,
"grad_norm": 3.081464490088515,
"learning_rate": 1.9954060905585556e-05,
"loss": 1.4219,
"step": 9730
},
{
"epoch": 0.0402621587379012,
"grad_norm": 3.3381107703512507,
"learning_rate": 1.9953935227910316e-05,
"loss": 1.4632,
"step": 9740
},
{
"epoch": 0.040303495656523275,
"grad_norm": 3.8300980744875828,
"learning_rate": 1.995380937895562e-05,
"loss": 1.4322,
"step": 9750
},
{
"epoch": 0.04034483257514535,
"grad_norm": 3.4534661824404633,
"learning_rate": 1.995368335872363e-05,
"loss": 1.4436,
"step": 9760
},
{
"epoch": 0.04038616949376743,
"grad_norm": 3.983712880561037,
"learning_rate": 1.995355716721652e-05,
"loss": 1.4598,
"step": 9770
},
{
"epoch": 0.0404275064123895,
"grad_norm": 3.840919795852268,
"learning_rate": 1.995343080443645e-05,
"loss": 1.4456,
"step": 9780
},
{
"epoch": 0.04046884333101158,
"grad_norm": 3.901157368681076,
"learning_rate": 1.9953304270385607e-05,
"loss": 1.4525,
"step": 9790
},
{
"epoch": 0.040510180249633654,
"grad_norm": 3.189808091891606,
"learning_rate": 1.9953177565066163e-05,
"loss": 1.4462,
"step": 9800
},
{
"epoch": 0.040551517168255724,
"grad_norm": 4.025890961267514,
"learning_rate": 1.9953050688480293e-05,
"loss": 1.443,
"step": 9810
},
{
"epoch": 0.0405928540868778,
"grad_norm": 3.3710964799433007,
"learning_rate": 1.995292364063019e-05,
"loss": 1.4262,
"step": 9820
},
{
"epoch": 0.04063419100549988,
"grad_norm": 3.883950857165337,
"learning_rate": 1.9952796421518034e-05,
"loss": 1.4174,
"step": 9830
},
{
"epoch": 0.040675527924121956,
"grad_norm": 3.474777308443348,
"learning_rate": 1.995266903114602e-05,
"loss": 1.4654,
"step": 9840
},
{
"epoch": 0.040716864842744026,
"grad_norm": 3.116715119287666,
"learning_rate": 1.995254146951633e-05,
"loss": 1.3727,
"step": 9850
},
{
"epoch": 0.0407582017613661,
"grad_norm": 3.9762493552410203,
"learning_rate": 1.9952413736631165e-05,
"loss": 1.4567,
"step": 9860
},
{
"epoch": 0.04079953867998818,
"grad_norm": 2.9468825909120033,
"learning_rate": 1.9952285832492726e-05,
"loss": 1.4422,
"step": 9870
},
{
"epoch": 0.04084087559861025,
"grad_norm": 3.5348361015353444,
"learning_rate": 1.995215775710321e-05,
"loss": 1.3795,
"step": 9880
},
{
"epoch": 0.04088221251723233,
"grad_norm": 3.276927230678387,
"learning_rate": 1.995202951046482e-05,
"loss": 1.4218,
"step": 9890
},
{
"epoch": 0.040923549435854405,
"grad_norm": 3.606741214717579,
"learning_rate": 1.9951901092579763e-05,
"loss": 1.4364,
"step": 9900
},
{
"epoch": 0.040964886354476475,
"grad_norm": 4.3260733895333425,
"learning_rate": 1.9951772503450252e-05,
"loss": 1.4398,
"step": 9910
},
{
"epoch": 0.04100622327309855,
"grad_norm": 3.1621905456493544,
"learning_rate": 1.9951643743078496e-05,
"loss": 1.4397,
"step": 9920
},
{
"epoch": 0.04104756019172063,
"grad_norm": 3.4412582079657623,
"learning_rate": 1.9951514811466713e-05,
"loss": 1.4036,
"step": 9930
},
{
"epoch": 0.041088897110342706,
"grad_norm": 4.873896899838307,
"learning_rate": 1.995138570861712e-05,
"loss": 1.4263,
"step": 9940
},
{
"epoch": 0.04113023402896478,
"grad_norm": 4.331814069124075,
"learning_rate": 1.9951256434531943e-05,
"loss": 1.4817,
"step": 9950
},
{
"epoch": 0.041171570947586854,
"grad_norm": 3.9349360259135926,
"learning_rate": 1.9951126989213398e-05,
"loss": 1.4483,
"step": 9960
},
{
"epoch": 0.04121290786620893,
"grad_norm": 3.3613448968668864,
"learning_rate": 1.995099737266372e-05,
"loss": 1.4229,
"step": 9970
},
{
"epoch": 0.041254244784831,
"grad_norm": 3.549433934959654,
"learning_rate": 1.9950867584885132e-05,
"loss": 1.4283,
"step": 9980
},
{
"epoch": 0.04129558170345308,
"grad_norm": 3.5652364655273208,
"learning_rate": 1.995073762587987e-05,
"loss": 1.4642,
"step": 9990
},
{
"epoch": 0.041336918622075156,
"grad_norm": 4.029695967481624,
"learning_rate": 1.995060749565018e-05,
"loss": 1.3657,
"step": 10000
},
{
"epoch": 0.041336918622075156,
"eval_loss": 1.736175537109375,
"eval_runtime": 393.8494,
"eval_samples_per_second": 10.4,
"eval_steps_per_second": 2.6,
"step": 10000
},
{
"epoch": 0.04137825554069723,
"grad_norm": 3.414046152937389,
"learning_rate": 1.9950477194198287e-05,
"loss": 1.3957,
"step": 10010
},
{
"epoch": 0.0414195924593193,
"grad_norm": 5.320606616740586,
"learning_rate": 1.9950346721526443e-05,
"loss": 1.4508,
"step": 10020
},
{
"epoch": 0.04146092937794138,
"grad_norm": 3.9807925522216423,
"learning_rate": 1.9950216077636886e-05,
"loss": 1.3943,
"step": 10030
},
{
"epoch": 0.04150226629656346,
"grad_norm": 3.501083066632413,
"learning_rate": 1.9950085262531868e-05,
"loss": 1.4352,
"step": 10040
},
{
"epoch": 0.04154360321518553,
"grad_norm": 3.771268569637735,
"learning_rate": 1.994995427621364e-05,
"loss": 1.452,
"step": 10050
},
{
"epoch": 0.041584940133807605,
"grad_norm": 3.8515101224909216,
"learning_rate": 1.9949823118684454e-05,
"loss": 1.4306,
"step": 10060
},
{
"epoch": 0.04162627705242968,
"grad_norm": 3.7934745333554782,
"learning_rate": 1.9949691789946567e-05,
"loss": 1.4805,
"step": 10070
},
{
"epoch": 0.04166761397105176,
"grad_norm": 3.396103842576295,
"learning_rate": 1.9949560290002245e-05,
"loss": 1.4516,
"step": 10080
},
{
"epoch": 0.04170895088967383,
"grad_norm": 3.4268415637061085,
"learning_rate": 1.994942861885374e-05,
"loss": 1.4143,
"step": 10090
},
{
"epoch": 0.041750287808295906,
"grad_norm": 3.4582505595292203,
"learning_rate": 1.9949296776503324e-05,
"loss": 1.3815,
"step": 10100
},
{
"epoch": 0.041791624726917984,
"grad_norm": 3.5395990026077304,
"learning_rate": 1.994916476295327e-05,
"loss": 1.4449,
"step": 10110
},
{
"epoch": 0.041832961645540054,
"grad_norm": 3.4229281128115403,
"learning_rate": 1.9949032578205834e-05,
"loss": 1.4526,
"step": 10120
},
{
"epoch": 0.04187429856416213,
"grad_norm": 3.983206436567361,
"learning_rate": 1.994890022226331e-05,
"loss": 1.4463,
"step": 10130
},
{
"epoch": 0.04191563548278421,
"grad_norm": 3.668734425155437,
"learning_rate": 1.9948767695127964e-05,
"loss": 1.419,
"step": 10140
},
{
"epoch": 0.041956972401406285,
"grad_norm": 3.3634372280714517,
"learning_rate": 1.9948634996802078e-05,
"loss": 1.4329,
"step": 10150
},
{
"epoch": 0.041998309320028356,
"grad_norm": 4.062775728402737,
"learning_rate": 1.9948502127287936e-05,
"loss": 1.4361,
"step": 10160
},
{
"epoch": 0.04203964623865043,
"grad_norm": 3.4149660693597084,
"learning_rate": 1.9948369086587823e-05,
"loss": 1.4725,
"step": 10170
},
{
"epoch": 0.04208098315727251,
"grad_norm": 3.6916128915527313,
"learning_rate": 1.9948235874704035e-05,
"loss": 1.4732,
"step": 10180
},
{
"epoch": 0.04212232007589458,
"grad_norm": 3.9231999868924206,
"learning_rate": 1.9948102491638853e-05,
"loss": 1.4558,
"step": 10190
},
{
"epoch": 0.04216365699451666,
"grad_norm": 4.846870150341976,
"learning_rate": 1.9947968937394583e-05,
"loss": 1.4455,
"step": 10200
},
{
"epoch": 0.042204993913138734,
"grad_norm": 3.426175390236964,
"learning_rate": 1.9947835211973517e-05,
"loss": 1.3997,
"step": 10210
},
{
"epoch": 0.042246330831760805,
"grad_norm": 3.7909997652306258,
"learning_rate": 1.9947701315377954e-05,
"loss": 1.4361,
"step": 10220
},
{
"epoch": 0.04228766775038288,
"grad_norm": 3.535939765317278,
"learning_rate": 1.9947567247610206e-05,
"loss": 1.4449,
"step": 10230
},
{
"epoch": 0.04232900466900496,
"grad_norm": 3.3731810089302523,
"learning_rate": 1.9947433008672572e-05,
"loss": 1.4193,
"step": 10240
},
{
"epoch": 0.042370341587627036,
"grad_norm": 3.9292291070435077,
"learning_rate": 1.9947298598567364e-05,
"loss": 1.4657,
"step": 10250
},
{
"epoch": 0.042411678506249106,
"grad_norm": 3.369066359531392,
"learning_rate": 1.99471640172969e-05,
"loss": 1.4509,
"step": 10260
},
{
"epoch": 0.042453015424871184,
"grad_norm": 3.6668982318612495,
"learning_rate": 1.994702926486349e-05,
"loss": 1.3931,
"step": 10270
},
{
"epoch": 0.04249435234349326,
"grad_norm": 3.2034209344506097,
"learning_rate": 1.9946894341269453e-05,
"loss": 1.4217,
"step": 10280
},
{
"epoch": 0.04253568926211533,
"grad_norm": 4.400853617662863,
"learning_rate": 1.9946759246517113e-05,
"loss": 1.4544,
"step": 10290
},
{
"epoch": 0.04257702618073741,
"grad_norm": 3.1712083272819473,
"learning_rate": 1.9946623980608792e-05,
"loss": 1.4813,
"step": 10300
},
{
"epoch": 0.042618363099359485,
"grad_norm": 3.5677581184867395,
"learning_rate": 1.994648854354682e-05,
"loss": 1.4321,
"step": 10310
},
{
"epoch": 0.04265970001798156,
"grad_norm": 3.38462741867337,
"learning_rate": 1.9946352935333528e-05,
"loss": 1.3907,
"step": 10320
},
{
"epoch": 0.04270103693660363,
"grad_norm": 3.6690520985143054,
"learning_rate": 1.994621715597125e-05,
"loss": 1.453,
"step": 10330
},
{
"epoch": 0.04274237385522571,
"grad_norm": 3.628541207308318,
"learning_rate": 1.9946081205462315e-05,
"loss": 1.4224,
"step": 10340
},
{
"epoch": 0.04278371077384779,
"grad_norm": 3.573675637942579,
"learning_rate": 1.994594508380907e-05,
"loss": 1.4409,
"step": 10350
},
{
"epoch": 0.04282504769246986,
"grad_norm": 3.9512735810988584,
"learning_rate": 1.9945808791013857e-05,
"loss": 1.4116,
"step": 10360
},
{
"epoch": 0.042866384611091934,
"grad_norm": 3.2189804332946936,
"learning_rate": 1.994567232707902e-05,
"loss": 1.4239,
"step": 10370
},
{
"epoch": 0.04290772152971401,
"grad_norm": 3.277452726822312,
"learning_rate": 1.9945535692006903e-05,
"loss": 1.419,
"step": 10380
},
{
"epoch": 0.04294905844833609,
"grad_norm": 3.2409307004738594,
"learning_rate": 1.994539888579986e-05,
"loss": 1.412,
"step": 10390
},
{
"epoch": 0.04299039536695816,
"grad_norm": 5.955091940094864,
"learning_rate": 1.9945261908460248e-05,
"loss": 1.4001,
"step": 10400
},
{
"epoch": 0.043031732285580236,
"grad_norm": 3.626590483447886,
"learning_rate": 1.9945124759990424e-05,
"loss": 1.4598,
"step": 10410
},
{
"epoch": 0.04307306920420231,
"grad_norm": 3.4065599382832197,
"learning_rate": 1.9944987440392742e-05,
"loss": 1.3991,
"step": 10420
},
{
"epoch": 0.043114406122824384,
"grad_norm": 3.3228235524159824,
"learning_rate": 1.994484994966957e-05,
"loss": 1.4069,
"step": 10430
},
{
"epoch": 0.04315574304144646,
"grad_norm": 4.069300982498486,
"learning_rate": 1.9944712287823275e-05,
"loss": 1.4376,
"step": 10440
},
{
"epoch": 0.04319707996006854,
"grad_norm": 3.668990751455877,
"learning_rate": 1.9944574454856216e-05,
"loss": 1.4185,
"step": 10450
},
{
"epoch": 0.043238416878690615,
"grad_norm": 3.189803317003545,
"learning_rate": 1.9944436450770775e-05,
"loss": 1.3998,
"step": 10460
},
{
"epoch": 0.043279753797312685,
"grad_norm": 3.7817594340150924,
"learning_rate": 1.9944298275569328e-05,
"loss": 1.4494,
"step": 10470
},
{
"epoch": 0.04332109071593476,
"grad_norm": 4.780904235096889,
"learning_rate": 1.9944159929254245e-05,
"loss": 1.4616,
"step": 10480
},
{
"epoch": 0.04336242763455684,
"grad_norm": 4.010329780152807,
"learning_rate": 1.9944021411827905e-05,
"loss": 1.4532,
"step": 10490
},
{
"epoch": 0.04340376455317891,
"grad_norm": 4.209509632753131,
"learning_rate": 1.9943882723292704e-05,
"loss": 1.4622,
"step": 10500
},
{
"epoch": 0.04344510147180099,
"grad_norm": 3.228687583673167,
"learning_rate": 1.9943743863651017e-05,
"loss": 1.4053,
"step": 10510
},
{
"epoch": 0.043486438390423064,
"grad_norm": 3.288729771085999,
"learning_rate": 1.994360483290523e-05,
"loss": 1.4192,
"step": 10520
},
{
"epoch": 0.043527775309045134,
"grad_norm": 4.41078023777337,
"learning_rate": 1.994346563105775e-05,
"loss": 1.4098,
"step": 10530
},
{
"epoch": 0.04356911222766721,
"grad_norm": 3.1815816594140487,
"learning_rate": 1.9943326258110963e-05,
"loss": 1.4676,
"step": 10540
},
{
"epoch": 0.04361044914628929,
"grad_norm": 3.554730176042178,
"learning_rate": 1.994318671406727e-05,
"loss": 1.4262,
"step": 10550
},
{
"epoch": 0.043651786064911366,
"grad_norm": 4.564103408690964,
"learning_rate": 1.9943046998929073e-05,
"loss": 1.4104,
"step": 10560
},
{
"epoch": 0.043693122983533436,
"grad_norm": 3.5454961573863994,
"learning_rate": 1.994290711269877e-05,
"loss": 1.4235,
"step": 10570
},
{
"epoch": 0.04373445990215551,
"grad_norm": 3.6248317766975857,
"learning_rate": 1.9942767055378775e-05,
"loss": 1.3733,
"step": 10580
},
{
"epoch": 0.04377579682077759,
"grad_norm": 3.2489128741687123,
"learning_rate": 1.9942626826971493e-05,
"loss": 1.4456,
"step": 10590
},
{
"epoch": 0.04381713373939966,
"grad_norm": 3.5799361135868057,
"learning_rate": 1.994248642747934e-05,
"loss": 1.4071,
"step": 10600
},
{
"epoch": 0.04385847065802174,
"grad_norm": 3.4391607635624033,
"learning_rate": 1.9942345856904727e-05,
"loss": 1.388,
"step": 10610
},
{
"epoch": 0.043899807576643815,
"grad_norm": 4.900926633402934,
"learning_rate": 1.994220511525008e-05,
"loss": 1.4214,
"step": 10620
},
{
"epoch": 0.04394114449526589,
"grad_norm": 3.1998682814537807,
"learning_rate": 1.994206420251782e-05,
"loss": 1.4392,
"step": 10630
},
{
"epoch": 0.04398248141388796,
"grad_norm": 3.512730762072939,
"learning_rate": 1.9941923118710366e-05,
"loss": 1.3833,
"step": 10640
},
{
"epoch": 0.04402381833251004,
"grad_norm": 3.5959575986075354,
"learning_rate": 1.9941781863830153e-05,
"loss": 1.4666,
"step": 10650
},
{
"epoch": 0.04406515525113212,
"grad_norm": 4.116993239444605,
"learning_rate": 1.9941640437879603e-05,
"loss": 1.417,
"step": 10660
},
{
"epoch": 0.04410649216975419,
"grad_norm": 4.587080576933717,
"learning_rate": 1.9941498840861153e-05,
"loss": 1.3558,
"step": 10670
},
{
"epoch": 0.044147829088376264,
"grad_norm": 3.923348655449712,
"learning_rate": 1.9941357072777245e-05,
"loss": 1.403,
"step": 10680
},
{
"epoch": 0.04418916600699834,
"grad_norm": 3.3131343328753884,
"learning_rate": 1.9941215133630312e-05,
"loss": 1.414,
"step": 10690
},
{
"epoch": 0.04423050292562042,
"grad_norm": 3.815180569497117,
"learning_rate": 1.9941073023422796e-05,
"loss": 1.4567,
"step": 10700
},
{
"epoch": 0.04427183984424249,
"grad_norm": 3.0191885771803264,
"learning_rate": 1.994093074215715e-05,
"loss": 1.4257,
"step": 10710
},
{
"epoch": 0.044313176762864566,
"grad_norm": 3.4376292652965494,
"learning_rate": 1.994078828983581e-05,
"loss": 1.4118,
"step": 10720
},
{
"epoch": 0.04435451368148664,
"grad_norm": 3.5106899932837643,
"learning_rate": 1.994064566646124e-05,
"loss": 1.4159,
"step": 10730
},
{
"epoch": 0.04439585060010871,
"grad_norm": 3.6846637686102413,
"learning_rate": 1.9940502872035888e-05,
"loss": 1.3948,
"step": 10740
},
{
"epoch": 0.04443718751873079,
"grad_norm": 3.657265133747329,
"learning_rate": 1.9940359906562207e-05,
"loss": 1.4087,
"step": 10750
},
{
"epoch": 0.04447852443735287,
"grad_norm": 4.430332521129557,
"learning_rate": 1.9940216770042666e-05,
"loss": 1.3989,
"step": 10760
},
{
"epoch": 0.044519861355974945,
"grad_norm": 4.43254812995105,
"learning_rate": 1.994007346247972e-05,
"loss": 1.3781,
"step": 10770
},
{
"epoch": 0.044561198274597015,
"grad_norm": 3.547905857131194,
"learning_rate": 1.9939929983875837e-05,
"loss": 1.443,
"step": 10780
},
{
"epoch": 0.04460253519321909,
"grad_norm": 4.225199610421922,
"learning_rate": 1.9939786334233492e-05,
"loss": 1.3992,
"step": 10790
},
{
"epoch": 0.04464387211184117,
"grad_norm": 3.2850031799014494,
"learning_rate": 1.993964251355515e-05,
"loss": 1.39,
"step": 10800
},
{
"epoch": 0.04468520903046324,
"grad_norm": 3.576860893151518,
"learning_rate": 1.993949852184329e-05,
"loss": 1.4019,
"step": 10810
},
{
"epoch": 0.04472654594908532,
"grad_norm": 4.14729049725031,
"learning_rate": 1.9939354359100385e-05,
"loss": 1.407,
"step": 10820
},
{
"epoch": 0.044767882867707394,
"grad_norm": 3.6785935387585447,
"learning_rate": 1.9939210025328915e-05,
"loss": 1.4188,
"step": 10830
},
{
"epoch": 0.044809219786329464,
"grad_norm": 3.475380301816819,
"learning_rate": 1.993906552053137e-05,
"loss": 1.4146,
"step": 10840
},
{
"epoch": 0.04485055670495154,
"grad_norm": 3.3679721828323217,
"learning_rate": 1.9938920844710235e-05,
"loss": 1.4208,
"step": 10850
},
{
"epoch": 0.04489189362357362,
"grad_norm": 3.679471702118622,
"learning_rate": 1.9938775997867995e-05,
"loss": 1.4209,
"step": 10860
},
{
"epoch": 0.044933230542195696,
"grad_norm": 3.8980049289176377,
"learning_rate": 1.9938630980007147e-05,
"loss": 1.4121,
"step": 10870
},
{
"epoch": 0.044974567460817766,
"grad_norm": 3.7079901840906713,
"learning_rate": 1.9938485791130183e-05,
"loss": 1.3969,
"step": 10880
},
{
"epoch": 0.04501590437943984,
"grad_norm": 3.7675855531387996,
"learning_rate": 1.9938340431239603e-05,
"loss": 1.4012,
"step": 10890
},
{
"epoch": 0.04505724129806192,
"grad_norm": 3.3894112723434127,
"learning_rate": 1.9938194900337908e-05,
"loss": 1.4184,
"step": 10900
},
{
"epoch": 0.04509857821668399,
"grad_norm": 4.1568950335530825,
"learning_rate": 1.9938049198427604e-05,
"loss": 1.452,
"step": 10910
},
{
"epoch": 0.04513991513530607,
"grad_norm": 3.630087506411177,
"learning_rate": 1.9937903325511193e-05,
"loss": 1.4657,
"step": 10920
},
{
"epoch": 0.045181252053928145,
"grad_norm": 3.510575809020148,
"learning_rate": 1.9937757281591187e-05,
"loss": 1.4341,
"step": 10930
},
{
"epoch": 0.04522258897255022,
"grad_norm": 3.309825385197255,
"learning_rate": 1.9937611066670106e-05,
"loss": 1.3789,
"step": 10940
},
{
"epoch": 0.04526392589117229,
"grad_norm": 3.239522136091904,
"learning_rate": 1.9937464680750454e-05,
"loss": 1.4103,
"step": 10950
},
{
"epoch": 0.04530526280979437,
"grad_norm": 4.673675936224972,
"learning_rate": 1.9937318123834762e-05,
"loss": 1.3989,
"step": 10960
},
{
"epoch": 0.04534659972841645,
"grad_norm": 4.627358104306948,
"learning_rate": 1.9937171395925544e-05,
"loss": 1.4203,
"step": 10970
},
{
"epoch": 0.04538793664703852,
"grad_norm": 3.311365466265083,
"learning_rate": 1.9937024497025325e-05,
"loss": 1.389,
"step": 10980
},
{
"epoch": 0.045429273565660594,
"grad_norm": 4.134318195617502,
"learning_rate": 1.9936877427136637e-05,
"loss": 1.4224,
"step": 10990
},
{
"epoch": 0.04547061048428267,
"grad_norm": 3.006959002241816,
"learning_rate": 1.9936730186262007e-05,
"loss": 1.3988,
"step": 11000
},
{
"epoch": 0.04551194740290475,
"grad_norm": 4.01529741437254,
"learning_rate": 1.993658277440397e-05,
"loss": 1.4396,
"step": 11010
},
{
"epoch": 0.04555328432152682,
"grad_norm": 3.2748525540941507,
"learning_rate": 1.993643519156506e-05,
"loss": 1.3921,
"step": 11020
},
{
"epoch": 0.045594621240148896,
"grad_norm": 4.018973443549097,
"learning_rate": 1.9936287437747822e-05,
"loss": 1.3617,
"step": 11030
},
{
"epoch": 0.04563595815877097,
"grad_norm": 3.462150874636636,
"learning_rate": 1.993613951295479e-05,
"loss": 1.4075,
"step": 11040
},
{
"epoch": 0.04567729507739304,
"grad_norm": 3.6284928503493528,
"learning_rate": 1.9935991417188523e-05,
"loss": 1.3774,
"step": 11050
},
{
"epoch": 0.04571863199601512,
"grad_norm": 4.331109900085688,
"learning_rate": 1.9935843150451558e-05,
"loss": 1.4156,
"step": 11060
},
{
"epoch": 0.0457599689146372,
"grad_norm": 3.749722584209809,
"learning_rate": 1.9935694712746448e-05,
"loss": 1.4314,
"step": 11070
},
{
"epoch": 0.045801305833259275,
"grad_norm": 3.1035349095523266,
"learning_rate": 1.9935546104075746e-05,
"loss": 1.4167,
"step": 11080
},
{
"epoch": 0.045842642751881345,
"grad_norm": 4.134657657963317,
"learning_rate": 1.9935397324442015e-05,
"loss": 1.4377,
"step": 11090
},
{
"epoch": 0.04588397967050342,
"grad_norm": 3.378268647534537,
"learning_rate": 1.993524837384781e-05,
"loss": 1.4201,
"step": 11100
},
{
"epoch": 0.0459253165891255,
"grad_norm": 3.3061200097201615,
"learning_rate": 1.9935099252295694e-05,
"loss": 1.391,
"step": 11110
},
{
"epoch": 0.04596665350774757,
"grad_norm": 3.6514603716731133,
"learning_rate": 1.9934949959788237e-05,
"loss": 1.4423,
"step": 11120
},
{
"epoch": 0.046007990426369647,
"grad_norm": 3.2198262717397896,
"learning_rate": 1.9934800496328006e-05,
"loss": 1.4049,
"step": 11130
},
{
"epoch": 0.046049327344991724,
"grad_norm": 3.1555815125987197,
"learning_rate": 1.993465086191757e-05,
"loss": 1.4418,
"step": 11140
},
{
"epoch": 0.0460906642636138,
"grad_norm": 3.6391685610732476,
"learning_rate": 1.993450105655951e-05,
"loss": 1.3824,
"step": 11150
},
{
"epoch": 0.04613200118223587,
"grad_norm": 3.426206028662225,
"learning_rate": 1.9934351080256395e-05,
"loss": 1.3837,
"step": 11160
},
{
"epoch": 0.04617333810085795,
"grad_norm": 4.616699518929945,
"learning_rate": 1.9934200933010816e-05,
"loss": 1.3886,
"step": 11170
},
{
"epoch": 0.046214675019480025,
"grad_norm": 3.919463255914606,
"learning_rate": 1.993405061482535e-05,
"loss": 1.418,
"step": 11180
},
{
"epoch": 0.046256011938102096,
"grad_norm": 4.372063175345489,
"learning_rate": 1.9933900125702582e-05,
"loss": 1.3976,
"step": 11190
},
{
"epoch": 0.04629734885672417,
"grad_norm": 4.040676043168612,
"learning_rate": 1.9933749465645103e-05,
"loss": 1.4122,
"step": 11200
},
{
"epoch": 0.04633868577534625,
"grad_norm": 3.3730911260973815,
"learning_rate": 1.9933598634655512e-05,
"loss": 1.3707,
"step": 11210
},
{
"epoch": 0.04638002269396832,
"grad_norm": 3.6851619767350066,
"learning_rate": 1.9933447632736393e-05,
"loss": 1.398,
"step": 11220
},
{
"epoch": 0.0464213596125904,
"grad_norm": 4.001189833407079,
"learning_rate": 1.9933296459890355e-05,
"loss": 1.4071,
"step": 11230
},
{
"epoch": 0.046462696531212475,
"grad_norm": 3.3898995076664757,
"learning_rate": 1.993314511611999e-05,
"loss": 1.408,
"step": 11240
},
{
"epoch": 0.04650403344983455,
"grad_norm": 3.6996997277102146,
"learning_rate": 1.9932993601427912e-05,
"loss": 1.3975,
"step": 11250
},
{
"epoch": 0.04654537036845662,
"grad_norm": 3.313365690401916,
"learning_rate": 1.993284191581672e-05,
"loss": 1.408,
"step": 11260
},
{
"epoch": 0.0465867072870787,
"grad_norm": 4.633876867393197,
"learning_rate": 1.993269005928903e-05,
"loss": 1.396,
"step": 11270
},
{
"epoch": 0.046628044205700776,
"grad_norm": 3.5037620526852304,
"learning_rate": 1.993253803184745e-05,
"loss": 1.4024,
"step": 11280
},
{
"epoch": 0.046669381124322847,
"grad_norm": 3.081503642322238,
"learning_rate": 1.9932385833494597e-05,
"loss": 1.4109,
"step": 11290
},
{
"epoch": 0.046710718042944924,
"grad_norm": 3.547058360569091,
"learning_rate": 1.9932233464233092e-05,
"loss": 1.3796,
"step": 11300
},
{
"epoch": 0.046752054961567,
"grad_norm": 3.814887745242394,
"learning_rate": 1.9932080924065556e-05,
"loss": 1.4401,
"step": 11310
},
{
"epoch": 0.04679339188018908,
"grad_norm": 3.5004316252867085,
"learning_rate": 1.993192821299461e-05,
"loss": 1.452,
"step": 11320
},
{
"epoch": 0.04683472879881115,
"grad_norm": 3.449228750426351,
"learning_rate": 1.993177533102289e-05,
"loss": 1.3734,
"step": 11330
},
{
"epoch": 0.046876065717433225,
"grad_norm": 3.2964308484381784,
"learning_rate": 1.9931622278153024e-05,
"loss": 1.4018,
"step": 11340
},
{
"epoch": 0.0469174026360553,
"grad_norm": 2.8180078039607697,
"learning_rate": 1.993146905438764e-05,
"loss": 1.4081,
"step": 11350
},
{
"epoch": 0.04695873955467737,
"grad_norm": 3.2236402061866545,
"learning_rate": 1.9931315659729376e-05,
"loss": 1.4534,
"step": 11360
},
{
"epoch": 0.04700007647329945,
"grad_norm": 3.5082002531342473,
"learning_rate": 1.9931162094180874e-05,
"loss": 1.4173,
"step": 11370
},
{
"epoch": 0.04704141339192153,
"grad_norm": 3.284436648082263,
"learning_rate": 1.993100835774478e-05,
"loss": 1.4092,
"step": 11380
},
{
"epoch": 0.047082750310543604,
"grad_norm": 3.4816460403688314,
"learning_rate": 1.9930854450423736e-05,
"loss": 1.3913,
"step": 11390
},
{
"epoch": 0.047124087229165675,
"grad_norm": 4.14476906280773,
"learning_rate": 1.9930700372220387e-05,
"loss": 1.3703,
"step": 11400
},
{
"epoch": 0.04716542414778775,
"grad_norm": 4.375945137034217,
"learning_rate": 1.993054612313739e-05,
"loss": 1.4362,
"step": 11410
},
{
"epoch": 0.04720676106640983,
"grad_norm": 4.277168672236525,
"learning_rate": 1.993039170317739e-05,
"loss": 1.479,
"step": 11420
},
{
"epoch": 0.0472480979850319,
"grad_norm": 3.927053905964369,
"learning_rate": 1.9930237112343056e-05,
"loss": 1.3872,
"step": 11430
},
{
"epoch": 0.047289434903653976,
"grad_norm": 3.607150421833661,
"learning_rate": 1.9930082350637042e-05,
"loss": 1.3891,
"step": 11440
},
{
"epoch": 0.04733077182227605,
"grad_norm": 3.338415521714108,
"learning_rate": 1.992992741806201e-05,
"loss": 1.4211,
"step": 11450
},
{
"epoch": 0.04737210874089813,
"grad_norm": 3.629104841971202,
"learning_rate": 1.9929772314620627e-05,
"loss": 1.3425,
"step": 11460
},
{
"epoch": 0.0474134456595202,
"grad_norm": 3.099599716044088,
"learning_rate": 1.9929617040315563e-05,
"loss": 1.382,
"step": 11470
},
{
"epoch": 0.04745478257814228,
"grad_norm": 3.7158273713517893,
"learning_rate": 1.992946159514949e-05,
"loss": 1.4361,
"step": 11480
},
{
"epoch": 0.047496119496764355,
"grad_norm": 4.389528378421289,
"learning_rate": 1.992930597912508e-05,
"loss": 1.4435,
"step": 11490
},
{
"epoch": 0.047537456415386425,
"grad_norm": 3.669631509338215,
"learning_rate": 1.9929150192245016e-05,
"loss": 1.4321,
"step": 11500
},
{
"epoch": 0.0475787933340085,
"grad_norm": 3.4995914344254624,
"learning_rate": 1.992899423451197e-05,
"loss": 1.446,
"step": 11510
},
{
"epoch": 0.04762013025263058,
"grad_norm": 3.3809354325443017,
"learning_rate": 1.9928838105928635e-05,
"loss": 1.3941,
"step": 11520
},
{
"epoch": 0.04766146717125265,
"grad_norm": 3.1788524195701684,
"learning_rate": 1.9928681806497693e-05,
"loss": 1.4027,
"step": 11530
},
{
"epoch": 0.04770280408987473,
"grad_norm": 3.368111520244943,
"learning_rate": 1.9928525336221837e-05,
"loss": 1.4038,
"step": 11540
},
{
"epoch": 0.047744141008496804,
"grad_norm": 3.6045389016529636,
"learning_rate": 1.992836869510375e-05,
"loss": 1.4523,
"step": 11550
},
{
"epoch": 0.04778547792711888,
"grad_norm": 3.859885628963866,
"learning_rate": 1.9928211883146136e-05,
"loss": 1.4307,
"step": 11560
},
{
"epoch": 0.04782681484574095,
"grad_norm": 3.2925025525674756,
"learning_rate": 1.9928054900351693e-05,
"loss": 1.4473,
"step": 11570
},
{
"epoch": 0.04786815176436303,
"grad_norm": 4.075772971569745,
"learning_rate": 1.992789774672312e-05,
"loss": 1.4384,
"step": 11580
},
{
"epoch": 0.047909488682985106,
"grad_norm": 3.391288577095074,
"learning_rate": 1.9927740422263117e-05,
"loss": 1.4038,
"step": 11590
},
{
"epoch": 0.047950825601607176,
"grad_norm": 3.4783586083297626,
"learning_rate": 1.9927582926974402e-05,
"loss": 1.3911,
"step": 11600
},
{
"epoch": 0.04799216252022925,
"grad_norm": 3.468149363696469,
"learning_rate": 1.9927425260859673e-05,
"loss": 1.4123,
"step": 11610
},
{
"epoch": 0.04803349943885133,
"grad_norm": 3.464512963986536,
"learning_rate": 1.992726742392165e-05,
"loss": 1.3973,
"step": 11620
},
{
"epoch": 0.04807483635747341,
"grad_norm": 4.061746689943389,
"learning_rate": 1.992710941616305e-05,
"loss": 1.3841,
"step": 11630
},
{
"epoch": 0.04811617327609548,
"grad_norm": 4.139608234706919,
"learning_rate": 1.992695123758659e-05,
"loss": 1.3787,
"step": 11640
},
{
"epoch": 0.048157510194717555,
"grad_norm": 3.8447070353873025,
"learning_rate": 1.992679288819499e-05,
"loss": 1.3815,
"step": 11650
},
{
"epoch": 0.04819884711333963,
"grad_norm": 3.6946628562082693,
"learning_rate": 1.9926634367990973e-05,
"loss": 1.3788,
"step": 11660
},
{
"epoch": 0.0482401840319617,
"grad_norm": 3.3618354267056465,
"learning_rate": 1.992647567697727e-05,
"loss": 1.4063,
"step": 11670
},
{
"epoch": 0.04828152095058378,
"grad_norm": 3.495643041214259,
"learning_rate": 1.9926316815156617e-05,
"loss": 1.4348,
"step": 11680
},
{
"epoch": 0.04832285786920586,
"grad_norm": 3.552951920155812,
"learning_rate": 1.9926157782531735e-05,
"loss": 1.3604,
"step": 11690
},
{
"epoch": 0.048364194787827934,
"grad_norm": 3.2600229354667025,
"learning_rate": 1.9925998579105374e-05,
"loss": 1.3395,
"step": 11700
},
{
"epoch": 0.048405531706450004,
"grad_norm": 3.4189972893062635,
"learning_rate": 1.9925839204880263e-05,
"loss": 1.4291,
"step": 11710
},
{
"epoch": 0.04844686862507208,
"grad_norm": 3.2696931259943494,
"learning_rate": 1.9925679659859148e-05,
"loss": 1.3748,
"step": 11720
},
{
"epoch": 0.04848820554369416,
"grad_norm": 3.400522141333647,
"learning_rate": 1.9925519944044772e-05,
"loss": 1.4141,
"step": 11730
},
{
"epoch": 0.04852954246231623,
"grad_norm": 3.347216625916271,
"learning_rate": 1.9925360057439887e-05,
"loss": 1.4062,
"step": 11740
},
{
"epoch": 0.048570879380938306,
"grad_norm": 3.55815106093295,
"learning_rate": 1.9925200000047248e-05,
"loss": 1.4056,
"step": 11750
},
{
"epoch": 0.04861221629956038,
"grad_norm": 4.75921473200291,
"learning_rate": 1.99250397718696e-05,
"loss": 1.418,
"step": 11760
},
{
"epoch": 0.04865355321818246,
"grad_norm": 3.2697324056422588,
"learning_rate": 1.9924879372909703e-05,
"loss": 1.4015,
"step": 11770
},
{
"epoch": 0.04869489013680453,
"grad_norm": 3.590517133814017,
"learning_rate": 1.9924718803170324e-05,
"loss": 1.3738,
"step": 11780
},
{
"epoch": 0.04873622705542661,
"grad_norm": 3.579850493829701,
"learning_rate": 1.9924558062654215e-05,
"loss": 1.334,
"step": 11790
},
{
"epoch": 0.048777563974048685,
"grad_norm": 3.552170187760315,
"learning_rate": 1.9924397151364148e-05,
"loss": 1.4169,
"step": 11800
},
{
"epoch": 0.048818900892670755,
"grad_norm": 3.0842304871945037,
"learning_rate": 1.992423606930289e-05,
"loss": 1.3906,
"step": 11810
},
{
"epoch": 0.04886023781129283,
"grad_norm": 3.379537547677814,
"learning_rate": 1.9924074816473215e-05,
"loss": 1.4351,
"step": 11820
},
{
"epoch": 0.04890157472991491,
"grad_norm": 3.53218922925584,
"learning_rate": 1.9923913392877896e-05,
"loss": 1.4032,
"step": 11830
},
{
"epoch": 0.04894291164853698,
"grad_norm": 5.016060492627983,
"learning_rate": 1.992375179851971e-05,
"loss": 1.3918,
"step": 11840
},
{
"epoch": 0.04898424856715906,
"grad_norm": 3.540388411237351,
"learning_rate": 1.9923590033401443e-05,
"loss": 1.4196,
"step": 11850
},
{
"epoch": 0.049025585485781134,
"grad_norm": 3.3104883369223765,
"learning_rate": 1.9923428097525872e-05,
"loss": 1.4141,
"step": 11860
},
{
"epoch": 0.04906692240440321,
"grad_norm": 3.580413781862322,
"learning_rate": 1.9923265990895785e-05,
"loss": 1.4291,
"step": 11870
},
{
"epoch": 0.04910825932302528,
"grad_norm": 4.057142008160253,
"learning_rate": 1.9923103713513972e-05,
"loss": 1.4193,
"step": 11880
},
{
"epoch": 0.04914959624164736,
"grad_norm": 3.157870137031843,
"learning_rate": 1.9922941265383226e-05,
"loss": 1.3949,
"step": 11890
},
{
"epoch": 0.049190933160269436,
"grad_norm": 3.7453792892755255,
"learning_rate": 1.992277864650634e-05,
"loss": 1.373,
"step": 11900
},
{
"epoch": 0.049232270078891506,
"grad_norm": 4.17858858176331,
"learning_rate": 1.992261585688611e-05,
"loss": 1.3732,
"step": 11910
},
{
"epoch": 0.04927360699751358,
"grad_norm": 3.3040489276601157,
"learning_rate": 1.992245289652535e-05,
"loss": 1.373,
"step": 11920
},
{
"epoch": 0.04931494391613566,
"grad_norm": 3.0410499391716117,
"learning_rate": 1.992228976542685e-05,
"loss": 1.3839,
"step": 11930
},
{
"epoch": 0.04935628083475774,
"grad_norm": 3.541114095436553,
"learning_rate": 1.9922126463593422e-05,
"loss": 1.4006,
"step": 11940
},
{
"epoch": 0.04939761775337981,
"grad_norm": 3.9811902872742673,
"learning_rate": 1.992196299102788e-05,
"loss": 1.3546,
"step": 11950
},
{
"epoch": 0.049438954672001885,
"grad_norm": 3.1780875587126705,
"learning_rate": 1.9921799347733026e-05,
"loss": 1.3693,
"step": 11960
},
{
"epoch": 0.04948029159062396,
"grad_norm": 3.5586898768297415,
"learning_rate": 1.9921635533711687e-05,
"loss": 1.4215,
"step": 11970
},
{
"epoch": 0.04952162850924603,
"grad_norm": 2.8477978920610565,
"learning_rate": 1.9921471548966678e-05,
"loss": 1.4256,
"step": 11980
},
{
"epoch": 0.04956296542786811,
"grad_norm": 3.5714848436239275,
"learning_rate": 1.9921307393500822e-05,
"loss": 1.4358,
"step": 11990
},
{
"epoch": 0.04960430234649019,
"grad_norm": 3.746696171382991,
"learning_rate": 1.992114306731694e-05,
"loss": 1.4556,
"step": 12000
},
{
"epoch": 0.049645639265112264,
"grad_norm": 3.6562155551861353,
"learning_rate": 1.992097857041786e-05,
"loss": 1.404,
"step": 12010
},
{
"epoch": 0.049686976183734334,
"grad_norm": 3.2699664906016803,
"learning_rate": 1.9920813902806414e-05,
"loss": 1.3946,
"step": 12020
},
{
"epoch": 0.04972831310235641,
"grad_norm": 4.034724844068233,
"learning_rate": 1.992064906448544e-05,
"loss": 1.4321,
"step": 12030
},
{
"epoch": 0.04976965002097849,
"grad_norm": 3.4822925081412497,
"learning_rate": 1.9920484055457767e-05,
"loss": 1.4105,
"step": 12040
},
{
"epoch": 0.04981098693960056,
"grad_norm": 3.587853195823534,
"learning_rate": 1.9920318875726238e-05,
"loss": 1.3697,
"step": 12050
},
{
"epoch": 0.049852323858222636,
"grad_norm": 3.205239993732016,
"learning_rate": 1.9920153525293694e-05,
"loss": 1.3979,
"step": 12060
},
{
"epoch": 0.04989366077684471,
"grad_norm": 3.388133162074283,
"learning_rate": 1.991998800416298e-05,
"loss": 1.4023,
"step": 12070
},
{
"epoch": 0.04993499769546679,
"grad_norm": 4.2719394165918985,
"learning_rate": 1.9919822312336947e-05,
"loss": 1.3956,
"step": 12080
},
{
"epoch": 0.04997633461408886,
"grad_norm": 3.234674011512212,
"learning_rate": 1.9919656449818444e-05,
"loss": 1.4101,
"step": 12090
},
{
"epoch": 0.05001767153271094,
"grad_norm": 3.4860946302341107,
"learning_rate": 1.9919490416610327e-05,
"loss": 1.3802,
"step": 12100
},
{
"epoch": 0.050059008451333015,
"grad_norm": 3.3542639404969483,
"learning_rate": 1.9919324212715448e-05,
"loss": 1.3865,
"step": 12110
},
{
"epoch": 0.050100345369955085,
"grad_norm": 3.7877647269232293,
"learning_rate": 1.9919157838136668e-05,
"loss": 1.4198,
"step": 12120
},
{
"epoch": 0.05014168228857716,
"grad_norm": 3.6768245303830214,
"learning_rate": 1.9918991292876857e-05,
"loss": 1.392,
"step": 12130
},
{
"epoch": 0.05018301920719924,
"grad_norm": 3.0504339083555054,
"learning_rate": 1.9918824576938872e-05,
"loss": 1.3943,
"step": 12140
},
{
"epoch": 0.05022435612582131,
"grad_norm": 4.003605772511745,
"learning_rate": 1.9918657690325586e-05,
"loss": 1.3627,
"step": 12150
},
{
"epoch": 0.05026569304444339,
"grad_norm": 4.04766902184383,
"learning_rate": 1.9918490633039873e-05,
"loss": 1.3867,
"step": 12160
},
{
"epoch": 0.050307029963065464,
"grad_norm": 4.054619230075598,
"learning_rate": 1.99183234050846e-05,
"loss": 1.3558,
"step": 12170
},
{
"epoch": 0.05034836688168754,
"grad_norm": 3.3575853249481873,
"learning_rate": 1.9918156006462653e-05,
"loss": 1.3863,
"step": 12180
},
{
"epoch": 0.05038970380030961,
"grad_norm": 3.5267112119515582,
"learning_rate": 1.9917988437176908e-05,
"loss": 1.3705,
"step": 12190
},
{
"epoch": 0.05043104071893169,
"grad_norm": 3.528333383483439,
"learning_rate": 1.9917820697230247e-05,
"loss": 1.4441,
"step": 12200
},
{
"epoch": 0.050472377637553766,
"grad_norm": 3.233459986557015,
"learning_rate": 1.991765278662556e-05,
"loss": 1.3532,
"step": 12210
},
{
"epoch": 0.050513714556175836,
"grad_norm": 3.4949233408933034,
"learning_rate": 1.991748470536573e-05,
"loss": 1.3658,
"step": 12220
},
{
"epoch": 0.05055505147479791,
"grad_norm": 3.194405798297394,
"learning_rate": 1.9917316453453657e-05,
"loss": 1.397,
"step": 12230
},
{
"epoch": 0.05059638839341999,
"grad_norm": 3.6606193297707046,
"learning_rate": 1.9917148030892238e-05,
"loss": 1.4072,
"step": 12240
},
{
"epoch": 0.05063772531204207,
"grad_norm": 4.069430004373759,
"learning_rate": 1.9916979437684362e-05,
"loss": 1.4136,
"step": 12250
},
{
"epoch": 0.05067906223066414,
"grad_norm": 3.1603885636318956,
"learning_rate": 1.991681067383293e-05,
"loss": 1.4097,
"step": 12260
},
{
"epoch": 0.050720399149286215,
"grad_norm": 3.177326748991991,
"learning_rate": 1.9916641739340857e-05,
"loss": 1.4195,
"step": 12270
},
{
"epoch": 0.05076173606790829,
"grad_norm": 3.918139727949541,
"learning_rate": 1.991647263421104e-05,
"loss": 1.3876,
"step": 12280
},
{
"epoch": 0.05080307298653036,
"grad_norm": 3.9573038369404427,
"learning_rate": 1.9916303358446392e-05,
"loss": 1.3683,
"step": 12290
},
{
"epoch": 0.05084440990515244,
"grad_norm": 4.275806482199394,
"learning_rate": 1.9916133912049825e-05,
"loss": 1.4204,
"step": 12300
},
{
"epoch": 0.050885746823774516,
"grad_norm": 4.847335915383492,
"learning_rate": 1.9915964295024254e-05,
"loss": 1.4034,
"step": 12310
},
{
"epoch": 0.050927083742396594,
"grad_norm": 3.273054866007932,
"learning_rate": 1.99157945073726e-05,
"loss": 1.3968,
"step": 12320
},
{
"epoch": 0.050968420661018664,
"grad_norm": 3.7167200760785173,
"learning_rate": 1.9915624549097784e-05,
"loss": 1.3999,
"step": 12330
},
{
"epoch": 0.05100975757964074,
"grad_norm": 3.4590376733691337,
"learning_rate": 1.991545442020273e-05,
"loss": 1.4122,
"step": 12340
},
{
"epoch": 0.05105109449826282,
"grad_norm": 3.1847142656240255,
"learning_rate": 1.9915284120690362e-05,
"loss": 1.4239,
"step": 12350
},
{
"epoch": 0.05109243141688489,
"grad_norm": 3.3467985786610353,
"learning_rate": 1.991511365056362e-05,
"loss": 1.3844,
"step": 12360
},
{
"epoch": 0.051133768335506966,
"grad_norm": 4.094929119314472,
"learning_rate": 1.9914943009825425e-05,
"loss": 1.3577,
"step": 12370
},
{
"epoch": 0.05117510525412904,
"grad_norm": 3.521925167773648,
"learning_rate": 1.9914772198478723e-05,
"loss": 1.3954,
"step": 12380
},
{
"epoch": 0.05121644217275112,
"grad_norm": 3.3998170027309067,
"learning_rate": 1.9914601216526446e-05,
"loss": 1.4102,
"step": 12390
},
{
"epoch": 0.05125777909137319,
"grad_norm": 3.6457154728502035,
"learning_rate": 1.9914430063971542e-05,
"loss": 1.3666,
"step": 12400
},
{
"epoch": 0.05129911600999527,
"grad_norm": 3.133616222950282,
"learning_rate": 1.9914258740816956e-05,
"loss": 1.4071,
"step": 12410
},
{
"epoch": 0.051340452928617344,
"grad_norm": 3.5165895133634133,
"learning_rate": 1.9914087247065634e-05,
"loss": 1.4127,
"step": 12420
},
{
"epoch": 0.051381789847239415,
"grad_norm": 3.1317185826456795,
"learning_rate": 1.991391558272052e-05,
"loss": 1.3609,
"step": 12430
},
{
"epoch": 0.05142312676586149,
"grad_norm": 3.7493626264431605,
"learning_rate": 1.991374374778458e-05,
"loss": 1.3832,
"step": 12440
},
{
"epoch": 0.05146446368448357,
"grad_norm": 3.5662863496670196,
"learning_rate": 1.991357174226076e-05,
"loss": 1.3665,
"step": 12450
},
{
"epoch": 0.05150580060310564,
"grad_norm": 3.1448919796113035,
"learning_rate": 1.9913399566152033e-05,
"loss": 1.3965,
"step": 12460
},
{
"epoch": 0.051547137521727716,
"grad_norm": 3.3154750106704625,
"learning_rate": 1.991322721946135e-05,
"loss": 1.3873,
"step": 12470
},
{
"epoch": 0.051588474440349794,
"grad_norm": 3.688390777094577,
"learning_rate": 1.991305470219168e-05,
"loss": 1.3224,
"step": 12480
},
{
"epoch": 0.05162981135897187,
"grad_norm": 3.0270982188671907,
"learning_rate": 1.9912882014345988e-05,
"loss": 1.3551,
"step": 12490
},
{
"epoch": 0.05167114827759394,
"grad_norm": 3.7441081256974136,
"learning_rate": 1.9912709155927254e-05,
"loss": 1.3945,
"step": 12500
},
{
"epoch": 0.05171248519621602,
"grad_norm": 3.697861466581755,
"learning_rate": 1.9912536126938446e-05,
"loss": 1.3612,
"step": 12510
},
{
"epoch": 0.051753822114838095,
"grad_norm": 3.523721321551833,
"learning_rate": 1.9912362927382546e-05,
"loss": 1.3747,
"step": 12520
},
{
"epoch": 0.051795159033460166,
"grad_norm": 3.0061067921910727,
"learning_rate": 1.9912189557262528e-05,
"loss": 1.4086,
"step": 12530
},
{
"epoch": 0.05183649595208224,
"grad_norm": 4.0409382024057425,
"learning_rate": 1.991201601658138e-05,
"loss": 1.3849,
"step": 12540
},
{
"epoch": 0.05187783287070432,
"grad_norm": 3.7066972530256983,
"learning_rate": 1.9911842305342085e-05,
"loss": 1.3775,
"step": 12550
},
{
"epoch": 0.0519191697893264,
"grad_norm": 3.8641850919990737,
"learning_rate": 1.9911668423547635e-05,
"loss": 1.4056,
"step": 12560
},
{
"epoch": 0.05196050670794847,
"grad_norm": 2.8845765281029325,
"learning_rate": 1.9911494371201023e-05,
"loss": 1.3433,
"step": 12570
},
{
"epoch": 0.052001843626570544,
"grad_norm": 2.9182840148796996,
"learning_rate": 1.9911320148305235e-05,
"loss": 1.4146,
"step": 12580
},
{
"epoch": 0.05204318054519262,
"grad_norm": 3.471309021112678,
"learning_rate": 1.991114575486328e-05,
"loss": 1.3769,
"step": 12590
},
{
"epoch": 0.05208451746381469,
"grad_norm": 3.132099407416561,
"learning_rate": 1.9910971190878157e-05,
"loss": 1.4006,
"step": 12600
},
{
"epoch": 0.05212585438243677,
"grad_norm": 3.5078473659046554,
"learning_rate": 1.9910796456352863e-05,
"loss": 1.3608,
"step": 12610
},
{
"epoch": 0.052167191301058846,
"grad_norm": 3.420611210950219,
"learning_rate": 1.991062155129041e-05,
"loss": 1.3477,
"step": 12620
},
{
"epoch": 0.05220852821968092,
"grad_norm": 3.3602682043236425,
"learning_rate": 1.991044647569381e-05,
"loss": 1.3821,
"step": 12630
},
{
"epoch": 0.052249865138302994,
"grad_norm": 3.3934199487204326,
"learning_rate": 1.9910271229566067e-05,
"loss": 1.3672,
"step": 12640
},
{
"epoch": 0.05229120205692507,
"grad_norm": 3.3930743766477636,
"learning_rate": 1.9910095812910205e-05,
"loss": 1.3805,
"step": 12650
},
{
"epoch": 0.05233253897554715,
"grad_norm": 3.803471056919821,
"learning_rate": 1.9909920225729237e-05,
"loss": 1.357,
"step": 12660
},
{
"epoch": 0.05237387589416922,
"grad_norm": 2.9908606514422766,
"learning_rate": 1.990974446802619e-05,
"loss": 1.4148,
"step": 12670
},
{
"epoch": 0.052415212812791295,
"grad_norm": 3.472506553773665,
"learning_rate": 1.990956853980408e-05,
"loss": 1.4119,
"step": 12680
},
{
"epoch": 0.05245654973141337,
"grad_norm": 3.652498665098648,
"learning_rate": 1.9909392441065944e-05,
"loss": 1.3896,
"step": 12690
},
{
"epoch": 0.05249788665003545,
"grad_norm": 3.556198285804122,
"learning_rate": 1.9909216171814802e-05,
"loss": 1.3556,
"step": 12700
},
{
"epoch": 0.05253922356865752,
"grad_norm": 3.3341578506950187,
"learning_rate": 1.9909039732053695e-05,
"loss": 1.3875,
"step": 12710
},
{
"epoch": 0.0525805604872796,
"grad_norm": 3.623737574209396,
"learning_rate": 1.9908863121785656e-05,
"loss": 1.3699,
"step": 12720
},
{
"epoch": 0.052621897405901674,
"grad_norm": 3.068120426816953,
"learning_rate": 1.9908686341013723e-05,
"loss": 1.3504,
"step": 12730
},
{
"epoch": 0.052663234324523744,
"grad_norm": 3.5988757581859643,
"learning_rate": 1.990850938974094e-05,
"loss": 1.3506,
"step": 12740
},
{
"epoch": 0.05270457124314582,
"grad_norm": 3.4850198824984724,
"learning_rate": 1.990833226797035e-05,
"loss": 1.3949,
"step": 12750
},
{
"epoch": 0.0527459081617679,
"grad_norm": 3.3573178296822834,
"learning_rate": 1.9908154975705e-05,
"loss": 1.3766,
"step": 12760
},
{
"epoch": 0.052787245080389976,
"grad_norm": 3.5288003708700186,
"learning_rate": 1.990797751294795e-05,
"loss": 1.3915,
"step": 12770
},
{
"epoch": 0.052828581999012046,
"grad_norm": 3.0065181585529794,
"learning_rate": 1.990779987970224e-05,
"loss": 1.3943,
"step": 12780
},
{
"epoch": 0.05286991891763412,
"grad_norm": 3.8902210517557787,
"learning_rate": 1.9907622075970933e-05,
"loss": 1.4339,
"step": 12790
},
{
"epoch": 0.0529112558362562,
"grad_norm": 3.0754498963080317,
"learning_rate": 1.990744410175709e-05,
"loss": 1.3633,
"step": 12800
},
{
"epoch": 0.05295259275487827,
"grad_norm": 3.373819633563616,
"learning_rate": 1.990726595706377e-05,
"loss": 1.3729,
"step": 12810
},
{
"epoch": 0.05299392967350035,
"grad_norm": 3.2413750593238277,
"learning_rate": 1.990708764189404e-05,
"loss": 1.3611,
"step": 12820
},
{
"epoch": 0.053035266592122425,
"grad_norm": 3.3175842583387287,
"learning_rate": 1.990690915625097e-05,
"loss": 1.4386,
"step": 12830
},
{
"epoch": 0.053076603510744495,
"grad_norm": 4.421464949416987,
"learning_rate": 1.9906730500137626e-05,
"loss": 1.3825,
"step": 12840
},
{
"epoch": 0.05311794042936657,
"grad_norm": 3.7375473757828312,
"learning_rate": 1.9906551673557092e-05,
"loss": 1.3584,
"step": 12850
},
{
"epoch": 0.05315927734798865,
"grad_norm": 4.23504822699641,
"learning_rate": 1.9906372676512435e-05,
"loss": 1.3655,
"step": 12860
},
{
"epoch": 0.05320061426661073,
"grad_norm": 3.674354440233681,
"learning_rate": 1.9906193509006737e-05,
"loss": 1.3652,
"step": 12870
},
{
"epoch": 0.0532419511852328,
"grad_norm": 3.270440974926962,
"learning_rate": 1.9906014171043085e-05,
"loss": 1.408,
"step": 12880
},
{
"epoch": 0.053283288103854874,
"grad_norm": 3.4328461661592007,
"learning_rate": 1.9905834662624562e-05,
"loss": 1.3881,
"step": 12890
},
{
"epoch": 0.05332462502247695,
"grad_norm": 3.296815547244285,
"learning_rate": 1.9905654983754255e-05,
"loss": 1.3099,
"step": 12900
},
{
"epoch": 0.05336596194109902,
"grad_norm": 3.182778307558256,
"learning_rate": 1.9905475134435265e-05,
"loss": 1.3887,
"step": 12910
},
{
"epoch": 0.0534072988597211,
"grad_norm": 4.066051141098089,
"learning_rate": 1.9905295114670674e-05,
"loss": 1.3615,
"step": 12920
},
{
"epoch": 0.053448635778343176,
"grad_norm": 3.62599590443558,
"learning_rate": 1.9905114924463592e-05,
"loss": 1.3461,
"step": 12930
},
{
"epoch": 0.05348997269696525,
"grad_norm": 4.5391383467222735,
"learning_rate": 1.9904934563817106e-05,
"loss": 1.3543,
"step": 12940
},
{
"epoch": 0.05353130961558732,
"grad_norm": 4.04368698393749,
"learning_rate": 1.990475403273433e-05,
"loss": 1.3712,
"step": 12950
},
{
"epoch": 0.0535726465342094,
"grad_norm": 3.9811173894333836,
"learning_rate": 1.9904573331218365e-05,
"loss": 1.4334,
"step": 12960
},
{
"epoch": 0.05361398345283148,
"grad_norm": 3.499953327542098,
"learning_rate": 1.9904392459272326e-05,
"loss": 1.3871,
"step": 12970
},
{
"epoch": 0.05365532037145355,
"grad_norm": 3.226290735311431,
"learning_rate": 1.9904211416899322e-05,
"loss": 1.4122,
"step": 12980
},
{
"epoch": 0.053696657290075625,
"grad_norm": 3.566091958099414,
"learning_rate": 1.990403020410247e-05,
"loss": 1.4075,
"step": 12990
},
{
"epoch": 0.0537379942086977,
"grad_norm": 3.4558175186897513,
"learning_rate": 1.990384882088488e-05,
"loss": 1.4553,
"step": 13000
},
{
"epoch": 0.05377933112731978,
"grad_norm": 3.238909449520725,
"learning_rate": 1.9903667267249683e-05,
"loss": 1.3791,
"step": 13010
},
{
"epoch": 0.05382066804594185,
"grad_norm": 3.517722296765338,
"learning_rate": 1.9903485543199995e-05,
"loss": 1.3283,
"step": 13020
},
{
"epoch": 0.05386200496456393,
"grad_norm": 3.42397575432932,
"learning_rate": 1.9903303648738954e-05,
"loss": 1.3335,
"step": 13030
},
{
"epoch": 0.053903341883186004,
"grad_norm": 3.350334059229468,
"learning_rate": 1.990312158386968e-05,
"loss": 1.3806,
"step": 13040
},
{
"epoch": 0.053944678801808074,
"grad_norm": 3.051548296138219,
"learning_rate": 1.9902939348595307e-05,
"loss": 1.3885,
"step": 13050
},
{
"epoch": 0.05398601572043015,
"grad_norm": 3.276091159978694,
"learning_rate": 1.9902756942918976e-05,
"loss": 1.359,
"step": 13060
},
{
"epoch": 0.05402735263905223,
"grad_norm": 3.5296387125760185,
"learning_rate": 1.9902574366843824e-05,
"loss": 1.3625,
"step": 13070
},
{
"epoch": 0.054068689557674306,
"grad_norm": 3.567948297220875,
"learning_rate": 1.990239162037299e-05,
"loss": 1.351,
"step": 13080
},
{
"epoch": 0.054110026476296376,
"grad_norm": 3.1640186718240266,
"learning_rate": 1.9902208703509617e-05,
"loss": 1.3458,
"step": 13090
},
{
"epoch": 0.05415136339491845,
"grad_norm": 3.9025546167384495,
"learning_rate": 1.9902025616256854e-05,
"loss": 1.3588,
"step": 13100
},
{
"epoch": 0.05419270031354053,
"grad_norm": 3.501658240766089,
"learning_rate": 1.9901842358617854e-05,
"loss": 1.3624,
"step": 13110
},
{
"epoch": 0.0542340372321626,
"grad_norm": 3.923570308465845,
"learning_rate": 1.9901658930595774e-05,
"loss": 1.3294,
"step": 13120
},
{
"epoch": 0.05427537415078468,
"grad_norm": 3.0232913372852406,
"learning_rate": 1.990147533219376e-05,
"loss": 1.3855,
"step": 13130
},
{
"epoch": 0.054316711069406755,
"grad_norm": 3.670482250458697,
"learning_rate": 1.9901291563414977e-05,
"loss": 1.3337,
"step": 13140
},
{
"epoch": 0.054358047988028825,
"grad_norm": 3.790854501668152,
"learning_rate": 1.990110762426259e-05,
"loss": 1.366,
"step": 13150
},
{
"epoch": 0.0543993849066509,
"grad_norm": 3.1103384131591256,
"learning_rate": 1.9900923514739758e-05,
"loss": 1.3574,
"step": 13160
},
{
"epoch": 0.05444072182527298,
"grad_norm": 3.2207958459794845,
"learning_rate": 1.990073923484965e-05,
"loss": 1.3675,
"step": 13170
},
{
"epoch": 0.05448205874389506,
"grad_norm": 3.2588291421463023,
"learning_rate": 1.990055478459544e-05,
"loss": 1.3313,
"step": 13180
},
{
"epoch": 0.05452339566251713,
"grad_norm": 2.9426904447180506,
"learning_rate": 1.99003701639803e-05,
"loss": 1.3995,
"step": 13190
},
{
"epoch": 0.054564732581139204,
"grad_norm": 3.892827987664763,
"learning_rate": 1.990018537300741e-05,
"loss": 1.4035,
"step": 13200
},
{
"epoch": 0.05460606949976128,
"grad_norm": 3.765962575470102,
"learning_rate": 1.9900000411679946e-05,
"loss": 1.3823,
"step": 13210
},
{
"epoch": 0.05464740641838335,
"grad_norm": 3.031044142550962,
"learning_rate": 1.9899815280001093e-05,
"loss": 1.3907,
"step": 13220
},
{
"epoch": 0.05468874333700543,
"grad_norm": 3.401074997651561,
"learning_rate": 1.9899629977974033e-05,
"loss": 1.3724,
"step": 13230
},
{
"epoch": 0.054730080255627506,
"grad_norm": 3.4363592487014367,
"learning_rate": 1.9899444505601957e-05,
"loss": 1.4044,
"step": 13240
},
{
"epoch": 0.05477141717424958,
"grad_norm": 3.4008170147404924,
"learning_rate": 1.9899258862888055e-05,
"loss": 1.4329,
"step": 13250
},
{
"epoch": 0.05481275409287165,
"grad_norm": 3.3535448510349086,
"learning_rate": 1.9899073049835526e-05,
"loss": 1.3803,
"step": 13260
},
{
"epoch": 0.05485409101149373,
"grad_norm": 3.6491506303085677,
"learning_rate": 1.9898887066447564e-05,
"loss": 1.4061,
"step": 13270
},
{
"epoch": 0.05489542793011581,
"grad_norm": 3.2649595342568754,
"learning_rate": 1.9898700912727365e-05,
"loss": 1.3548,
"step": 13280
},
{
"epoch": 0.05493676484873788,
"grad_norm": 3.260685808424658,
"learning_rate": 1.9898514588678138e-05,
"loss": 1.3798,
"step": 13290
},
{
"epoch": 0.054978101767359955,
"grad_norm": 3.3068054059856964,
"learning_rate": 1.989832809430309e-05,
"loss": 1.3873,
"step": 13300
},
{
"epoch": 0.05501943868598203,
"grad_norm": 3.3289477651913844,
"learning_rate": 1.9898141429605428e-05,
"loss": 1.42,
"step": 13310
},
{
"epoch": 0.05506077560460411,
"grad_norm": 3.899358403289862,
"learning_rate": 1.9897954594588366e-05,
"loss": 1.3612,
"step": 13320
},
{
"epoch": 0.05510211252322618,
"grad_norm": 3.4534257185508768,
"learning_rate": 1.989776758925511e-05,
"loss": 1.4139,
"step": 13330
},
{
"epoch": 0.05514344944184826,
"grad_norm": 3.0896933369894555,
"learning_rate": 1.9897580413608888e-05,
"loss": 1.3455,
"step": 13340
},
{
"epoch": 0.055184786360470334,
"grad_norm": 3.6880673723268895,
"learning_rate": 1.9897393067652916e-05,
"loss": 1.3553,
"step": 13350
},
{
"epoch": 0.055226123279092404,
"grad_norm": 4.959844713915171,
"learning_rate": 1.989720555139042e-05,
"loss": 1.3744,
"step": 13360
},
{
"epoch": 0.05526746019771448,
"grad_norm": 3.7512130358105535,
"learning_rate": 1.9897017864824623e-05,
"loss": 1.3967,
"step": 13370
},
{
"epoch": 0.05530879711633656,
"grad_norm": 3.438280531829743,
"learning_rate": 1.989683000795876e-05,
"loss": 1.3199,
"step": 13380
},
{
"epoch": 0.055350134034958635,
"grad_norm": 3.191658504407269,
"learning_rate": 1.989664198079606e-05,
"loss": 1.3843,
"step": 13390
},
{
"epoch": 0.055391470953580706,
"grad_norm": 3.246301259794481,
"learning_rate": 1.989645378333976e-05,
"loss": 1.3823,
"step": 13400
},
{
"epoch": 0.05543280787220278,
"grad_norm": 3.209808944542668,
"learning_rate": 1.9896265415593096e-05,
"loss": 1.4023,
"step": 13410
},
{
"epoch": 0.05547414479082486,
"grad_norm": 3.8876392074458055,
"learning_rate": 1.989607687755931e-05,
"loss": 1.4021,
"step": 13420
},
{
"epoch": 0.05551548170944693,
"grad_norm": 4.6622959130132635,
"learning_rate": 1.9895888169241643e-05,
"loss": 1.3941,
"step": 13430
},
{
"epoch": 0.05555681862806901,
"grad_norm": 3.2829449802312243,
"learning_rate": 1.989569929064335e-05,
"loss": 1.3914,
"step": 13440
},
{
"epoch": 0.055598155546691085,
"grad_norm": 2.975464909103463,
"learning_rate": 1.989551024176768e-05,
"loss": 1.3393,
"step": 13450
},
{
"epoch": 0.055639492465313155,
"grad_norm": 3.3616145226127374,
"learning_rate": 1.9895321022617877e-05,
"loss": 1.3691,
"step": 13460
},
{
"epoch": 0.05568082938393523,
"grad_norm": 3.551441103147202,
"learning_rate": 1.9895131633197206e-05,
"loss": 1.3748,
"step": 13470
},
{
"epoch": 0.05572216630255731,
"grad_norm": 3.1368088044777838,
"learning_rate": 1.9894942073508924e-05,
"loss": 1.3341,
"step": 13480
},
{
"epoch": 0.055763503221179386,
"grad_norm": 2.8747175172948722,
"learning_rate": 1.989475234355629e-05,
"loss": 1.3951,
"step": 13490
},
{
"epoch": 0.055804840139801457,
"grad_norm": 3.0419841938845975,
"learning_rate": 1.989456244334257e-05,
"loss": 1.3325,
"step": 13500
},
{
"epoch": 0.055846177058423534,
"grad_norm": 3.6672894427510947,
"learning_rate": 1.9894372372871036e-05,
"loss": 1.3847,
"step": 13510
},
{
"epoch": 0.05588751397704561,
"grad_norm": 3.3319466434724325,
"learning_rate": 1.989418213214495e-05,
"loss": 1.4007,
"step": 13520
},
{
"epoch": 0.05592885089566768,
"grad_norm": 2.756361075189523,
"learning_rate": 1.9893991721167593e-05,
"loss": 1.3962,
"step": 13530
},
{
"epoch": 0.05597018781428976,
"grad_norm": 3.2588019424168384,
"learning_rate": 1.989380113994224e-05,
"loss": 1.409,
"step": 13540
},
{
"epoch": 0.056011524732911835,
"grad_norm": 3.992120970935661,
"learning_rate": 1.9893610388472162e-05,
"loss": 1.3642,
"step": 13550
},
{
"epoch": 0.05605286165153391,
"grad_norm": 3.0454737775385885,
"learning_rate": 1.9893419466760653e-05,
"loss": 1.3696,
"step": 13560
},
{
"epoch": 0.05609419857015598,
"grad_norm": 3.1960156109377507,
"learning_rate": 1.9893228374810993e-05,
"loss": 1.3611,
"step": 13570
},
{
"epoch": 0.05613553548877806,
"grad_norm": 3.3282542329613496,
"learning_rate": 1.989303711262647e-05,
"loss": 1.3541,
"step": 13580
},
{
"epoch": 0.05617687240740014,
"grad_norm": 4.271766406501802,
"learning_rate": 1.9892845680210374e-05,
"loss": 1.3033,
"step": 13590
},
{
"epoch": 0.05621820932602221,
"grad_norm": 3.4542215892482964,
"learning_rate": 1.9892654077566003e-05,
"loss": 1.3853,
"step": 13600
},
{
"epoch": 0.056259546244644285,
"grad_norm": 5.10198450683926,
"learning_rate": 1.9892462304696653e-05,
"loss": 1.3758,
"step": 13610
},
{
"epoch": 0.05630088316326636,
"grad_norm": 4.67424032198832,
"learning_rate": 1.989227036160562e-05,
"loss": 1.3395,
"step": 13620
},
{
"epoch": 0.05634222008188844,
"grad_norm": 3.574141696384299,
"learning_rate": 1.989207824829621e-05,
"loss": 1.3953,
"step": 13630
},
{
"epoch": 0.05638355700051051,
"grad_norm": 3.4219942324444244,
"learning_rate": 1.989188596477173e-05,
"loss": 1.3493,
"step": 13640
},
{
"epoch": 0.056424893919132586,
"grad_norm": 3.135032092895971,
"learning_rate": 1.9891693511035484e-05,
"loss": 1.4203,
"step": 13650
},
{
"epoch": 0.05646623083775466,
"grad_norm": 3.0856331845063387,
"learning_rate": 1.989150088709079e-05,
"loss": 1.2854,
"step": 13660
},
{
"epoch": 0.056507567756376734,
"grad_norm": 3.9267068067232,
"learning_rate": 1.9891308092940953e-05,
"loss": 1.3701,
"step": 13670
},
{
"epoch": 0.05654890467499881,
"grad_norm": 3.8306681224418395,
"learning_rate": 1.98911151285893e-05,
"loss": 1.4076,
"step": 13680
},
{
"epoch": 0.05659024159362089,
"grad_norm": 3.306837169963007,
"learning_rate": 1.9890921994039148e-05,
"loss": 1.3873,
"step": 13690
},
{
"epoch": 0.056631578512242965,
"grad_norm": 3.2434462335337297,
"learning_rate": 1.989072868929382e-05,
"loss": 1.3531,
"step": 13700
},
{
"epoch": 0.056672915430865035,
"grad_norm": 3.3740272090711856,
"learning_rate": 1.989053521435664e-05,
"loss": 1.3772,
"step": 13710
},
{
"epoch": 0.05671425234948711,
"grad_norm": 3.141984063033404,
"learning_rate": 1.989034156923094e-05,
"loss": 1.3983,
"step": 13720
},
{
"epoch": 0.05675558926810919,
"grad_norm": 2.9680517660305847,
"learning_rate": 1.989014775392005e-05,
"loss": 1.3651,
"step": 13730
},
{
"epoch": 0.05679692618673126,
"grad_norm": 3.418890084499366,
"learning_rate": 1.9889953768427313e-05,
"loss": 1.4157,
"step": 13740
},
{
"epoch": 0.05683826310535334,
"grad_norm": 3.7254805946590706,
"learning_rate": 1.9889759612756053e-05,
"loss": 1.3979,
"step": 13750
},
{
"epoch": 0.056879600023975414,
"grad_norm": 3.617034367391942,
"learning_rate": 1.9889565286909623e-05,
"loss": 1.3549,
"step": 13760
},
{
"epoch": 0.056920936942597485,
"grad_norm": 3.8592922160592646,
"learning_rate": 1.9889370790891364e-05,
"loss": 1.4008,
"step": 13770
},
{
"epoch": 0.05696227386121956,
"grad_norm": 3.510616141867297,
"learning_rate": 1.9889176124704616e-05,
"loss": 1.4071,
"step": 13780
},
{
"epoch": 0.05700361077984164,
"grad_norm": 3.5434621547794105,
"learning_rate": 1.9888981288352736e-05,
"loss": 1.3782,
"step": 13790
},
{
"epoch": 0.057044947698463716,
"grad_norm": 3.0056956686627117,
"learning_rate": 1.988878628183907e-05,
"loss": 1.352,
"step": 13800
},
{
"epoch": 0.057086284617085786,
"grad_norm": 2.876862066794774,
"learning_rate": 1.9888591105166984e-05,
"loss": 1.3451,
"step": 13810
},
{
"epoch": 0.05712762153570786,
"grad_norm": 3.6994718345443776,
"learning_rate": 1.9888395758339823e-05,
"loss": 1.3711,
"step": 13820
},
{
"epoch": 0.05716895845432994,
"grad_norm": 3.7019952550478052,
"learning_rate": 1.988820024136096e-05,
"loss": 1.3542,
"step": 13830
},
{
"epoch": 0.05721029537295201,
"grad_norm": 3.149511937310367,
"learning_rate": 1.9888004554233757e-05,
"loss": 1.3498,
"step": 13840
},
{
"epoch": 0.05725163229157409,
"grad_norm": 3.3687656057902298,
"learning_rate": 1.9887808696961574e-05,
"loss": 1.3759,
"step": 13850
},
{
"epoch": 0.057292969210196165,
"grad_norm": 3.286956265957372,
"learning_rate": 1.988761266954779e-05,
"loss": 1.3416,
"step": 13860
},
{
"epoch": 0.05733430612881824,
"grad_norm": 3.8546233434442025,
"learning_rate": 1.988741647199577e-05,
"loss": 1.3601,
"step": 13870
},
{
"epoch": 0.05737564304744031,
"grad_norm": 4.043137550642432,
"learning_rate": 1.98872201043089e-05,
"loss": 1.4019,
"step": 13880
},
{
"epoch": 0.05741697996606239,
"grad_norm": 4.6033780589634805,
"learning_rate": 1.988702356649055e-05,
"loss": 1.406,
"step": 13890
},
{
"epoch": 0.05745831688468447,
"grad_norm": 3.4533366026357135,
"learning_rate": 1.9886826858544103e-05,
"loss": 1.3579,
"step": 13900
},
{
"epoch": 0.05749965380330654,
"grad_norm": 4.2918223423561725,
"learning_rate": 1.9886629980472945e-05,
"loss": 1.3238,
"step": 13910
},
{
"epoch": 0.057540990721928614,
"grad_norm": 3.1786603013459667,
"learning_rate": 1.988643293228047e-05,
"loss": 1.3815,
"step": 13920
},
{
"epoch": 0.05758232764055069,
"grad_norm": 4.187787674938507,
"learning_rate": 1.988623571397006e-05,
"loss": 1.3129,
"step": 13930
},
{
"epoch": 0.05762366455917277,
"grad_norm": 3.2066247160025956,
"learning_rate": 1.9886038325545112e-05,
"loss": 1.3604,
"step": 13940
},
{
"epoch": 0.05766500147779484,
"grad_norm": 4.137189558470061,
"learning_rate": 1.9885840767009023e-05,
"loss": 1.3683,
"step": 13950
},
{
"epoch": 0.057706338396416916,
"grad_norm": 3.21825868230931,
"learning_rate": 1.988564303836519e-05,
"loss": 1.3521,
"step": 13960
},
{
"epoch": 0.05774767531503899,
"grad_norm": 3.5331562264396097,
"learning_rate": 1.9885445139617018e-05,
"loss": 1.4079,
"step": 13970
},
{
"epoch": 0.05778901223366106,
"grad_norm": 3.1970430005062607,
"learning_rate": 1.9885247070767915e-05,
"loss": 1.3688,
"step": 13980
},
{
"epoch": 0.05783034915228314,
"grad_norm": 4.27476021372676,
"learning_rate": 1.988504883182128e-05,
"loss": 1.364,
"step": 13990
},
{
"epoch": 0.05787168607090522,
"grad_norm": 3.2156024105612278,
"learning_rate": 1.9884850422780534e-05,
"loss": 1.3814,
"step": 14000
},
{
"epoch": 0.057913022989527295,
"grad_norm": 3.6998253526421565,
"learning_rate": 1.9884651843649083e-05,
"loss": 1.3698,
"step": 14010
},
{
"epoch": 0.057954359908149365,
"grad_norm": 4.503662250473274,
"learning_rate": 1.988445309443035e-05,
"loss": 1.3564,
"step": 14020
},
{
"epoch": 0.05799569682677144,
"grad_norm": 3.762384040491392,
"learning_rate": 1.9884254175127754e-05,
"loss": 1.4119,
"step": 14030
},
{
"epoch": 0.05803703374539352,
"grad_norm": 3.285301388684364,
"learning_rate": 1.9884055085744713e-05,
"loss": 1.3501,
"step": 14040
},
{
"epoch": 0.05807837066401559,
"grad_norm": 4.336832986530797,
"learning_rate": 1.9883855826284656e-05,
"loss": 1.3662,
"step": 14050
},
{
"epoch": 0.05811970758263767,
"grad_norm": 3.4574445488885734,
"learning_rate": 1.9883656396751016e-05,
"loss": 1.3127,
"step": 14060
},
{
"epoch": 0.058161044501259744,
"grad_norm": 3.4098914920099883,
"learning_rate": 1.988345679714722e-05,
"loss": 1.391,
"step": 14070
},
{
"epoch": 0.058202381419881814,
"grad_norm": 3.6614081585424603,
"learning_rate": 1.98832570274767e-05,
"loss": 1.3659,
"step": 14080
},
{
"epoch": 0.05824371833850389,
"grad_norm": 3.276861233677139,
"learning_rate": 1.98830570877429e-05,
"loss": 1.3559,
"step": 14090
},
{
"epoch": 0.05828505525712597,
"grad_norm": 3.4536947240708997,
"learning_rate": 1.9882856977949257e-05,
"loss": 1.3779,
"step": 14100
},
{
"epoch": 0.058326392175748046,
"grad_norm": 3.242988394736396,
"learning_rate": 1.9882656698099213e-05,
"loss": 1.3353,
"step": 14110
},
{
"epoch": 0.058367729094370116,
"grad_norm": 3.033285353432935,
"learning_rate": 1.9882456248196216e-05,
"loss": 1.3831,
"step": 14120
},
{
"epoch": 0.05840906601299219,
"grad_norm": 2.9840093106321173,
"learning_rate": 1.9882255628243715e-05,
"loss": 1.399,
"step": 14130
},
{
"epoch": 0.05845040293161427,
"grad_norm": 3.424871961043564,
"learning_rate": 1.9882054838245158e-05,
"loss": 1.3774,
"step": 14140
},
{
"epoch": 0.05849173985023634,
"grad_norm": 4.206811562034524,
"learning_rate": 1.988185387820401e-05,
"loss": 1.3768,
"step": 14150
},
{
"epoch": 0.05853307676885842,
"grad_norm": 3.167196829826696,
"learning_rate": 1.9881652748123723e-05,
"loss": 1.3118,
"step": 14160
},
{
"epoch": 0.058574413687480495,
"grad_norm": 3.2647863004270583,
"learning_rate": 1.9881451448007752e-05,
"loss": 1.359,
"step": 14170
},
{
"epoch": 0.05861575060610257,
"grad_norm": 3.2249069204445506,
"learning_rate": 1.988124997785957e-05,
"loss": 1.3594,
"step": 14180
},
{
"epoch": 0.05865708752472464,
"grad_norm": 3.125833926204158,
"learning_rate": 1.9881048337682644e-05,
"loss": 1.3729,
"step": 14190
},
{
"epoch": 0.05869842444334672,
"grad_norm": 3.124857727130482,
"learning_rate": 1.9880846527480434e-05,
"loss": 1.3968,
"step": 14200
},
{
"epoch": 0.0587397613619688,
"grad_norm": 3.5324983045866416,
"learning_rate": 1.988064454725642e-05,
"loss": 1.3744,
"step": 14210
},
{
"epoch": 0.05878109828059087,
"grad_norm": 3.534173243273415,
"learning_rate": 1.9880442397014082e-05,
"loss": 1.3858,
"step": 14220
},
{
"epoch": 0.058822435199212944,
"grad_norm": 4.522930564227188,
"learning_rate": 1.9880240076756885e-05,
"loss": 1.3365,
"step": 14230
},
{
"epoch": 0.05886377211783502,
"grad_norm": 2.911847199383502,
"learning_rate": 1.9880037586488324e-05,
"loss": 1.3629,
"step": 14240
},
{
"epoch": 0.0589051090364571,
"grad_norm": 3.467653251925633,
"learning_rate": 1.9879834926211875e-05,
"loss": 1.3839,
"step": 14250
},
{
"epoch": 0.05894644595507917,
"grad_norm": 3.7166470032659618,
"learning_rate": 1.9879632095931024e-05,
"loss": 1.3358,
"step": 14260
},
{
"epoch": 0.058987782873701246,
"grad_norm": 3.7077022813203193,
"learning_rate": 1.987942909564927e-05,
"loss": 1.3474,
"step": 14270
},
{
"epoch": 0.05902911979232332,
"grad_norm": 3.1733151465540916,
"learning_rate": 1.9879225925370094e-05,
"loss": 1.3881,
"step": 14280
},
{
"epoch": 0.05907045671094539,
"grad_norm": 3.0349885556128378,
"learning_rate": 1.9879022585097005e-05,
"loss": 1.3686,
"step": 14290
},
{
"epoch": 0.05911179362956747,
"grad_norm": 3.2620109103700363,
"learning_rate": 1.9878819074833493e-05,
"loss": 1.3588,
"step": 14300
},
{
"epoch": 0.05915313054818955,
"grad_norm": 3.7133352574339873,
"learning_rate": 1.9878615394583062e-05,
"loss": 1.34,
"step": 14310
},
{
"epoch": 0.059194467466811625,
"grad_norm": 3.1624289931149114,
"learning_rate": 1.987841154434922e-05,
"loss": 1.3334,
"step": 14320
},
{
"epoch": 0.059235804385433695,
"grad_norm": 3.7047396553657643,
"learning_rate": 1.9878207524135468e-05,
"loss": 1.3375,
"step": 14330
},
{
"epoch": 0.05927714130405577,
"grad_norm": 3.3119519535402,
"learning_rate": 1.9878003333945325e-05,
"loss": 1.3537,
"step": 14340
},
{
"epoch": 0.05931847822267785,
"grad_norm": 3.3812187643072105,
"learning_rate": 1.98777989737823e-05,
"loss": 1.3374,
"step": 14350
},
{
"epoch": 0.05935981514129992,
"grad_norm": 3.299892572730851,
"learning_rate": 1.9877594443649902e-05,
"loss": 1.3704,
"step": 14360
},
{
"epoch": 0.059401152059922,
"grad_norm": 4.081245562156265,
"learning_rate": 1.9877389743551668e-05,
"loss": 1.3498,
"step": 14370
},
{
"epoch": 0.059442488978544074,
"grad_norm": 3.350158600172311,
"learning_rate": 1.9877184873491102e-05,
"loss": 1.3449,
"step": 14380
},
{
"epoch": 0.05948382589716615,
"grad_norm": 3.714541324538004,
"learning_rate": 1.9876979833471742e-05,
"loss": 1.3874,
"step": 14390
},
{
"epoch": 0.05952516281578822,
"grad_norm": 3.274100022702722,
"learning_rate": 1.9876774623497112e-05,
"loss": 1.3582,
"step": 14400
},
{
"epoch": 0.0595664997344103,
"grad_norm": 2.9329187414523425,
"learning_rate": 1.9876569243570742e-05,
"loss": 1.3901,
"step": 14410
},
{
"epoch": 0.059607836653032376,
"grad_norm": 3.73192299633757,
"learning_rate": 1.9876363693696166e-05,
"loss": 1.3898,
"step": 14420
},
{
"epoch": 0.059649173571654446,
"grad_norm": 3.2578219560804196,
"learning_rate": 1.987615797387692e-05,
"loss": 1.371,
"step": 14430
},
{
"epoch": 0.05969051049027652,
"grad_norm": 3.2376808673712807,
"learning_rate": 1.9875952084116548e-05,
"loss": 1.336,
"step": 14440
},
{
"epoch": 0.0597318474088986,
"grad_norm": 3.5969021652399253,
"learning_rate": 1.987574602441859e-05,
"loss": 1.3862,
"step": 14450
},
{
"epoch": 0.05977318432752067,
"grad_norm": 3.1882930317103844,
"learning_rate": 1.9875539794786593e-05,
"loss": 1.3734,
"step": 14460
},
{
"epoch": 0.05981452124614275,
"grad_norm": 3.147605709223492,
"learning_rate": 1.9875333395224102e-05,
"loss": 1.3739,
"step": 14470
},
{
"epoch": 0.059855858164764825,
"grad_norm": 3.4276761969558645,
"learning_rate": 1.9875126825734673e-05,
"loss": 1.3301,
"step": 14480
},
{
"epoch": 0.0598971950833869,
"grad_norm": 3.4226682382169997,
"learning_rate": 1.987492008632186e-05,
"loss": 1.3747,
"step": 14490
},
{
"epoch": 0.05993853200200897,
"grad_norm": 3.471596558825063,
"learning_rate": 1.987471317698922e-05,
"loss": 1.3349,
"step": 14500
},
{
"epoch": 0.05997986892063105,
"grad_norm": 3.676413482025792,
"learning_rate": 1.9874506097740308e-05,
"loss": 1.3963,
"step": 14510
},
{
"epoch": 0.060021205839253126,
"grad_norm": 3.2543359739910267,
"learning_rate": 1.9874298848578696e-05,
"loss": 1.3334,
"step": 14520
},
{
"epoch": 0.0600625427578752,
"grad_norm": 3.2372261281319377,
"learning_rate": 1.9874091429507943e-05,
"loss": 1.3367,
"step": 14530
},
{
"epoch": 0.060103879676497274,
"grad_norm": 3.3746909228055397,
"learning_rate": 1.987388384053162e-05,
"loss": 1.376,
"step": 14540
},
{
"epoch": 0.06014521659511935,
"grad_norm": 2.921092227637486,
"learning_rate": 1.9873676081653302e-05,
"loss": 1.3715,
"step": 14550
},
{
"epoch": 0.06018655351374143,
"grad_norm": 2.9872735994422417,
"learning_rate": 1.9873468152876563e-05,
"loss": 1.3457,
"step": 14560
},
{
"epoch": 0.0602278904323635,
"grad_norm": 3.713230272104003,
"learning_rate": 1.9873260054204978e-05,
"loss": 1.328,
"step": 14570
},
{
"epoch": 0.060269227350985576,
"grad_norm": 3.3242217719152496,
"learning_rate": 1.9873051785642134e-05,
"loss": 1.3433,
"step": 14580
},
{
"epoch": 0.06031056426960765,
"grad_norm": 3.3594706223759143,
"learning_rate": 1.9872843347191607e-05,
"loss": 1.4027,
"step": 14590
},
{
"epoch": 0.06035190118822972,
"grad_norm": 3.1573056170670846,
"learning_rate": 1.9872634738856987e-05,
"loss": 1.3798,
"step": 14600
},
{
"epoch": 0.0603932381068518,
"grad_norm": 3.076626392806199,
"learning_rate": 1.9872425960641863e-05,
"loss": 1.3581,
"step": 14610
},
{
"epoch": 0.06043457502547388,
"grad_norm": 3.468364476739333,
"learning_rate": 1.987221701254983e-05,
"loss": 1.3675,
"step": 14620
},
{
"epoch": 0.060475911944095954,
"grad_norm": 4.079855852909671,
"learning_rate": 1.987200789458448e-05,
"loss": 1.3197,
"step": 14630
},
{
"epoch": 0.060517248862718025,
"grad_norm": 4.960368430326063,
"learning_rate": 1.9871798606749415e-05,
"loss": 1.4018,
"step": 14640
},
{
"epoch": 0.0605585857813401,
"grad_norm": 3.1263141804205272,
"learning_rate": 1.9871589149048232e-05,
"loss": 1.4034,
"step": 14650
},
{
"epoch": 0.06059992269996218,
"grad_norm": 3.0030045708337876,
"learning_rate": 1.9871379521484538e-05,
"loss": 1.314,
"step": 14660
},
{
"epoch": 0.06064125961858425,
"grad_norm": 4.171309905380893,
"learning_rate": 1.987116972406194e-05,
"loss": 1.3454,
"step": 14670
},
{
"epoch": 0.060682596537206326,
"grad_norm": 3.678722232531344,
"learning_rate": 1.9870959756784044e-05,
"loss": 1.3644,
"step": 14680
},
{
"epoch": 0.060723933455828404,
"grad_norm": 3.5114052948471164,
"learning_rate": 1.987074961965447e-05,
"loss": 1.3446,
"step": 14690
},
{
"epoch": 0.06076527037445048,
"grad_norm": 3.3802619453865637,
"learning_rate": 1.987053931267683e-05,
"loss": 1.3603,
"step": 14700
},
{
"epoch": 0.06080660729307255,
"grad_norm": 3.239888006176567,
"learning_rate": 1.9870328835854743e-05,
"loss": 1.3263,
"step": 14710
},
{
"epoch": 0.06084794421169463,
"grad_norm": 2.7923482855439716,
"learning_rate": 1.9870118189191833e-05,
"loss": 1.3532,
"step": 14720
},
{
"epoch": 0.060889281130316705,
"grad_norm": 3.672374432583175,
"learning_rate": 1.9869907372691715e-05,
"loss": 1.3749,
"step": 14730
},
{
"epoch": 0.060930618048938776,
"grad_norm": 3.1371007241226017,
"learning_rate": 1.9869696386358032e-05,
"loss": 1.3529,
"step": 14740
},
{
"epoch": 0.06097195496756085,
"grad_norm": 3.5064852286924837,
"learning_rate": 1.9869485230194403e-05,
"loss": 1.3664,
"step": 14750
},
{
"epoch": 0.06101329188618293,
"grad_norm": 3.9587666778347073,
"learning_rate": 1.9869273904204465e-05,
"loss": 1.3847,
"step": 14760
},
{
"epoch": 0.061054628804805,
"grad_norm": 3.2129303491061973,
"learning_rate": 1.9869062408391855e-05,
"loss": 1.3625,
"step": 14770
},
{
"epoch": 0.06109596572342708,
"grad_norm": 3.1860777109810465,
"learning_rate": 1.9868850742760212e-05,
"loss": 1.3062,
"step": 14780
},
{
"epoch": 0.061137302642049154,
"grad_norm": 3.47772048599133,
"learning_rate": 1.9868638907313174e-05,
"loss": 1.3487,
"step": 14790
},
{
"epoch": 0.06117863956067123,
"grad_norm": 3.323868913803053,
"learning_rate": 1.9868426902054394e-05,
"loss": 1.3304,
"step": 14800
},
{
"epoch": 0.0612199764792933,
"grad_norm": 3.4796197385612735,
"learning_rate": 1.9868214726987513e-05,
"loss": 1.3143,
"step": 14810
},
{
"epoch": 0.06126131339791538,
"grad_norm": 3.8413485871243402,
"learning_rate": 1.9868002382116186e-05,
"loss": 1.3451,
"step": 14820
},
{
"epoch": 0.061302650316537456,
"grad_norm": 3.3376163822500278,
"learning_rate": 1.9867789867444066e-05,
"loss": 1.3486,
"step": 14830
},
{
"epoch": 0.061343987235159526,
"grad_norm": 3.3554967862457543,
"learning_rate": 1.9867577182974807e-05,
"loss": 1.3447,
"step": 14840
},
{
"epoch": 0.061385324153781604,
"grad_norm": 3.0553156261560757,
"learning_rate": 1.9867364328712074e-05,
"loss": 1.3436,
"step": 14850
},
{
"epoch": 0.06142666107240368,
"grad_norm": 3.9386923791793027,
"learning_rate": 1.9867151304659527e-05,
"loss": 1.3719,
"step": 14860
},
{
"epoch": 0.06146799799102576,
"grad_norm": 3.915463974315291,
"learning_rate": 1.986693811082083e-05,
"loss": 1.328,
"step": 14870
},
{
"epoch": 0.06150933490964783,
"grad_norm": 2.800179512440139,
"learning_rate": 1.986672474719965e-05,
"loss": 1.322,
"step": 14880
},
{
"epoch": 0.061550671828269905,
"grad_norm": 3.3411218270323664,
"learning_rate": 1.9866511213799665e-05,
"loss": 1.3899,
"step": 14890
},
{
"epoch": 0.06159200874689198,
"grad_norm": 3.323705843953213,
"learning_rate": 1.9866297510624544e-05,
"loss": 1.3615,
"step": 14900
},
{
"epoch": 0.06163334566551405,
"grad_norm": 4.754040048447529,
"learning_rate": 1.9866083637677963e-05,
"loss": 1.3726,
"step": 14910
},
{
"epoch": 0.06167468258413613,
"grad_norm": 3.45837680678418,
"learning_rate": 1.9865869594963607e-05,
"loss": 1.3519,
"step": 14920
},
{
"epoch": 0.06171601950275821,
"grad_norm": 3.183067903396876,
"learning_rate": 1.986565538248516e-05,
"loss": 1.3584,
"step": 14930
},
{
"epoch": 0.061757356421380284,
"grad_norm": 3.8419649238240656,
"learning_rate": 1.98654410002463e-05,
"loss": 1.3698,
"step": 14940
},
{
"epoch": 0.061798693340002354,
"grad_norm": 3.3977765733638168,
"learning_rate": 1.9865226448250725e-05,
"loss": 1.3702,
"step": 14950
},
{
"epoch": 0.06184003025862443,
"grad_norm": 3.5010199408218305,
"learning_rate": 1.9865011726502118e-05,
"loss": 1.3515,
"step": 14960
},
{
"epoch": 0.06188136717724651,
"grad_norm": 3.521969897290798,
"learning_rate": 1.9864796835004184e-05,
"loss": 1.3562,
"step": 14970
},
{
"epoch": 0.06192270409586858,
"grad_norm": 3.198260891262558,
"learning_rate": 1.986458177376061e-05,
"loss": 1.3265,
"step": 14980
},
{
"epoch": 0.061964041014490656,
"grad_norm": 4.203288617287408,
"learning_rate": 1.9864366542775104e-05,
"loss": 1.3445,
"step": 14990
},
{
"epoch": 0.06200537793311273,
"grad_norm": 3.0231024395080204,
"learning_rate": 1.9864151142051367e-05,
"loss": 1.3437,
"step": 15000
},
{
"epoch": 0.06204671485173481,
"grad_norm": 3.375301699368925,
"learning_rate": 1.9863935571593104e-05,
"loss": 1.3587,
"step": 15010
},
{
"epoch": 0.06208805177035688,
"grad_norm": 3.3942212627441433,
"learning_rate": 1.986371983140403e-05,
"loss": 1.3574,
"step": 15020
},
{
"epoch": 0.06212938868897896,
"grad_norm": 2.8229502868870906,
"learning_rate": 1.986350392148785e-05,
"loss": 1.3252,
"step": 15030
},
{
"epoch": 0.062170725607601035,
"grad_norm": 3.531378883228978,
"learning_rate": 1.9863287841848283e-05,
"loss": 1.3284,
"step": 15040
},
{
"epoch": 0.062212062526223105,
"grad_norm": 3.4111136482320057,
"learning_rate": 1.986307159248905e-05,
"loss": 1.3503,
"step": 15050
},
{
"epoch": 0.06225339944484518,
"grad_norm": 3.1924697026460525,
"learning_rate": 1.9862855173413864e-05,
"loss": 1.3316,
"step": 15060
},
{
"epoch": 0.06229473636346726,
"grad_norm": 3.673294835187914,
"learning_rate": 1.9862638584626456e-05,
"loss": 1.378,
"step": 15070
},
{
"epoch": 0.06233607328208933,
"grad_norm": 2.944502490931273,
"learning_rate": 1.9862421826130548e-05,
"loss": 1.3505,
"step": 15080
},
{
"epoch": 0.06237741020071141,
"grad_norm": 3.166457773127537,
"learning_rate": 1.9862204897929875e-05,
"loss": 1.3274,
"step": 15090
},
{
"epoch": 0.062418747119333484,
"grad_norm": 3.539140440652841,
"learning_rate": 1.9861987800028167e-05,
"loss": 1.3373,
"step": 15100
},
{
"epoch": 0.06246008403795556,
"grad_norm": 4.111976580182342,
"learning_rate": 1.986177053242916e-05,
"loss": 1.3795,
"step": 15110
},
{
"epoch": 0.06250142095657764,
"grad_norm": 3.518730561598167,
"learning_rate": 1.986155309513659e-05,
"loss": 1.3284,
"step": 15120
},
{
"epoch": 0.06254275787519971,
"grad_norm": 3.391425624844218,
"learning_rate": 1.9861335488154206e-05,
"loss": 1.3587,
"step": 15130
},
{
"epoch": 0.06258409479382178,
"grad_norm": 3.4961442083301097,
"learning_rate": 1.9861117711485743e-05,
"loss": 1.399,
"step": 15140
},
{
"epoch": 0.06262543171244386,
"grad_norm": 2.7302506739470784,
"learning_rate": 1.9860899765134953e-05,
"loss": 1.3654,
"step": 15150
},
{
"epoch": 0.06266676863106593,
"grad_norm": 4.411982690984872,
"learning_rate": 1.9860681649105585e-05,
"loss": 1.3409,
"step": 15160
},
{
"epoch": 0.062708105549688,
"grad_norm": 3.671422586435062,
"learning_rate": 1.9860463363401393e-05,
"loss": 1.3629,
"step": 15170
},
{
"epoch": 0.06274944246831009,
"grad_norm": 3.599702261798689,
"learning_rate": 1.9860244908026133e-05,
"loss": 1.3464,
"step": 15180
},
{
"epoch": 0.06279077938693216,
"grad_norm": 3.6962151021925598,
"learning_rate": 1.9860026282983568e-05,
"loss": 1.362,
"step": 15190
},
{
"epoch": 0.06283211630555424,
"grad_norm": 3.5740776610173284,
"learning_rate": 1.9859807488277453e-05,
"loss": 1.3657,
"step": 15200
},
{
"epoch": 0.06287345322417631,
"grad_norm": 3.437476924912017,
"learning_rate": 1.9859588523911554e-05,
"loss": 1.3384,
"step": 15210
},
{
"epoch": 0.06291479014279838,
"grad_norm": 3.2699157716406373,
"learning_rate": 1.9859369389889642e-05,
"loss": 1.3658,
"step": 15220
},
{
"epoch": 0.06295612706142047,
"grad_norm": 3.123305029392182,
"learning_rate": 1.9859150086215487e-05,
"loss": 1.352,
"step": 15230
},
{
"epoch": 0.06299746398004254,
"grad_norm": 3.2830928690839354,
"learning_rate": 1.985893061289286e-05,
"loss": 1.3799,
"step": 15240
},
{
"epoch": 0.06303880089866461,
"grad_norm": 3.664715842390073,
"learning_rate": 1.9858710969925547e-05,
"loss": 1.3669,
"step": 15250
},
{
"epoch": 0.06308013781728669,
"grad_norm": 3.2013150011378952,
"learning_rate": 1.985849115731731e-05,
"loss": 1.3403,
"step": 15260
},
{
"epoch": 0.06312147473590876,
"grad_norm": 3.1107330520735643,
"learning_rate": 1.9858271175071946e-05,
"loss": 1.348,
"step": 15270
},
{
"epoch": 0.06316281165453083,
"grad_norm": 3.188091096811774,
"learning_rate": 1.9858051023193234e-05,
"loss": 1.3219,
"step": 15280
},
{
"epoch": 0.06320414857315292,
"grad_norm": 3.1313626953852705,
"learning_rate": 1.9857830701684967e-05,
"loss": 1.3622,
"step": 15290
},
{
"epoch": 0.06324548549177499,
"grad_norm": 3.1702519850910127,
"learning_rate": 1.985761021055093e-05,
"loss": 1.3546,
"step": 15300
},
{
"epoch": 0.06328682241039706,
"grad_norm": 3.252596544143132,
"learning_rate": 1.9857389549794917e-05,
"loss": 1.2552,
"step": 15310
},
{
"epoch": 0.06332815932901914,
"grad_norm": 3.0313555049861374,
"learning_rate": 1.985716871942073e-05,
"loss": 1.4033,
"step": 15320
},
{
"epoch": 0.06336949624764121,
"grad_norm": 3.4443767622891115,
"learning_rate": 1.985694771943217e-05,
"loss": 1.3893,
"step": 15330
},
{
"epoch": 0.06341083316626328,
"grad_norm": 3.616984395155513,
"learning_rate": 1.9856726549833034e-05,
"loss": 1.3499,
"step": 15340
},
{
"epoch": 0.06345217008488536,
"grad_norm": 3.50828559499885,
"learning_rate": 1.985650521062713e-05,
"loss": 1.3298,
"step": 15350
},
{
"epoch": 0.06349350700350744,
"grad_norm": 2.8467914617151244,
"learning_rate": 1.9856283701818268e-05,
"loss": 1.3307,
"step": 15360
},
{
"epoch": 0.06353484392212952,
"grad_norm": 4.069010272907543,
"learning_rate": 1.9856062023410257e-05,
"loss": 1.3341,
"step": 15370
},
{
"epoch": 0.06357618084075159,
"grad_norm": 3.554716298604244,
"learning_rate": 1.985584017540691e-05,
"loss": 1.3366,
"step": 15380
},
{
"epoch": 0.06361751775937366,
"grad_norm": 3.2437195929255123,
"learning_rate": 1.985561815781205e-05,
"loss": 1.3241,
"step": 15390
},
{
"epoch": 0.06365885467799574,
"grad_norm": 3.0814014462588495,
"learning_rate": 1.9855395970629497e-05,
"loss": 1.3086,
"step": 15400
},
{
"epoch": 0.06370019159661781,
"grad_norm": 2.845186766341529,
"learning_rate": 1.985517361386307e-05,
"loss": 1.3302,
"step": 15410
},
{
"epoch": 0.06374152851523988,
"grad_norm": 3.594783956612939,
"learning_rate": 1.9854951087516598e-05,
"loss": 1.3374,
"step": 15420
},
{
"epoch": 0.06378286543386197,
"grad_norm": 2.8854977362042815,
"learning_rate": 1.9854728391593904e-05,
"loss": 1.3326,
"step": 15430
},
{
"epoch": 0.06382420235248404,
"grad_norm": 3.5259935395974784,
"learning_rate": 1.985450552609883e-05,
"loss": 1.3081,
"step": 15440
},
{
"epoch": 0.06386553927110611,
"grad_norm": 3.245306593778846,
"learning_rate": 1.9854282491035203e-05,
"loss": 1.3746,
"step": 15450
},
{
"epoch": 0.06390687618972819,
"grad_norm": 4.071308549266746,
"learning_rate": 1.9854059286406866e-05,
"loss": 1.3783,
"step": 15460
},
{
"epoch": 0.06394821310835026,
"grad_norm": 3.247822958885961,
"learning_rate": 1.9853835912217657e-05,
"loss": 1.3411,
"step": 15470
},
{
"epoch": 0.06398955002697233,
"grad_norm": 3.35088921885468,
"learning_rate": 1.9853612368471416e-05,
"loss": 1.3769,
"step": 15480
},
{
"epoch": 0.06403088694559442,
"grad_norm": 3.311416405062516,
"learning_rate": 1.9853388655171998e-05,
"loss": 1.3546,
"step": 15490
},
{
"epoch": 0.06407222386421649,
"grad_norm": 3.423751956404864,
"learning_rate": 1.985316477232325e-05,
"loss": 1.3082,
"step": 15500
},
{
"epoch": 0.06411356078283857,
"grad_norm": 3.5994672319763072,
"learning_rate": 1.9852940719929017e-05,
"loss": 1.308,
"step": 15510
},
{
"epoch": 0.06415489770146064,
"grad_norm": 6.092109341719527,
"learning_rate": 1.9852716497993164e-05,
"loss": 1.3333,
"step": 15520
},
{
"epoch": 0.06419623462008271,
"grad_norm": 3.4870247799249574,
"learning_rate": 1.985249210651954e-05,
"loss": 1.3755,
"step": 15530
},
{
"epoch": 0.0642375715387048,
"grad_norm": 3.1557901036816687,
"learning_rate": 1.9852267545512016e-05,
"loss": 1.3237,
"step": 15540
},
{
"epoch": 0.06427890845732687,
"grad_norm": 2.9168599988216655,
"learning_rate": 1.9852042814974448e-05,
"loss": 1.3333,
"step": 15550
},
{
"epoch": 0.06432024537594894,
"grad_norm": 3.658350430782456,
"learning_rate": 1.9851817914910707e-05,
"loss": 1.3157,
"step": 15560
},
{
"epoch": 0.06436158229457102,
"grad_norm": 2.98123289160538,
"learning_rate": 1.9851592845324664e-05,
"loss": 1.3461,
"step": 15570
},
{
"epoch": 0.06440291921319309,
"grad_norm": 3.2851248740018133,
"learning_rate": 1.9851367606220187e-05,
"loss": 1.3592,
"step": 15580
},
{
"epoch": 0.06444425613181516,
"grad_norm": 2.874060228764119,
"learning_rate": 1.9851142197601157e-05,
"loss": 1.3179,
"step": 15590
},
{
"epoch": 0.06448559305043725,
"grad_norm": 3.3595179685654286,
"learning_rate": 1.985091661947145e-05,
"loss": 1.358,
"step": 15600
},
{
"epoch": 0.06452692996905932,
"grad_norm": 4.343402000280312,
"learning_rate": 1.9850690871834945e-05,
"loss": 1.3387,
"step": 15610
},
{
"epoch": 0.06456826688768139,
"grad_norm": 4.4028919030557345,
"learning_rate": 1.985046495469553e-05,
"loss": 1.3405,
"step": 15620
},
{
"epoch": 0.06460960380630347,
"grad_norm": 3.3683021821213077,
"learning_rate": 1.9850238868057097e-05,
"loss": 1.3164,
"step": 15630
},
{
"epoch": 0.06465094072492554,
"grad_norm": 3.223018835339703,
"learning_rate": 1.9850012611923527e-05,
"loss": 1.2937,
"step": 15640
},
{
"epoch": 0.06469227764354761,
"grad_norm": 3.6479375571543584,
"learning_rate": 1.984978618629872e-05,
"loss": 1.3703,
"step": 15650
},
{
"epoch": 0.0647336145621697,
"grad_norm": 3.0694724692776107,
"learning_rate": 1.9849559591186566e-05,
"loss": 1.3239,
"step": 15660
},
{
"epoch": 0.06477495148079176,
"grad_norm": 3.595788633474915,
"learning_rate": 1.984933282659097e-05,
"loss": 1.2858,
"step": 15670
},
{
"epoch": 0.06481628839941385,
"grad_norm": 2.996464746206387,
"learning_rate": 1.984910589251583e-05,
"loss": 1.3677,
"step": 15680
},
{
"epoch": 0.06485762531803592,
"grad_norm": 2.978010213666374,
"learning_rate": 1.9848878788965053e-05,
"loss": 1.3612,
"step": 15690
},
{
"epoch": 0.06489896223665799,
"grad_norm": 4.107355827588679,
"learning_rate": 1.9848651515942545e-05,
"loss": 1.3473,
"step": 15700
},
{
"epoch": 0.06494029915528007,
"grad_norm": 3.405862899284087,
"learning_rate": 1.984842407345222e-05,
"loss": 1.329,
"step": 15710
},
{
"epoch": 0.06498163607390214,
"grad_norm": 3.56806926046013,
"learning_rate": 1.984819646149799e-05,
"loss": 1.3547,
"step": 15720
},
{
"epoch": 0.06502297299252421,
"grad_norm": 2.9244416633025234,
"learning_rate": 1.984796868008377e-05,
"loss": 1.3451,
"step": 15730
},
{
"epoch": 0.0650643099111463,
"grad_norm": 3.6605360363216133,
"learning_rate": 1.984774072921348e-05,
"loss": 1.3121,
"step": 15740
},
{
"epoch": 0.06510564682976837,
"grad_norm": 3.857082230167186,
"learning_rate": 1.9847512608891046e-05,
"loss": 1.3546,
"step": 15750
},
{
"epoch": 0.06514698374839044,
"grad_norm": 3.20861184076794,
"learning_rate": 1.9847284319120386e-05,
"loss": 1.3384,
"step": 15760
},
{
"epoch": 0.06518832066701252,
"grad_norm": 4.374471400346774,
"learning_rate": 1.9847055859905434e-05,
"loss": 1.3603,
"step": 15770
},
{
"epoch": 0.06522965758563459,
"grad_norm": 3.558128348565767,
"learning_rate": 1.984682723125012e-05,
"loss": 1.3307,
"step": 15780
},
{
"epoch": 0.06527099450425666,
"grad_norm": 3.740975960278226,
"learning_rate": 1.984659843315838e-05,
"loss": 1.3565,
"step": 15790
},
{
"epoch": 0.06531233142287875,
"grad_norm": 3.0884764960813254,
"learning_rate": 1.9846369465634146e-05,
"loss": 1.3371,
"step": 15800
},
{
"epoch": 0.06535366834150082,
"grad_norm": 3.0640927344766236,
"learning_rate": 1.9846140328681363e-05,
"loss": 1.3075,
"step": 15810
},
{
"epoch": 0.0653950052601229,
"grad_norm": 3.6774626803339285,
"learning_rate": 1.9845911022303973e-05,
"loss": 1.3647,
"step": 15820
},
{
"epoch": 0.06543634217874497,
"grad_norm": 2.9365431211187065,
"learning_rate": 1.9845681546505915e-05,
"loss": 1.3086,
"step": 15830
},
{
"epoch": 0.06547767909736704,
"grad_norm": 4.305431264385432,
"learning_rate": 1.9845451901291145e-05,
"loss": 1.3348,
"step": 15840
},
{
"epoch": 0.06551901601598913,
"grad_norm": 3.032533703820296,
"learning_rate": 1.9845222086663615e-05,
"loss": 1.3527,
"step": 15850
},
{
"epoch": 0.0655603529346112,
"grad_norm": 3.3387798006802782,
"learning_rate": 1.9844992102627273e-05,
"loss": 1.3249,
"step": 15860
},
{
"epoch": 0.06560168985323327,
"grad_norm": 3.3127539852292363,
"learning_rate": 1.9844761949186083e-05,
"loss": 1.3323,
"step": 15870
},
{
"epoch": 0.06564302677185535,
"grad_norm": 3.4862694527591307,
"learning_rate": 1.9844531626344003e-05,
"loss": 1.3224,
"step": 15880
},
{
"epoch": 0.06568436369047742,
"grad_norm": 3.215035735991411,
"learning_rate": 1.9844301134104996e-05,
"loss": 1.349,
"step": 15890
},
{
"epoch": 0.06572570060909949,
"grad_norm": 3.331575129362213,
"learning_rate": 1.9844070472473026e-05,
"loss": 1.3297,
"step": 15900
},
{
"epoch": 0.06576703752772158,
"grad_norm": 3.008247289759144,
"learning_rate": 1.9843839641452062e-05,
"loss": 1.368,
"step": 15910
},
{
"epoch": 0.06580837444634365,
"grad_norm": 3.314210218559566,
"learning_rate": 1.984360864104608e-05,
"loss": 1.3318,
"step": 15920
},
{
"epoch": 0.06584971136496572,
"grad_norm": 4.460695903393647,
"learning_rate": 1.9843377471259056e-05,
"loss": 1.363,
"step": 15930
},
{
"epoch": 0.0658910482835878,
"grad_norm": 3.591421453277731,
"learning_rate": 1.984314613209496e-05,
"loss": 1.3428,
"step": 15940
},
{
"epoch": 0.06593238520220987,
"grad_norm": 3.708262991759124,
"learning_rate": 1.984291462355778e-05,
"loss": 1.3564,
"step": 15950
},
{
"epoch": 0.06597372212083194,
"grad_norm": 3.1432924399561903,
"learning_rate": 1.9842682945651495e-05,
"loss": 1.3455,
"step": 15960
},
{
"epoch": 0.06601505903945402,
"grad_norm": 3.1161735074970216,
"learning_rate": 1.9842451098380096e-05,
"loss": 1.3514,
"step": 15970
},
{
"epoch": 0.0660563959580761,
"grad_norm": 2.8632026794139875,
"learning_rate": 1.984221908174757e-05,
"loss": 1.3446,
"step": 15980
},
{
"epoch": 0.06609773287669818,
"grad_norm": 3.934735081002441,
"learning_rate": 1.9841986895757907e-05,
"loss": 1.3298,
"step": 15990
},
{
"epoch": 0.06613906979532025,
"grad_norm": 4.324584533198296,
"learning_rate": 1.9841754540415102e-05,
"loss": 1.3537,
"step": 16000
},
{
"epoch": 0.06618040671394232,
"grad_norm": 3.6119231745645166,
"learning_rate": 1.9841522015723164e-05,
"loss": 1.3343,
"step": 16010
},
{
"epoch": 0.0662217436325644,
"grad_norm": 3.1193540137010887,
"learning_rate": 1.984128932168608e-05,
"loss": 1.3752,
"step": 16020
},
{
"epoch": 0.06626308055118647,
"grad_norm": 3.1351192552066762,
"learning_rate": 1.984105645830786e-05,
"loss": 1.3289,
"step": 16030
},
{
"epoch": 0.06630441746980854,
"grad_norm": 3.5610679476787737,
"learning_rate": 1.9840823425592512e-05,
"loss": 1.3543,
"step": 16040
},
{
"epoch": 0.06634575438843063,
"grad_norm": 3.5677181048737943,
"learning_rate": 1.984059022354404e-05,
"loss": 1.3483,
"step": 16050
},
{
"epoch": 0.0663870913070527,
"grad_norm": 3.6239317747912385,
"learning_rate": 1.9840356852166465e-05,
"loss": 1.3511,
"step": 16060
},
{
"epoch": 0.06642842822567477,
"grad_norm": 3.7952366513012636,
"learning_rate": 1.9840123311463803e-05,
"loss": 1.33,
"step": 16070
},
{
"epoch": 0.06646976514429685,
"grad_norm": 3.345456868567261,
"learning_rate": 1.9839889601440064e-05,
"loss": 1.3226,
"step": 16080
},
{
"epoch": 0.06651110206291892,
"grad_norm": 3.0646963347861194,
"learning_rate": 1.9839655722099277e-05,
"loss": 1.3142,
"step": 16090
},
{
"epoch": 0.06655243898154099,
"grad_norm": 3.1796454557601956,
"learning_rate": 1.9839421673445457e-05,
"loss": 1.3363,
"step": 16100
},
{
"epoch": 0.06659377590016308,
"grad_norm": 2.7604488860103453,
"learning_rate": 1.9839187455482646e-05,
"loss": 1.3453,
"step": 16110
},
{
"epoch": 0.06663511281878515,
"grad_norm": 3.1473856534090885,
"learning_rate": 1.9838953068214862e-05,
"loss": 1.3146,
"step": 16120
},
{
"epoch": 0.06667644973740723,
"grad_norm": 3.323185903362043,
"learning_rate": 1.983871851164614e-05,
"loss": 1.3013,
"step": 16130
},
{
"epoch": 0.0667177866560293,
"grad_norm": 3.0139283592638315,
"learning_rate": 1.9838483785780522e-05,
"loss": 1.3761,
"step": 16140
},
{
"epoch": 0.06675912357465137,
"grad_norm": 3.2332451066710535,
"learning_rate": 1.9838248890622043e-05,
"loss": 1.341,
"step": 16150
},
{
"epoch": 0.06680046049327346,
"grad_norm": 3.0517814224449826,
"learning_rate": 1.9838013826174745e-05,
"loss": 1.3003,
"step": 16160
},
{
"epoch": 0.06684179741189553,
"grad_norm": 4.164899927708623,
"learning_rate": 1.983777859244267e-05,
"loss": 1.3247,
"step": 16170
},
{
"epoch": 0.0668831343305176,
"grad_norm": 2.992166731140292,
"learning_rate": 1.983754318942987e-05,
"loss": 1.2762,
"step": 16180
},
{
"epoch": 0.06692447124913968,
"grad_norm": 3.774796705592006,
"learning_rate": 1.98373076171404e-05,
"loss": 1.3652,
"step": 16190
},
{
"epoch": 0.06696580816776175,
"grad_norm": 3.6727461962589296,
"learning_rate": 1.98370718755783e-05,
"loss": 1.3433,
"step": 16200
},
{
"epoch": 0.06700714508638382,
"grad_norm": 2.759830849179559,
"learning_rate": 1.983683596474764e-05,
"loss": 1.2917,
"step": 16210
},
{
"epoch": 0.0670484820050059,
"grad_norm": 4.142227423223113,
"learning_rate": 1.983659988465247e-05,
"loss": 1.3287,
"step": 16220
},
{
"epoch": 0.06708981892362798,
"grad_norm": 3.1361138389830305,
"learning_rate": 1.9836363635296856e-05,
"loss": 1.3526,
"step": 16230
},
{
"epoch": 0.06713115584225005,
"grad_norm": 4.684718067129147,
"learning_rate": 1.9836127216684864e-05,
"loss": 1.3398,
"step": 16240
},
{
"epoch": 0.06717249276087213,
"grad_norm": 3.9305271840952325,
"learning_rate": 1.9835890628820564e-05,
"loss": 1.3061,
"step": 16250
},
{
"epoch": 0.0672138296794942,
"grad_norm": 3.6945496877183754,
"learning_rate": 1.983565387170802e-05,
"loss": 1.3046,
"step": 16260
},
{
"epoch": 0.06725516659811627,
"grad_norm": 2.913410222912495,
"learning_rate": 1.983541694535131e-05,
"loss": 1.3439,
"step": 16270
},
{
"epoch": 0.06729650351673835,
"grad_norm": 3.4482083464660143,
"learning_rate": 1.9835179849754517e-05,
"loss": 1.3282,
"step": 16280
},
{
"epoch": 0.06733784043536042,
"grad_norm": 4.311927057182653,
"learning_rate": 1.983494258492171e-05,
"loss": 1.3156,
"step": 16290
},
{
"epoch": 0.06737917735398251,
"grad_norm": 3.5718375129378015,
"learning_rate": 1.9834705150856973e-05,
"loss": 1.3088,
"step": 16300
},
{
"epoch": 0.06742051427260458,
"grad_norm": 3.6069557479034704,
"learning_rate": 1.98344675475644e-05,
"loss": 1.3259,
"step": 16310
},
{
"epoch": 0.06746185119122665,
"grad_norm": 3.2551197753818393,
"learning_rate": 1.9834229775048076e-05,
"loss": 1.3389,
"step": 16320
},
{
"epoch": 0.06750318810984873,
"grad_norm": 3.4034457092581536,
"learning_rate": 1.9833991833312086e-05,
"loss": 1.3396,
"step": 16330
},
{
"epoch": 0.0675445250284708,
"grad_norm": 2.9661916406960858,
"learning_rate": 1.9833753722360534e-05,
"loss": 1.2989,
"step": 16340
},
{
"epoch": 0.06758586194709287,
"grad_norm": 5.279255386120152,
"learning_rate": 1.983351544219751e-05,
"loss": 1.3283,
"step": 16350
},
{
"epoch": 0.06762719886571496,
"grad_norm": 3.8221920564477028,
"learning_rate": 1.9833276992827117e-05,
"loss": 1.2918,
"step": 16360
},
{
"epoch": 0.06766853578433703,
"grad_norm": 3.2255728423614163,
"learning_rate": 1.9833038374253456e-05,
"loss": 1.327,
"step": 16370
},
{
"epoch": 0.0677098727029591,
"grad_norm": 3.2355832791038464,
"learning_rate": 1.9832799586480637e-05,
"loss": 1.3204,
"step": 16380
},
{
"epoch": 0.06775120962158118,
"grad_norm": 3.2467585154210155,
"learning_rate": 1.9832560629512767e-05,
"loss": 1.3338,
"step": 16390
},
{
"epoch": 0.06779254654020325,
"grad_norm": 4.988429472619141,
"learning_rate": 1.9832321503353954e-05,
"loss": 1.3876,
"step": 16400
},
{
"epoch": 0.06783388345882532,
"grad_norm": 3.446832489576954,
"learning_rate": 1.9832082208008317e-05,
"loss": 1.3233,
"step": 16410
},
{
"epoch": 0.0678752203774474,
"grad_norm": 3.605129460226094,
"learning_rate": 1.9831842743479975e-05,
"loss": 1.3386,
"step": 16420
},
{
"epoch": 0.06791655729606948,
"grad_norm": 4.275587631874799,
"learning_rate": 1.9831603109773044e-05,
"loss": 1.3613,
"step": 16430
},
{
"epoch": 0.06795789421469156,
"grad_norm": 4.1009269376864665,
"learning_rate": 1.983136330689165e-05,
"loss": 1.3662,
"step": 16440
},
{
"epoch": 0.06799923113331363,
"grad_norm": 2.9883770968491565,
"learning_rate": 1.983112333483992e-05,
"loss": 1.3304,
"step": 16450
},
{
"epoch": 0.0680405680519357,
"grad_norm": 2.9575768154010085,
"learning_rate": 1.983088319362198e-05,
"loss": 1.3075,
"step": 16460
},
{
"epoch": 0.06808190497055779,
"grad_norm": 3.1295830557654147,
"learning_rate": 1.9830642883241967e-05,
"loss": 1.3311,
"step": 16470
},
{
"epoch": 0.06812324188917986,
"grad_norm": 3.6574005023751774,
"learning_rate": 1.9830402403704008e-05,
"loss": 1.2925,
"step": 16480
},
{
"epoch": 0.06816457880780193,
"grad_norm": 3.080968609055009,
"learning_rate": 1.9830161755012255e-05,
"loss": 1.3156,
"step": 16490
},
{
"epoch": 0.06820591572642401,
"grad_norm": 3.0330618488937553,
"learning_rate": 1.9829920937170835e-05,
"loss": 1.3314,
"step": 16500
},
{
"epoch": 0.06824725264504608,
"grad_norm": 3.3116009543610665,
"learning_rate": 1.9829679950183895e-05,
"loss": 1.3034,
"step": 16510
},
{
"epoch": 0.06828858956366815,
"grad_norm": 3.783202270094679,
"learning_rate": 1.9829438794055584e-05,
"loss": 1.3313,
"step": 16520
},
{
"epoch": 0.06832992648229023,
"grad_norm": 3.969686559597743,
"learning_rate": 1.9829197468790054e-05,
"loss": 1.2911,
"step": 16530
},
{
"epoch": 0.0683712634009123,
"grad_norm": 3.580580495241376,
"learning_rate": 1.9828955974391455e-05,
"loss": 1.2912,
"step": 16540
},
{
"epoch": 0.06841260031953438,
"grad_norm": 2.9992747255679033,
"learning_rate": 1.982871431086394e-05,
"loss": 1.302,
"step": 16550
},
{
"epoch": 0.06845393723815646,
"grad_norm": 3.5773285600866256,
"learning_rate": 1.9828472478211673e-05,
"loss": 1.334,
"step": 16560
},
{
"epoch": 0.06849527415677853,
"grad_norm": 3.647217700504523,
"learning_rate": 1.982823047643881e-05,
"loss": 1.3137,
"step": 16570
},
{
"epoch": 0.0685366110754006,
"grad_norm": 3.7929582820880428,
"learning_rate": 1.982798830554952e-05,
"loss": 1.3598,
"step": 16580
},
{
"epoch": 0.06857794799402268,
"grad_norm": 3.3921261114157817,
"learning_rate": 1.982774596554796e-05,
"loss": 1.3159,
"step": 16590
},
{
"epoch": 0.06861928491264475,
"grad_norm": 3.948788035775027,
"learning_rate": 1.9827503456438314e-05,
"loss": 1.3487,
"step": 16600
},
{
"epoch": 0.06866062183126684,
"grad_norm": 3.4434803120058595,
"learning_rate": 1.9827260778224744e-05,
"loss": 1.3611,
"step": 16610
},
{
"epoch": 0.06870195874988891,
"grad_norm": 3.345318165402726,
"learning_rate": 1.9827017930911433e-05,
"loss": 1.3214,
"step": 16620
},
{
"epoch": 0.06874329566851098,
"grad_norm": 3.3238354847739675,
"learning_rate": 1.9826774914502554e-05,
"loss": 1.3415,
"step": 16630
},
{
"epoch": 0.06878463258713306,
"grad_norm": 3.3341319261133773,
"learning_rate": 1.9826531729002293e-05,
"loss": 1.2814,
"step": 16640
},
{
"epoch": 0.06882596950575513,
"grad_norm": 3.2557833923669937,
"learning_rate": 1.982628837441483e-05,
"loss": 1.3692,
"step": 16650
},
{
"epoch": 0.0688673064243772,
"grad_norm": 2.7035167555145545,
"learning_rate": 1.9826044850744358e-05,
"loss": 1.3045,
"step": 16660
},
{
"epoch": 0.06890864334299929,
"grad_norm": 3.2563919103242167,
"learning_rate": 1.9825801157995065e-05,
"loss": 1.2807,
"step": 16670
},
{
"epoch": 0.06894998026162136,
"grad_norm": 3.369904458766467,
"learning_rate": 1.9825557296171143e-05,
"loss": 1.2897,
"step": 16680
},
{
"epoch": 0.06899131718024343,
"grad_norm": 3.064586151511127,
"learning_rate": 1.982531326527679e-05,
"loss": 1.3332,
"step": 16690
},
{
"epoch": 0.06903265409886551,
"grad_norm": 3.0263447019462753,
"learning_rate": 1.9825069065316204e-05,
"loss": 1.2825,
"step": 16700
},
{
"epoch": 0.06907399101748758,
"grad_norm": 4.065735171694178,
"learning_rate": 1.9824824696293584e-05,
"loss": 1.2698,
"step": 16710
},
{
"epoch": 0.06911532793610965,
"grad_norm": 3.4683529768298262,
"learning_rate": 1.9824580158213142e-05,
"loss": 1.3135,
"step": 16720
},
{
"epoch": 0.06915666485473174,
"grad_norm": 2.955425396507506,
"learning_rate": 1.9824335451079083e-05,
"loss": 1.3571,
"step": 16730
},
{
"epoch": 0.0691980017733538,
"grad_norm": 3.099433278967918,
"learning_rate": 1.982409057489561e-05,
"loss": 1.3273,
"step": 16740
},
{
"epoch": 0.06923933869197589,
"grad_norm": 2.8197838698402093,
"learning_rate": 1.982384552966695e-05,
"loss": 1.3158,
"step": 16750
},
{
"epoch": 0.06928067561059796,
"grad_norm": 3.422610473722703,
"learning_rate": 1.982360031539731e-05,
"loss": 1.3072,
"step": 16760
},
{
"epoch": 0.06932201252922003,
"grad_norm": 3.654024548878873,
"learning_rate": 1.9823354932090913e-05,
"loss": 1.3174,
"step": 16770
},
{
"epoch": 0.06936334944784212,
"grad_norm": 2.69963783380391,
"learning_rate": 1.982310937975198e-05,
"loss": 1.301,
"step": 16780
},
{
"epoch": 0.06940468636646419,
"grad_norm": 3.119208859314543,
"learning_rate": 1.9822863658384736e-05,
"loss": 1.2915,
"step": 16790
},
{
"epoch": 0.06944602328508626,
"grad_norm": 3.983735003830943,
"learning_rate": 1.982261776799341e-05,
"loss": 1.3029,
"step": 16800
},
{
"epoch": 0.06948736020370834,
"grad_norm": 2.8833424065520292,
"learning_rate": 1.9822371708582236e-05,
"loss": 1.309,
"step": 16810
},
{
"epoch": 0.06952869712233041,
"grad_norm": 3.5651047449269138,
"learning_rate": 1.9822125480155442e-05,
"loss": 1.3408,
"step": 16820
},
{
"epoch": 0.06957003404095248,
"grad_norm": 2.9084817767691344,
"learning_rate": 1.982187908271727e-05,
"loss": 1.3217,
"step": 16830
},
{
"epoch": 0.06961137095957456,
"grad_norm": 3.38644936958733,
"learning_rate": 1.982163251627196e-05,
"loss": 1.3094,
"step": 16840
},
{
"epoch": 0.06965270787819663,
"grad_norm": 3.7040548837026384,
"learning_rate": 1.9821385780823748e-05,
"loss": 1.2973,
"step": 16850
},
{
"epoch": 0.0696940447968187,
"grad_norm": 3.1936456120357435,
"learning_rate": 1.982113887637689e-05,
"loss": 1.3362,
"step": 16860
},
{
"epoch": 0.06973538171544079,
"grad_norm": 3.676850292318552,
"learning_rate": 1.9820891802935623e-05,
"loss": 1.2947,
"step": 16870
},
{
"epoch": 0.06977671863406286,
"grad_norm": 3.484703646839946,
"learning_rate": 1.9820644560504207e-05,
"loss": 1.2488,
"step": 16880
},
{
"epoch": 0.06981805555268493,
"grad_norm": 3.4336203726483654,
"learning_rate": 1.9820397149086892e-05,
"loss": 1.3372,
"step": 16890
},
{
"epoch": 0.06985939247130701,
"grad_norm": 3.4895422854739437,
"learning_rate": 1.9820149568687937e-05,
"loss": 1.3434,
"step": 16900
},
{
"epoch": 0.06990072938992908,
"grad_norm": 3.5294671703855545,
"learning_rate": 1.98199018193116e-05,
"loss": 1.3299,
"step": 16910
},
{
"epoch": 0.06994206630855117,
"grad_norm": 4.136918538506862,
"learning_rate": 1.9819653900962153e-05,
"loss": 1.3082,
"step": 16920
},
{
"epoch": 0.06998340322717324,
"grad_norm": 3.1337022113010975,
"learning_rate": 1.981940581364385e-05,
"loss": 1.3131,
"step": 16930
},
{
"epoch": 0.07002474014579531,
"grad_norm": 3.3461906954820115,
"learning_rate": 1.9819157557360965e-05,
"loss": 1.3533,
"step": 16940
},
{
"epoch": 0.07006607706441739,
"grad_norm": 2.945652884357033,
"learning_rate": 1.981890913211777e-05,
"loss": 1.333,
"step": 16950
},
{
"epoch": 0.07010741398303946,
"grad_norm": 3.8269320260288375,
"learning_rate": 1.981866053791854e-05,
"loss": 1.3021,
"step": 16960
},
{
"epoch": 0.07014875090166153,
"grad_norm": 3.373539442185096,
"learning_rate": 1.9818411774767555e-05,
"loss": 1.3621,
"step": 16970
},
{
"epoch": 0.07019008782028362,
"grad_norm": 3.1789158671874724,
"learning_rate": 1.9818162842669087e-05,
"loss": 1.3357,
"step": 16980
},
{
"epoch": 0.07023142473890569,
"grad_norm": 3.379334004765994,
"learning_rate": 1.981791374162743e-05,
"loss": 1.3328,
"step": 16990
},
{
"epoch": 0.07027276165752776,
"grad_norm": 3.3263838295875794,
"learning_rate": 1.981766447164686e-05,
"loss": 1.3177,
"step": 17000
},
{
"epoch": 0.07031409857614984,
"grad_norm": 3.4846748393089744,
"learning_rate": 1.9817415032731676e-05,
"loss": 1.3088,
"step": 17010
},
{
"epoch": 0.07035543549477191,
"grad_norm": 3.1166768241545526,
"learning_rate": 1.9817165424886165e-05,
"loss": 1.3168,
"step": 17020
},
{
"epoch": 0.07039677241339398,
"grad_norm": 3.880062668262267,
"learning_rate": 1.9816915648114623e-05,
"loss": 1.3071,
"step": 17030
},
{
"epoch": 0.07043810933201607,
"grad_norm": 3.511807529042466,
"learning_rate": 1.9816665702421344e-05,
"loss": 1.3409,
"step": 17040
},
{
"epoch": 0.07047944625063814,
"grad_norm": 3.1993990782018256,
"learning_rate": 1.9816415587810636e-05,
"loss": 1.2918,
"step": 17050
},
{
"epoch": 0.07052078316926022,
"grad_norm": 2.9930648253290943,
"learning_rate": 1.98161653042868e-05,
"loss": 1.2926,
"step": 17060
},
{
"epoch": 0.07056212008788229,
"grad_norm": 3.0399828063698116,
"learning_rate": 1.981591485185414e-05,
"loss": 1.3183,
"step": 17070
},
{
"epoch": 0.07060345700650436,
"grad_norm": 3.233401985951665,
"learning_rate": 1.981566423051697e-05,
"loss": 1.3262,
"step": 17080
},
{
"epoch": 0.07064479392512645,
"grad_norm": 2.891852783093535,
"learning_rate": 1.9815413440279597e-05,
"loss": 1.2882,
"step": 17090
},
{
"epoch": 0.07068613084374852,
"grad_norm": 2.861582978848217,
"learning_rate": 1.9815162481146345e-05,
"loss": 1.3417,
"step": 17100
},
{
"epoch": 0.07072746776237059,
"grad_norm": 3.657647746254238,
"learning_rate": 1.981491135312152e-05,
"loss": 1.3239,
"step": 17110
},
{
"epoch": 0.07076880468099267,
"grad_norm": 3.422931095645904,
"learning_rate": 1.9814660056209454e-05,
"loss": 1.3471,
"step": 17120
},
{
"epoch": 0.07081014159961474,
"grad_norm": 3.233056855771981,
"learning_rate": 1.9814408590414466e-05,
"loss": 1.342,
"step": 17130
},
{
"epoch": 0.07085147851823681,
"grad_norm": 2.8978002673834706,
"learning_rate": 1.9814156955740885e-05,
"loss": 1.3526,
"step": 17140
},
{
"epoch": 0.0708928154368589,
"grad_norm": 3.1044166179845227,
"learning_rate": 1.981390515219304e-05,
"loss": 1.3338,
"step": 17150
},
{
"epoch": 0.07093415235548096,
"grad_norm": 3.4443852171387963,
"learning_rate": 1.9813653179775263e-05,
"loss": 1.2798,
"step": 17160
},
{
"epoch": 0.07097548927410303,
"grad_norm": 3.4197392236572925,
"learning_rate": 1.9813401038491893e-05,
"loss": 1.3278,
"step": 17170
},
{
"epoch": 0.07101682619272512,
"grad_norm": 2.8656903185698868,
"learning_rate": 1.9813148728347263e-05,
"loss": 1.3044,
"step": 17180
},
{
"epoch": 0.07105816311134719,
"grad_norm": 3.4951853787951457,
"learning_rate": 1.981289624934572e-05,
"loss": 1.3731,
"step": 17190
},
{
"epoch": 0.07109950002996926,
"grad_norm": 3.0021246003594646,
"learning_rate": 1.981264360149161e-05,
"loss": 1.2953,
"step": 17200
},
{
"epoch": 0.07114083694859134,
"grad_norm": 3.078524102988392,
"learning_rate": 1.981239078478927e-05,
"loss": 1.3536,
"step": 17210
},
{
"epoch": 0.07118217386721341,
"grad_norm": 3.6935463858453805,
"learning_rate": 1.981213779924306e-05,
"loss": 1.3386,
"step": 17220
},
{
"epoch": 0.0712235107858355,
"grad_norm": 3.2508773899525036,
"learning_rate": 1.9811884644857332e-05,
"loss": 1.2929,
"step": 17230
},
{
"epoch": 0.07126484770445757,
"grad_norm": 4.11272659457257,
"learning_rate": 1.9811631321636438e-05,
"loss": 1.3376,
"step": 17240
},
{
"epoch": 0.07130618462307964,
"grad_norm": 3.3031386708100734,
"learning_rate": 1.9811377829584738e-05,
"loss": 1.3078,
"step": 17250
},
{
"epoch": 0.07134752154170172,
"grad_norm": 3.721567213304264,
"learning_rate": 1.9811124168706598e-05,
"loss": 1.3135,
"step": 17260
},
{
"epoch": 0.07138885846032379,
"grad_norm": 3.3077256345455712,
"learning_rate": 1.981087033900638e-05,
"loss": 1.2727,
"step": 17270
},
{
"epoch": 0.07143019537894586,
"grad_norm": 3.1365945025331716,
"learning_rate": 1.9810616340488448e-05,
"loss": 1.3082,
"step": 17280
},
{
"epoch": 0.07147153229756795,
"grad_norm": 3.0095750827782095,
"learning_rate": 1.981036217315718e-05,
"loss": 1.3422,
"step": 17290
},
{
"epoch": 0.07151286921619002,
"grad_norm": 3.879458456856346,
"learning_rate": 1.9810107837016943e-05,
"loss": 1.3444,
"step": 17300
},
{
"epoch": 0.07155420613481209,
"grad_norm": 3.5419514306232145,
"learning_rate": 1.9809853332072118e-05,
"loss": 1.3495,
"step": 17310
},
{
"epoch": 0.07159554305343417,
"grad_norm": 3.309317975252579,
"learning_rate": 1.9809598658327084e-05,
"loss": 1.3249,
"step": 17320
},
{
"epoch": 0.07163687997205624,
"grad_norm": 3.0972043541501484,
"learning_rate": 1.9809343815786218e-05,
"loss": 1.3289,
"step": 17330
},
{
"epoch": 0.07167821689067831,
"grad_norm": 3.278786808325931,
"learning_rate": 1.9809088804453913e-05,
"loss": 1.2998,
"step": 17340
},
{
"epoch": 0.0717195538093004,
"grad_norm": 4.238835899003122,
"learning_rate": 1.9808833624334547e-05,
"loss": 1.3311,
"step": 17350
},
{
"epoch": 0.07176089072792247,
"grad_norm": 2.92277331992324,
"learning_rate": 1.980857827543252e-05,
"loss": 1.2915,
"step": 17360
},
{
"epoch": 0.07180222764654455,
"grad_norm": 3.376501582542177,
"learning_rate": 1.9808322757752227e-05,
"loss": 1.3324,
"step": 17370
},
{
"epoch": 0.07184356456516662,
"grad_norm": 2.9710666955071754,
"learning_rate": 1.9808067071298057e-05,
"loss": 1.28,
"step": 17380
},
{
"epoch": 0.07188490148378869,
"grad_norm": 3.651318543567687,
"learning_rate": 1.9807811216074412e-05,
"loss": 1.2827,
"step": 17390
},
{
"epoch": 0.07192623840241077,
"grad_norm": 4.002270792783564,
"learning_rate": 1.9807555192085697e-05,
"loss": 1.2869,
"step": 17400
},
{
"epoch": 0.07196757532103285,
"grad_norm": 3.7494249341770027,
"learning_rate": 1.9807298999336316e-05,
"loss": 1.368,
"step": 17410
},
{
"epoch": 0.07200891223965492,
"grad_norm": 3.533387599209301,
"learning_rate": 1.9807042637830677e-05,
"loss": 1.3093,
"step": 17420
},
{
"epoch": 0.072050249158277,
"grad_norm": 3.3204434550557096,
"learning_rate": 1.980678610757319e-05,
"loss": 1.3557,
"step": 17430
},
{
"epoch": 0.07209158607689907,
"grad_norm": 3.1416247569905935,
"learning_rate": 1.9806529408568274e-05,
"loss": 1.2784,
"step": 17440
},
{
"epoch": 0.07213292299552114,
"grad_norm": 3.117796981445201,
"learning_rate": 1.980627254082034e-05,
"loss": 1.2882,
"step": 17450
},
{
"epoch": 0.07217425991414322,
"grad_norm": 3.13339952835746,
"learning_rate": 1.9806015504333812e-05,
"loss": 1.3144,
"step": 17460
},
{
"epoch": 0.0722155968327653,
"grad_norm": 3.7609938207052074,
"learning_rate": 1.9805758299113115e-05,
"loss": 1.329,
"step": 17470
},
{
"epoch": 0.07225693375138736,
"grad_norm": 3.1345386262954533,
"learning_rate": 1.980550092516267e-05,
"loss": 1.3049,
"step": 17480
},
{
"epoch": 0.07229827067000945,
"grad_norm": 3.7469777473015573,
"learning_rate": 1.98052433824869e-05,
"loss": 1.3096,
"step": 17490
},
{
"epoch": 0.07233960758863152,
"grad_norm": 2.9933582431757073,
"learning_rate": 1.9804985671090252e-05,
"loss": 1.3274,
"step": 17500
},
{
"epoch": 0.07238094450725359,
"grad_norm": 3.182564552885168,
"learning_rate": 1.980472779097715e-05,
"loss": 1.2891,
"step": 17510
},
{
"epoch": 0.07242228142587567,
"grad_norm": 3.8813847623233744,
"learning_rate": 1.9804469742152035e-05,
"loss": 1.2941,
"step": 17520
},
{
"epoch": 0.07246361834449774,
"grad_norm": 3.388309470760254,
"learning_rate": 1.9804211524619345e-05,
"loss": 1.3129,
"step": 17530
},
{
"epoch": 0.07250495526311983,
"grad_norm": 3.0370861904609305,
"learning_rate": 1.9803953138383523e-05,
"loss": 1.2816,
"step": 17540
},
{
"epoch": 0.0725462921817419,
"grad_norm": 3.255739551897295,
"learning_rate": 1.980369458344902e-05,
"loss": 1.3048,
"step": 17550
},
{
"epoch": 0.07258762910036397,
"grad_norm": 3.2197652995956325,
"learning_rate": 1.9803435859820278e-05,
"loss": 1.3401,
"step": 17560
},
{
"epoch": 0.07262896601898605,
"grad_norm": 3.144102345108867,
"learning_rate": 1.9803176967501752e-05,
"loss": 1.3093,
"step": 17570
},
{
"epoch": 0.07267030293760812,
"grad_norm": 3.255959441623639,
"learning_rate": 1.98029179064979e-05,
"loss": 1.3107,
"step": 17580
},
{
"epoch": 0.07271163985623019,
"grad_norm": 3.300836279803978,
"learning_rate": 1.9802658676813177e-05,
"loss": 1.2793,
"step": 17590
},
{
"epoch": 0.07275297677485228,
"grad_norm": 3.1982961633057307,
"learning_rate": 1.980239927845204e-05,
"loss": 1.293,
"step": 17600
},
{
"epoch": 0.07279431369347435,
"grad_norm": 2.7735162621143066,
"learning_rate": 1.980213971141896e-05,
"loss": 1.2829,
"step": 17610
},
{
"epoch": 0.07283565061209642,
"grad_norm": 3.1127853241409524,
"learning_rate": 1.9801879975718397e-05,
"loss": 1.3038,
"step": 17620
},
{
"epoch": 0.0728769875307185,
"grad_norm": 4.4882400102517,
"learning_rate": 1.9801620071354823e-05,
"loss": 1.3252,
"step": 17630
},
{
"epoch": 0.07291832444934057,
"grad_norm": 3.0527302986554474,
"learning_rate": 1.980135999833271e-05,
"loss": 1.3324,
"step": 17640
},
{
"epoch": 0.07295966136796264,
"grad_norm": 2.6769380820391637,
"learning_rate": 1.9801099756656534e-05,
"loss": 1.3472,
"step": 17650
},
{
"epoch": 0.07300099828658473,
"grad_norm": 2.8240010897597796,
"learning_rate": 1.980083934633077e-05,
"loss": 1.3072,
"step": 17660
},
{
"epoch": 0.0730423352052068,
"grad_norm": 3.0678813341238387,
"learning_rate": 1.9800578767359905e-05,
"loss": 1.3385,
"step": 17670
},
{
"epoch": 0.07308367212382888,
"grad_norm": 3.9303518801684056,
"learning_rate": 1.9800318019748414e-05,
"loss": 1.3024,
"step": 17680
},
{
"epoch": 0.07312500904245095,
"grad_norm": 3.4138877073872367,
"learning_rate": 1.980005710350079e-05,
"loss": 1.3219,
"step": 17690
},
{
"epoch": 0.07316634596107302,
"grad_norm": 2.9612831417650574,
"learning_rate": 1.9799796018621523e-05,
"loss": 1.2972,
"step": 17700
},
{
"epoch": 0.0732076828796951,
"grad_norm": 3.5999340523181167,
"learning_rate": 1.9799534765115106e-05,
"loss": 1.2879,
"step": 17710
},
{
"epoch": 0.07324901979831717,
"grad_norm": 3.243493407552293,
"learning_rate": 1.9799273342986027e-05,
"loss": 1.3312,
"step": 17720
},
{
"epoch": 0.07329035671693925,
"grad_norm": 3.1251098781902678,
"learning_rate": 1.979901175223879e-05,
"loss": 1.283,
"step": 17730
},
{
"epoch": 0.07333169363556133,
"grad_norm": 3.016302925303861,
"learning_rate": 1.97987499928779e-05,
"loss": 1.2653,
"step": 17740
},
{
"epoch": 0.0733730305541834,
"grad_norm": 3.0676270506907373,
"learning_rate": 1.9798488064907854e-05,
"loss": 1.3463,
"step": 17750
},
{
"epoch": 0.07341436747280547,
"grad_norm": 3.33093266064901,
"learning_rate": 1.9798225968333162e-05,
"loss": 1.2925,
"step": 17760
},
{
"epoch": 0.07345570439142755,
"grad_norm": 2.9945442082263707,
"learning_rate": 1.9797963703158338e-05,
"loss": 1.3073,
"step": 17770
},
{
"epoch": 0.07349704131004962,
"grad_norm": 2.7596863662002145,
"learning_rate": 1.9797701269387886e-05,
"loss": 1.3014,
"step": 17780
},
{
"epoch": 0.0735383782286717,
"grad_norm": 3.533092084537244,
"learning_rate": 1.979743866702633e-05,
"loss": 1.2791,
"step": 17790
},
{
"epoch": 0.07357971514729378,
"grad_norm": 4.416928923354257,
"learning_rate": 1.9797175896078183e-05,
"loss": 1.3187,
"step": 17800
},
{
"epoch": 0.07362105206591585,
"grad_norm": 3.4889626149350983,
"learning_rate": 1.9796912956547968e-05,
"loss": 1.3279,
"step": 17810
},
{
"epoch": 0.07366238898453792,
"grad_norm": 2.8612849638284636,
"learning_rate": 1.979664984844021e-05,
"loss": 1.286,
"step": 17820
},
{
"epoch": 0.07370372590316,
"grad_norm": 3.48443371509397,
"learning_rate": 1.9796386571759437e-05,
"loss": 1.3143,
"step": 17830
},
{
"epoch": 0.07374506282178207,
"grad_norm": 3.744181699697567,
"learning_rate": 1.979612312651018e-05,
"loss": 1.3133,
"step": 17840
},
{
"epoch": 0.07378639974040416,
"grad_norm": 3.0686519126186056,
"learning_rate": 1.9795859512696974e-05,
"loss": 1.3136,
"step": 17850
},
{
"epoch": 0.07382773665902623,
"grad_norm": 3.067495520118401,
"learning_rate": 1.9795595730324347e-05,
"loss": 1.3026,
"step": 17860
},
{
"epoch": 0.0738690735776483,
"grad_norm": 3.39693600568867,
"learning_rate": 1.9795331779396846e-05,
"loss": 1.3045,
"step": 17870
},
{
"epoch": 0.07391041049627038,
"grad_norm": 3.593857826641827,
"learning_rate": 1.9795067659919008e-05,
"loss": 1.3278,
"step": 17880
},
{
"epoch": 0.07395174741489245,
"grad_norm": 3.5924096708123985,
"learning_rate": 1.9794803371895383e-05,
"loss": 1.2578,
"step": 17890
},
{
"epoch": 0.07399308433351452,
"grad_norm": 3.1548306467737546,
"learning_rate": 1.9794538915330514e-05,
"loss": 1.3145,
"step": 17900
},
{
"epoch": 0.0740344212521366,
"grad_norm": 3.3874308529909736,
"learning_rate": 1.979427429022895e-05,
"loss": 1.2635,
"step": 17910
},
{
"epoch": 0.07407575817075868,
"grad_norm": 3.574121943476516,
"learning_rate": 1.979400949659525e-05,
"loss": 1.3333,
"step": 17920
},
{
"epoch": 0.07411709508938075,
"grad_norm": 3.277207731814223,
"learning_rate": 1.9793744534433968e-05,
"loss": 1.3438,
"step": 17930
},
{
"epoch": 0.07415843200800283,
"grad_norm": 3.8968392073501152,
"learning_rate": 1.979347940374966e-05,
"loss": 1.313,
"step": 17940
},
{
"epoch": 0.0741997689266249,
"grad_norm": 3.083444751622717,
"learning_rate": 1.9793214104546895e-05,
"loss": 1.3146,
"step": 17950
},
{
"epoch": 0.07424110584524697,
"grad_norm": 3.2007909625237208,
"learning_rate": 1.9792948636830235e-05,
"loss": 1.2767,
"step": 17960
},
{
"epoch": 0.07428244276386906,
"grad_norm": 3.475285388439305,
"learning_rate": 1.979268300060424e-05,
"loss": 1.3113,
"step": 17970
},
{
"epoch": 0.07432377968249113,
"grad_norm": 3.0728058276568064,
"learning_rate": 1.9792417195873496e-05,
"loss": 1.2625,
"step": 17980
},
{
"epoch": 0.07436511660111321,
"grad_norm": 3.1462512846060595,
"learning_rate": 1.9792151222642565e-05,
"loss": 1.2669,
"step": 17990
},
{
"epoch": 0.07440645351973528,
"grad_norm": 3.451097468244895,
"learning_rate": 1.9791885080916026e-05,
"loss": 1.3536,
"step": 18000
},
{
"epoch": 0.07444779043835735,
"grad_norm": 2.9123675277858805,
"learning_rate": 1.979161877069846e-05,
"loss": 1.2859,
"step": 18010
},
{
"epoch": 0.07448912735697943,
"grad_norm": 3.6048473916322905,
"learning_rate": 1.9791352291994453e-05,
"loss": 1.2868,
"step": 18020
},
{
"epoch": 0.0745304642756015,
"grad_norm": 2.877236521609129,
"learning_rate": 1.9791085644808588e-05,
"loss": 1.3201,
"step": 18030
},
{
"epoch": 0.07457180119422357,
"grad_norm": 3.346518920497629,
"learning_rate": 1.9790818829145447e-05,
"loss": 1.2914,
"step": 18040
},
{
"epoch": 0.07461313811284566,
"grad_norm": 3.5346578512252416,
"learning_rate": 1.979055184500963e-05,
"loss": 1.2668,
"step": 18050
},
{
"epoch": 0.07465447503146773,
"grad_norm": 3.044418176627761,
"learning_rate": 1.9790284692405723e-05,
"loss": 1.2722,
"step": 18060
},
{
"epoch": 0.0746958119500898,
"grad_norm": 3.4743539555195206,
"learning_rate": 1.979001737133833e-05,
"loss": 1.3093,
"step": 18070
},
{
"epoch": 0.07473714886871188,
"grad_norm": 3.152139034733342,
"learning_rate": 1.978974988181205e-05,
"loss": 1.3282,
"step": 18080
},
{
"epoch": 0.07477848578733395,
"grad_norm": 3.844127465592703,
"learning_rate": 1.978948222383148e-05,
"loss": 1.3,
"step": 18090
},
{
"epoch": 0.07481982270595602,
"grad_norm": 3.312821234732469,
"learning_rate": 1.9789214397401233e-05,
"loss": 1.3007,
"step": 18100
},
{
"epoch": 0.07486115962457811,
"grad_norm": 2.587430778686902,
"learning_rate": 1.978894640252591e-05,
"loss": 1.2712,
"step": 18110
},
{
"epoch": 0.07490249654320018,
"grad_norm": 3.6471232765625867,
"learning_rate": 1.978867823921013e-05,
"loss": 1.3255,
"step": 18120
},
{
"epoch": 0.07494383346182225,
"grad_norm": 3.4191203190660966,
"learning_rate": 1.9788409907458502e-05,
"loss": 1.2588,
"step": 18130
},
{
"epoch": 0.07498517038044433,
"grad_norm": 3.3115429776293466,
"learning_rate": 1.9788141407275643e-05,
"loss": 1.2923,
"step": 18140
},
{
"epoch": 0.0750265072990664,
"grad_norm": 3.360103375582472,
"learning_rate": 1.9787872738666182e-05,
"loss": 1.3274,
"step": 18150
},
{
"epoch": 0.07506784421768849,
"grad_norm": 4.4310427169860205,
"learning_rate": 1.978760390163473e-05,
"loss": 1.3237,
"step": 18160
},
{
"epoch": 0.07510918113631056,
"grad_norm": 2.794577806503562,
"learning_rate": 1.9787334896185916e-05,
"loss": 1.3095,
"step": 18170
},
{
"epoch": 0.07515051805493263,
"grad_norm": 3.3585888834525384,
"learning_rate": 1.9787065722324374e-05,
"loss": 1.3199,
"step": 18180
},
{
"epoch": 0.07519185497355471,
"grad_norm": 3.060067450654041,
"learning_rate": 1.9786796380054733e-05,
"loss": 1.2532,
"step": 18190
},
{
"epoch": 0.07523319189217678,
"grad_norm": 3.124345955834219,
"learning_rate": 1.978652686938163e-05,
"loss": 1.2875,
"step": 18200
},
{
"epoch": 0.07527452881079885,
"grad_norm": 3.6658880497970254,
"learning_rate": 1.9786257190309695e-05,
"loss": 1.301,
"step": 18210
},
{
"epoch": 0.07531586572942094,
"grad_norm": 3.7099075204316088,
"learning_rate": 1.9785987342843573e-05,
"loss": 1.343,
"step": 18220
},
{
"epoch": 0.075357202648043,
"grad_norm": 3.26000011118403,
"learning_rate": 1.9785717326987914e-05,
"loss": 1.2782,
"step": 18230
},
{
"epoch": 0.07539853956666508,
"grad_norm": 3.133092069419479,
"learning_rate": 1.978544714274735e-05,
"loss": 1.3026,
"step": 18240
},
{
"epoch": 0.07543987648528716,
"grad_norm": 3.1564032343316453,
"learning_rate": 1.9785176790126542e-05,
"loss": 1.3207,
"step": 18250
},
{
"epoch": 0.07548121340390923,
"grad_norm": 3.284115672721678,
"learning_rate": 1.9784906269130137e-05,
"loss": 1.3117,
"step": 18260
},
{
"epoch": 0.0755225503225313,
"grad_norm": 3.1339318140995167,
"learning_rate": 1.9784635579762793e-05,
"loss": 1.305,
"step": 18270
},
{
"epoch": 0.07556388724115339,
"grad_norm": 4.379220492841367,
"learning_rate": 1.9784364722029165e-05,
"loss": 1.3408,
"step": 18280
},
{
"epoch": 0.07560522415977546,
"grad_norm": 3.9318661880011945,
"learning_rate": 1.978409369593391e-05,
"loss": 1.3053,
"step": 18290
},
{
"epoch": 0.07564656107839754,
"grad_norm": 3.3417710306838253,
"learning_rate": 1.97838225014817e-05,
"loss": 1.2781,
"step": 18300
},
{
"epoch": 0.07568789799701961,
"grad_norm": 3.54702678079395,
"learning_rate": 1.9783551138677197e-05,
"loss": 1.2998,
"step": 18310
},
{
"epoch": 0.07572923491564168,
"grad_norm": 3.4741727106428946,
"learning_rate": 1.978327960752507e-05,
"loss": 1.3003,
"step": 18320
},
{
"epoch": 0.07577057183426376,
"grad_norm": 3.2924386529485563,
"learning_rate": 1.9783007908029995e-05,
"loss": 1.3095,
"step": 18330
},
{
"epoch": 0.07581190875288583,
"grad_norm": 3.516437725795232,
"learning_rate": 1.978273604019664e-05,
"loss": 1.3088,
"step": 18340
},
{
"epoch": 0.0758532456715079,
"grad_norm": 3.5342954273687193,
"learning_rate": 1.9782464004029692e-05,
"loss": 1.3035,
"step": 18350
},
{
"epoch": 0.07589458259012999,
"grad_norm": 3.3786994777959523,
"learning_rate": 1.9782191799533824e-05,
"loss": 1.315,
"step": 18360
},
{
"epoch": 0.07593591950875206,
"grad_norm": 3.16392973516022,
"learning_rate": 1.9781919426713725e-05,
"loss": 1.3363,
"step": 18370
},
{
"epoch": 0.07597725642737413,
"grad_norm": 3.5807733324507693,
"learning_rate": 1.9781646885574078e-05,
"loss": 1.313,
"step": 18380
},
{
"epoch": 0.07601859334599621,
"grad_norm": 2.911047627936505,
"learning_rate": 1.978137417611958e-05,
"loss": 1.2943,
"step": 18390
},
{
"epoch": 0.07605993026461828,
"grad_norm": 3.519244637721149,
"learning_rate": 1.9781101298354913e-05,
"loss": 1.3013,
"step": 18400
},
{
"epoch": 0.07610126718324035,
"grad_norm": 2.80355546975897,
"learning_rate": 1.9780828252284778e-05,
"loss": 1.319,
"step": 18410
},
{
"epoch": 0.07614260410186244,
"grad_norm": 3.272390505391555,
"learning_rate": 1.9780555037913874e-05,
"loss": 1.3139,
"step": 18420
},
{
"epoch": 0.07618394102048451,
"grad_norm": 4.373312173753661,
"learning_rate": 1.9780281655246903e-05,
"loss": 1.332,
"step": 18430
},
{
"epoch": 0.07622527793910659,
"grad_norm": 3.4593509090352828,
"learning_rate": 1.9780008104288566e-05,
"loss": 1.3275,
"step": 18440
},
{
"epoch": 0.07626661485772866,
"grad_norm": 3.988419562725868,
"learning_rate": 1.9779734385043572e-05,
"loss": 1.2876,
"step": 18450
},
{
"epoch": 0.07630795177635073,
"grad_norm": 3.1901407057893127,
"learning_rate": 1.9779460497516633e-05,
"loss": 1.3077,
"step": 18460
},
{
"epoch": 0.07634928869497282,
"grad_norm": 3.392493661451085,
"learning_rate": 1.9779186441712456e-05,
"loss": 1.2853,
"step": 18470
},
{
"epoch": 0.07639062561359489,
"grad_norm": 3.2188126520970126,
"learning_rate": 1.9778912217635762e-05,
"loss": 1.3166,
"step": 18480
},
{
"epoch": 0.07643196253221696,
"grad_norm": 3.207853073897323,
"learning_rate": 1.9778637825291267e-05,
"loss": 1.2522,
"step": 18490
},
{
"epoch": 0.07647329945083904,
"grad_norm": 3.614534499504545,
"learning_rate": 1.9778363264683694e-05,
"loss": 1.2857,
"step": 18500
},
{
"epoch": 0.07651463636946111,
"grad_norm": 3.1137575271670634,
"learning_rate": 1.9778088535817765e-05,
"loss": 1.322,
"step": 18510
},
{
"epoch": 0.07655597328808318,
"grad_norm": 3.155733653317413,
"learning_rate": 1.977781363869821e-05,
"loss": 1.2775,
"step": 18520
},
{
"epoch": 0.07659731020670527,
"grad_norm": 4.553902503352338,
"learning_rate": 1.9777538573329757e-05,
"loss": 1.3102,
"step": 18530
},
{
"epoch": 0.07663864712532734,
"grad_norm": 3.421736022180327,
"learning_rate": 1.9777263339717143e-05,
"loss": 1.3101,
"step": 18540
},
{
"epoch": 0.0766799840439494,
"grad_norm": 3.0688727373371245,
"learning_rate": 1.97769879378651e-05,
"loss": 1.3116,
"step": 18550
},
{
"epoch": 0.07672132096257149,
"grad_norm": 3.7016289513150173,
"learning_rate": 1.977671236777837e-05,
"loss": 1.3266,
"step": 18560
},
{
"epoch": 0.07676265788119356,
"grad_norm": 2.918955840628811,
"learning_rate": 1.977643662946169e-05,
"loss": 1.335,
"step": 18570
},
{
"epoch": 0.07680399479981563,
"grad_norm": 2.710819215083447,
"learning_rate": 1.9776160722919808e-05,
"loss": 1.3241,
"step": 18580
},
{
"epoch": 0.07684533171843771,
"grad_norm": 4.690835156163024,
"learning_rate": 1.9775884648157473e-05,
"loss": 1.3112,
"step": 18590
},
{
"epoch": 0.07688666863705979,
"grad_norm": 2.8651231551210685,
"learning_rate": 1.9775608405179433e-05,
"loss": 1.2753,
"step": 18600
},
{
"epoch": 0.07692800555568187,
"grad_norm": 3.440907539318232,
"learning_rate": 1.9775331993990445e-05,
"loss": 1.3065,
"step": 18610
},
{
"epoch": 0.07696934247430394,
"grad_norm": 3.284791911253238,
"learning_rate": 1.977505541459526e-05,
"loss": 1.2491,
"step": 18620
},
{
"epoch": 0.07701067939292601,
"grad_norm": 3.9127411714280815,
"learning_rate": 1.977477866699864e-05,
"loss": 1.3486,
"step": 18630
},
{
"epoch": 0.0770520163115481,
"grad_norm": 3.6832173528899292,
"learning_rate": 1.9774501751205343e-05,
"loss": 1.26,
"step": 18640
},
{
"epoch": 0.07709335323017016,
"grad_norm": 3.037263148140778,
"learning_rate": 1.9774224667220145e-05,
"loss": 1.3066,
"step": 18650
},
{
"epoch": 0.07713469014879223,
"grad_norm": 3.9474254309978978,
"learning_rate": 1.97739474150478e-05,
"loss": 1.3526,
"step": 18660
},
{
"epoch": 0.07717602706741432,
"grad_norm": 3.2632360352472087,
"learning_rate": 1.977366999469309e-05,
"loss": 1.242,
"step": 18670
},
{
"epoch": 0.07721736398603639,
"grad_norm": 3.398609819663782,
"learning_rate": 1.977339240616078e-05,
"loss": 1.3083,
"step": 18680
},
{
"epoch": 0.07725870090465846,
"grad_norm": 2.8558120813505274,
"learning_rate": 1.977311464945565e-05,
"loss": 1.3002,
"step": 18690
},
{
"epoch": 0.07730003782328054,
"grad_norm": 3.3599778107706553,
"learning_rate": 1.9772836724582483e-05,
"loss": 1.3299,
"step": 18700
},
{
"epoch": 0.07734137474190261,
"grad_norm": 3.9628018743002658,
"learning_rate": 1.9772558631546054e-05,
"loss": 1.3115,
"step": 18710
},
{
"epoch": 0.07738271166052468,
"grad_norm": 3.239048590526076,
"learning_rate": 1.9772280370351155e-05,
"loss": 1.2683,
"step": 18720
},
{
"epoch": 0.07742404857914677,
"grad_norm": 3.0722778687848558,
"learning_rate": 1.977200194100257e-05,
"loss": 1.3484,
"step": 18730
},
{
"epoch": 0.07746538549776884,
"grad_norm": 3.1378953738889637,
"learning_rate": 1.9771723343505093e-05,
"loss": 1.353,
"step": 18740
},
{
"epoch": 0.07750672241639092,
"grad_norm": 2.9849102736288113,
"learning_rate": 1.9771444577863517e-05,
"loss": 1.3318,
"step": 18750
},
{
"epoch": 0.07754805933501299,
"grad_norm": 3.006210181091503,
"learning_rate": 1.9771165644082636e-05,
"loss": 1.3095,
"step": 18760
},
{
"epoch": 0.07758939625363506,
"grad_norm": 2.8083464465567074,
"learning_rate": 1.9770886542167252e-05,
"loss": 1.2896,
"step": 18770
},
{
"epoch": 0.07763073317225715,
"grad_norm": 3.262789038521565,
"learning_rate": 1.9770607272122168e-05,
"loss": 1.3333,
"step": 18780
},
{
"epoch": 0.07767207009087922,
"grad_norm": 3.421189746770822,
"learning_rate": 1.9770327833952187e-05,
"loss": 1.28,
"step": 18790
},
{
"epoch": 0.07771340700950129,
"grad_norm": 3.006884991049078,
"learning_rate": 1.977004822766212e-05,
"loss": 1.2982,
"step": 18800
},
{
"epoch": 0.07775474392812337,
"grad_norm": 2.8631010619173427,
"learning_rate": 1.976976845325678e-05,
"loss": 1.2924,
"step": 18810
},
{
"epoch": 0.07779608084674544,
"grad_norm": 3.4772737519626533,
"learning_rate": 1.9769488510740974e-05,
"loss": 1.2927,
"step": 18820
},
{
"epoch": 0.07783741776536751,
"grad_norm": 2.7187723773853967,
"learning_rate": 1.976920840011953e-05,
"loss": 1.2868,
"step": 18830
},
{
"epoch": 0.0778787546839896,
"grad_norm": 3.145525828551427,
"learning_rate": 1.9768928121397253e-05,
"loss": 1.2662,
"step": 18840
},
{
"epoch": 0.07792009160261167,
"grad_norm": 3.6834084868042205,
"learning_rate": 1.9768647674578978e-05,
"loss": 1.2916,
"step": 18850
},
{
"epoch": 0.07796142852123374,
"grad_norm": 3.2384861614856697,
"learning_rate": 1.976836705966953e-05,
"loss": 1.2719,
"step": 18860
},
{
"epoch": 0.07800276543985582,
"grad_norm": 3.06441372291841,
"learning_rate": 1.976808627667373e-05,
"loss": 1.3137,
"step": 18870
},
{
"epoch": 0.07804410235847789,
"grad_norm": 3.1960269763685565,
"learning_rate": 1.9767805325596417e-05,
"loss": 1.2943,
"step": 18880
},
{
"epoch": 0.07808543927709996,
"grad_norm": 3.6639857210827778,
"learning_rate": 1.976752420644242e-05,
"loss": 1.2634,
"step": 18890
},
{
"epoch": 0.07812677619572204,
"grad_norm": 2.8992259499910564,
"learning_rate": 1.976724291921658e-05,
"loss": 1.3205,
"step": 18900
},
{
"epoch": 0.07816811311434411,
"grad_norm": 3.8410116582710963,
"learning_rate": 1.9766961463923735e-05,
"loss": 1.2778,
"step": 18910
},
{
"epoch": 0.0782094500329662,
"grad_norm": 4.723075181703024,
"learning_rate": 1.976667984056873e-05,
"loss": 1.2998,
"step": 18920
},
{
"epoch": 0.07825078695158827,
"grad_norm": 3.371508438100944,
"learning_rate": 1.976639804915641e-05,
"loss": 1.2512,
"step": 18930
},
{
"epoch": 0.07829212387021034,
"grad_norm": 3.6639818461414677,
"learning_rate": 1.976611608969162e-05,
"loss": 1.2961,
"step": 18940
},
{
"epoch": 0.07833346078883242,
"grad_norm": 2.8877225304005525,
"learning_rate": 1.976583396217922e-05,
"loss": 1.3345,
"step": 18950
},
{
"epoch": 0.0783747977074545,
"grad_norm": 3.282148781915577,
"learning_rate": 1.9765551666624062e-05,
"loss": 1.3293,
"step": 18960
},
{
"epoch": 0.07841613462607656,
"grad_norm": 3.1290209387742007,
"learning_rate": 1.9765269203030996e-05,
"loss": 1.3202,
"step": 18970
},
{
"epoch": 0.07845747154469865,
"grad_norm": 2.9401948590630798,
"learning_rate": 1.9764986571404892e-05,
"loss": 1.2739,
"step": 18980
},
{
"epoch": 0.07849880846332072,
"grad_norm": 2.726413528264204,
"learning_rate": 1.9764703771750606e-05,
"loss": 1.3417,
"step": 18990
},
{
"epoch": 0.07854014538194279,
"grad_norm": 3.8708836373349693,
"learning_rate": 1.976442080407301e-05,
"loss": 1.2817,
"step": 19000
},
{
"epoch": 0.07858148230056487,
"grad_norm": 2.9568442793661225,
"learning_rate": 1.976413766837697e-05,
"loss": 1.3157,
"step": 19010
},
{
"epoch": 0.07862281921918694,
"grad_norm": 3.2014326193422895,
"learning_rate": 1.9763854364667355e-05,
"loss": 1.2734,
"step": 19020
},
{
"epoch": 0.07866415613780901,
"grad_norm": 3.8142983049855204,
"learning_rate": 1.9763570892949048e-05,
"loss": 1.3268,
"step": 19030
},
{
"epoch": 0.0787054930564311,
"grad_norm": 3.2687191971325555,
"learning_rate": 1.976328725322692e-05,
"loss": 1.2922,
"step": 19040
},
{
"epoch": 0.07874682997505317,
"grad_norm": 3.8385310533023276,
"learning_rate": 1.9763003445505854e-05,
"loss": 1.274,
"step": 19050
},
{
"epoch": 0.07878816689367525,
"grad_norm": 3.11002301324885,
"learning_rate": 1.9762719469790736e-05,
"loss": 1.3201,
"step": 19060
},
{
"epoch": 0.07882950381229732,
"grad_norm": 3.1064094109546385,
"learning_rate": 1.9762435326086446e-05,
"loss": 1.2524,
"step": 19070
},
{
"epoch": 0.07887084073091939,
"grad_norm": 2.8255870735206687,
"learning_rate": 1.976215101439788e-05,
"loss": 1.2982,
"step": 19080
},
{
"epoch": 0.07891217764954148,
"grad_norm": 3.041236956663884,
"learning_rate": 1.9761866534729926e-05,
"loss": 1.2784,
"step": 19090
},
{
"epoch": 0.07895351456816355,
"grad_norm": 3.0762050459561032,
"learning_rate": 1.976158188708748e-05,
"loss": 1.2615,
"step": 19100
},
{
"epoch": 0.07899485148678562,
"grad_norm": 4.057630290231405,
"learning_rate": 1.976129707147544e-05,
"loss": 1.2956,
"step": 19110
},
{
"epoch": 0.0790361884054077,
"grad_norm": 3.530641336009131,
"learning_rate": 1.976101208789871e-05,
"loss": 1.3015,
"step": 19120
},
{
"epoch": 0.07907752532402977,
"grad_norm": 3.457139153653439,
"learning_rate": 1.976072693636219e-05,
"loss": 1.3007,
"step": 19130
},
{
"epoch": 0.07911886224265184,
"grad_norm": 4.247767055366308,
"learning_rate": 1.9760441616870785e-05,
"loss": 1.3284,
"step": 19140
},
{
"epoch": 0.07916019916127393,
"grad_norm": 3.4313340038672995,
"learning_rate": 1.976015612942941e-05,
"loss": 1.3064,
"step": 19150
},
{
"epoch": 0.079201536079896,
"grad_norm": 3.908560338711229,
"learning_rate": 1.9759870474042973e-05,
"loss": 1.3116,
"step": 19160
},
{
"epoch": 0.07924287299851807,
"grad_norm": 3.745765592110947,
"learning_rate": 1.9759584650716395e-05,
"loss": 1.3737,
"step": 19170
},
{
"epoch": 0.07928420991714015,
"grad_norm": 3.078986622432472,
"learning_rate": 1.9759298659454588e-05,
"loss": 1.2788,
"step": 19180
},
{
"epoch": 0.07932554683576222,
"grad_norm": 2.987786079650764,
"learning_rate": 1.9759012500262474e-05,
"loss": 1.2834,
"step": 19190
},
{
"epoch": 0.07936688375438429,
"grad_norm": 3.1036456287250673,
"learning_rate": 1.975872617314498e-05,
"loss": 1.2831,
"step": 19200
},
{
"epoch": 0.07940822067300637,
"grad_norm": 3.865690690229234,
"learning_rate": 1.9758439678107033e-05,
"loss": 1.2922,
"step": 19210
},
{
"epoch": 0.07944955759162844,
"grad_norm": 3.3185016462394588,
"learning_rate": 1.9758153015153553e-05,
"loss": 1.3349,
"step": 19220
},
{
"epoch": 0.07949089451025053,
"grad_norm": 2.971387000152931,
"learning_rate": 1.975786618428949e-05,
"loss": 1.2411,
"step": 19230
},
{
"epoch": 0.0795322314288726,
"grad_norm": 3.1868412483457065,
"learning_rate": 1.9757579185519766e-05,
"loss": 1.3152,
"step": 19240
},
{
"epoch": 0.07957356834749467,
"grad_norm": 3.573887753524877,
"learning_rate": 1.9757292018849322e-05,
"loss": 1.32,
"step": 19250
},
{
"epoch": 0.07961490526611675,
"grad_norm": 2.6269840263774347,
"learning_rate": 1.9757004684283107e-05,
"loss": 1.3123,
"step": 19260
},
{
"epoch": 0.07965624218473882,
"grad_norm": 3.1839995568616546,
"learning_rate": 1.9756717181826054e-05,
"loss": 1.3305,
"step": 19270
},
{
"epoch": 0.0796975791033609,
"grad_norm": 3.5732944308856474,
"learning_rate": 1.9756429511483117e-05,
"loss": 1.298,
"step": 19280
},
{
"epoch": 0.07973891602198298,
"grad_norm": 3.612052010659264,
"learning_rate": 1.9756141673259247e-05,
"loss": 1.2797,
"step": 19290
},
{
"epoch": 0.07978025294060505,
"grad_norm": 3.129559274239735,
"learning_rate": 1.9755853667159392e-05,
"loss": 1.3242,
"step": 19300
},
{
"epoch": 0.07982158985922712,
"grad_norm": 3.2804938014945915,
"learning_rate": 1.9755565493188507e-05,
"loss": 1.2882,
"step": 19310
},
{
"epoch": 0.0798629267778492,
"grad_norm": 3.5207612414520444,
"learning_rate": 1.9755277151351558e-05,
"loss": 1.3292,
"step": 19320
},
{
"epoch": 0.07990426369647127,
"grad_norm": 3.2901806629006356,
"learning_rate": 1.9754988641653502e-05,
"loss": 1.2829,
"step": 19330
},
{
"epoch": 0.07994560061509334,
"grad_norm": 3.1031625175692876,
"learning_rate": 1.97546999640993e-05,
"loss": 1.2952,
"step": 19340
},
{
"epoch": 0.07998693753371543,
"grad_norm": 3.1014549071869606,
"learning_rate": 1.975441111869393e-05,
"loss": 1.2609,
"step": 19350
},
{
"epoch": 0.0800282744523375,
"grad_norm": 2.9394216851114217,
"learning_rate": 1.975412210544235e-05,
"loss": 1.2948,
"step": 19360
},
{
"epoch": 0.08006961137095958,
"grad_norm": 3.0263675235321545,
"learning_rate": 1.975383292434954e-05,
"loss": 1.3069,
"step": 19370
},
{
"epoch": 0.08011094828958165,
"grad_norm": 3.0848357434937124,
"learning_rate": 1.9753543575420477e-05,
"loss": 1.2747,
"step": 19380
},
{
"epoch": 0.08015228520820372,
"grad_norm": 2.9964068973521774,
"learning_rate": 1.9753254058660132e-05,
"loss": 1.3225,
"step": 19390
},
{
"epoch": 0.0801936221268258,
"grad_norm": 3.2536839847485224,
"learning_rate": 1.9752964374073494e-05,
"loss": 1.3448,
"step": 19400
},
{
"epoch": 0.08023495904544788,
"grad_norm": 3.37952149059762,
"learning_rate": 1.9752674521665546e-05,
"loss": 1.2845,
"step": 19410
},
{
"epoch": 0.08027629596406995,
"grad_norm": 3.0635418759023687,
"learning_rate": 1.9752384501441276e-05,
"loss": 1.3123,
"step": 19420
},
{
"epoch": 0.08031763288269203,
"grad_norm": 3.298468467101726,
"learning_rate": 1.9752094313405674e-05,
"loss": 1.2986,
"step": 19430
},
{
"epoch": 0.0803589698013141,
"grad_norm": 3.2473420562854507,
"learning_rate": 1.9751803957563735e-05,
"loss": 1.327,
"step": 19440
},
{
"epoch": 0.08040030671993617,
"grad_norm": 3.0843171834880483,
"learning_rate": 1.975151343392045e-05,
"loss": 1.304,
"step": 19450
},
{
"epoch": 0.08044164363855826,
"grad_norm": 3.0717834263814856,
"learning_rate": 1.9751222742480823e-05,
"loss": 1.3133,
"step": 19460
},
{
"epoch": 0.08048298055718033,
"grad_norm": 2.9934271457981603,
"learning_rate": 1.9750931883249852e-05,
"loss": 1.2674,
"step": 19470
},
{
"epoch": 0.0805243174758024,
"grad_norm": 3.1064293377071843,
"learning_rate": 1.9750640856232548e-05,
"loss": 1.2917,
"step": 19480
},
{
"epoch": 0.08056565439442448,
"grad_norm": 2.980738543973593,
"learning_rate": 1.975034966143391e-05,
"loss": 1.2978,
"step": 19490
},
{
"epoch": 0.08060699131304655,
"grad_norm": 3.383580434705728,
"learning_rate": 1.975005829885896e-05,
"loss": 1.2603,
"step": 19500
},
{
"epoch": 0.08064832823166862,
"grad_norm": 3.2830909352425564,
"learning_rate": 1.97497667685127e-05,
"loss": 1.3043,
"step": 19510
},
{
"epoch": 0.0806896651502907,
"grad_norm": 3.3969068546862036,
"learning_rate": 1.9749475070400157e-05,
"loss": 1.3338,
"step": 19520
},
{
"epoch": 0.08073100206891277,
"grad_norm": 3.0880094364724773,
"learning_rate": 1.974918320452634e-05,
"loss": 1.2351,
"step": 19530
},
{
"epoch": 0.08077233898753486,
"grad_norm": 3.4417279889999564,
"learning_rate": 1.974889117089628e-05,
"loss": 1.264,
"step": 19540
},
{
"epoch": 0.08081367590615693,
"grad_norm": 2.904756528303193,
"learning_rate": 1.9748598969514993e-05,
"loss": 1.2647,
"step": 19550
},
{
"epoch": 0.080855012824779,
"grad_norm": 2.7148091448555722,
"learning_rate": 1.9748306600387516e-05,
"loss": 1.2989,
"step": 19560
},
{
"epoch": 0.08089634974340108,
"grad_norm": 3.651161572746769,
"learning_rate": 1.9748014063518875e-05,
"loss": 1.3161,
"step": 19570
},
{
"epoch": 0.08093768666202315,
"grad_norm": 3.1157328327095866,
"learning_rate": 1.9747721358914106e-05,
"loss": 1.2713,
"step": 19580
},
{
"epoch": 0.08097902358064522,
"grad_norm": 3.131352709584711,
"learning_rate": 1.9747428486578243e-05,
"loss": 1.2904,
"step": 19590
},
{
"epoch": 0.08102036049926731,
"grad_norm": 2.942079614415163,
"learning_rate": 1.9747135446516327e-05,
"loss": 1.2857,
"step": 19600
},
{
"epoch": 0.08106169741788938,
"grad_norm": 2.8838500418296955,
"learning_rate": 1.9746842238733404e-05,
"loss": 1.3162,
"step": 19610
},
{
"epoch": 0.08110303433651145,
"grad_norm": 3.62285581087724,
"learning_rate": 1.9746548863234512e-05,
"loss": 1.3105,
"step": 19620
},
{
"epoch": 0.08114437125513353,
"grad_norm": 2.8759608727278927,
"learning_rate": 1.9746255320024702e-05,
"loss": 1.2757,
"step": 19630
},
{
"epoch": 0.0811857081737556,
"grad_norm": 3.2868321525430377,
"learning_rate": 1.974596160910903e-05,
"loss": 1.2808,
"step": 19640
},
{
"epoch": 0.08122704509237767,
"grad_norm": 2.9255021662463,
"learning_rate": 1.9745667730492543e-05,
"loss": 1.2982,
"step": 19650
},
{
"epoch": 0.08126838201099976,
"grad_norm": 3.6734905665007336,
"learning_rate": 1.97453736841803e-05,
"loss": 1.3469,
"step": 19660
},
{
"epoch": 0.08130971892962183,
"grad_norm": 2.9210368580920294,
"learning_rate": 1.974507947017736e-05,
"loss": 1.2624,
"step": 19670
},
{
"epoch": 0.08135105584824391,
"grad_norm": 3.5155014430588514,
"learning_rate": 1.974478508848879e-05,
"loss": 1.3032,
"step": 19680
},
{
"epoch": 0.08139239276686598,
"grad_norm": 3.1972640975511966,
"learning_rate": 1.9744490539119652e-05,
"loss": 1.2663,
"step": 19690
},
{
"epoch": 0.08143372968548805,
"grad_norm": 3.137196038716376,
"learning_rate": 1.9744195822075016e-05,
"loss": 1.2599,
"step": 19700
},
{
"epoch": 0.08147506660411014,
"grad_norm": 3.429683582295254,
"learning_rate": 1.974390093735995e-05,
"loss": 1.3125,
"step": 19710
},
{
"epoch": 0.0815164035227322,
"grad_norm": 3.4935153839614213,
"learning_rate": 1.974360588497953e-05,
"loss": 1.3021,
"step": 19720
},
{
"epoch": 0.08155774044135428,
"grad_norm": 3.693677622986634,
"learning_rate": 1.9743310664938836e-05,
"loss": 1.3154,
"step": 19730
},
{
"epoch": 0.08159907735997636,
"grad_norm": 3.1960119762782995,
"learning_rate": 1.9743015277242942e-05,
"loss": 1.2931,
"step": 19740
},
{
"epoch": 0.08164041427859843,
"grad_norm": 3.362501999299266,
"learning_rate": 1.9742719721896936e-05,
"loss": 1.2977,
"step": 19750
},
{
"epoch": 0.0816817511972205,
"grad_norm": 3.152254479818853,
"learning_rate": 1.97424239989059e-05,
"loss": 1.2571,
"step": 19760
},
{
"epoch": 0.08172308811584258,
"grad_norm": 3.78958825416846,
"learning_rate": 1.9742128108274926e-05,
"loss": 1.2903,
"step": 19770
},
{
"epoch": 0.08176442503446466,
"grad_norm": 3.2123145951877077,
"learning_rate": 1.9741832050009102e-05,
"loss": 1.282,
"step": 19780
},
{
"epoch": 0.08180576195308673,
"grad_norm": 3.1167331240538076,
"learning_rate": 1.9741535824113526e-05,
"loss": 1.2552,
"step": 19790
},
{
"epoch": 0.08184709887170881,
"grad_norm": 4.206288270940374,
"learning_rate": 1.974123943059329e-05,
"loss": 1.2597,
"step": 19800
},
{
"epoch": 0.08188843579033088,
"grad_norm": 3.553992295271698,
"learning_rate": 1.9740942869453504e-05,
"loss": 1.2908,
"step": 19810
},
{
"epoch": 0.08192977270895295,
"grad_norm": 3.521907001370512,
"learning_rate": 1.974064614069926e-05,
"loss": 1.297,
"step": 19820
},
{
"epoch": 0.08197110962757503,
"grad_norm": 3.360470421890721,
"learning_rate": 1.9740349244335665e-05,
"loss": 1.2882,
"step": 19830
},
{
"epoch": 0.0820124465461971,
"grad_norm": 2.9432002594333464,
"learning_rate": 1.9740052180367836e-05,
"loss": 1.252,
"step": 19840
},
{
"epoch": 0.08205378346481919,
"grad_norm": 3.252199526734451,
"learning_rate": 1.9739754948800874e-05,
"loss": 1.2805,
"step": 19850
},
{
"epoch": 0.08209512038344126,
"grad_norm": 3.1871490405890586,
"learning_rate": 1.9739457549639905e-05,
"loss": 1.2697,
"step": 19860
},
{
"epoch": 0.08213645730206333,
"grad_norm": 3.1800803475250166,
"learning_rate": 1.973915998289004e-05,
"loss": 1.3342,
"step": 19870
},
{
"epoch": 0.08217779422068541,
"grad_norm": 2.8783537685939478,
"learning_rate": 1.9738862248556395e-05,
"loss": 1.2471,
"step": 19880
},
{
"epoch": 0.08221913113930748,
"grad_norm": 2.7100348444687246,
"learning_rate": 1.9738564346644103e-05,
"loss": 1.2827,
"step": 19890
},
{
"epoch": 0.08226046805792955,
"grad_norm": 3.509524916127054,
"learning_rate": 1.973826627715828e-05,
"loss": 1.3025,
"step": 19900
},
{
"epoch": 0.08230180497655164,
"grad_norm": 3.3725272601095795,
"learning_rate": 1.9737968040104065e-05,
"loss": 1.3234,
"step": 19910
},
{
"epoch": 0.08234314189517371,
"grad_norm": 3.341475977428283,
"learning_rate": 1.9737669635486585e-05,
"loss": 1.3203,
"step": 19920
},
{
"epoch": 0.08238447881379578,
"grad_norm": 3.185315976662971,
"learning_rate": 1.9737371063310972e-05,
"loss": 1.2828,
"step": 19930
},
{
"epoch": 0.08242581573241786,
"grad_norm": 3.6059578502827945,
"learning_rate": 1.9737072323582366e-05,
"loss": 1.3272,
"step": 19940
},
{
"epoch": 0.08246715265103993,
"grad_norm": 3.486164979637442,
"learning_rate": 1.973677341630591e-05,
"loss": 1.2619,
"step": 19950
},
{
"epoch": 0.082508489569662,
"grad_norm": 3.3788628785271935,
"learning_rate": 1.9736474341486742e-05,
"loss": 1.2866,
"step": 19960
},
{
"epoch": 0.08254982648828409,
"grad_norm": 3.116863275035675,
"learning_rate": 1.973617509913001e-05,
"loss": 1.291,
"step": 19970
},
{
"epoch": 0.08259116340690616,
"grad_norm": 2.7964040776322796,
"learning_rate": 1.973587568924087e-05,
"loss": 1.2725,
"step": 19980
},
{
"epoch": 0.08263250032552824,
"grad_norm": 2.797740269032973,
"learning_rate": 1.9735576111824465e-05,
"loss": 1.2742,
"step": 19990
},
{
"epoch": 0.08267383724415031,
"grad_norm": 3.2432141472722162,
"learning_rate": 1.9735276366885956e-05,
"loss": 1.2947,
"step": 20000
},
{
"epoch": 0.08267383724415031,
"eval_loss": 1.5713279247283936,
"eval_runtime": 392.3898,
"eval_samples_per_second": 10.439,
"eval_steps_per_second": 2.61,
"step": 20000
},
{
"epoch": 0.08267797093601252,
"step": 20001,
"total_flos": 0.0,
"train_loss": 6.551032936291113e-05,
"train_runtime": 86.8457,
"train_samples_per_second": 14738.777,
"train_steps_per_second": 230.293
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}