e-zorzi's picture
Add files using upload-large-folder tool
4d172a3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"grad_norm": 1.3594739437103271,
"learning_rate": 9e-07,
"loss": 1.1913,
"step": 10
},
{
"grad_norm": 1.0572824478149414,
"learning_rate": 1.9e-06,
"loss": 1.1841,
"step": 20
},
{
"grad_norm": 0.5717663764953613,
"learning_rate": 2.9e-06,
"loss": 1.1508,
"step": 30
},
{
"grad_norm": 0.3898443877696991,
"learning_rate": 3.9e-06,
"loss": 1.1205,
"step": 40
},
{
"grad_norm": 0.28664326667785645,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.0888,
"step": 50
},
{
"grad_norm": 0.1729290783405304,
"learning_rate": 5.9e-06,
"loss": 1.0782,
"step": 60
},
{
"grad_norm": 0.17002208530902863,
"learning_rate": 6.900000000000001e-06,
"loss": 1.0691,
"step": 70
},
{
"grad_norm": 0.2152942717075348,
"learning_rate": 7.9e-06,
"loss": 1.0562,
"step": 80
},
{
"grad_norm": 0.19103780388832092,
"learning_rate": 8.9e-06,
"loss": 1.0479,
"step": 90
},
{
"grad_norm": 0.3243984878063202,
"learning_rate": 9.900000000000002e-06,
"loss": 1.0372,
"step": 100
},
{
"grad_norm": 0.1820673942565918,
"learning_rate": 1.09e-05,
"loss": 1.0272,
"step": 110
},
{
"grad_norm": 0.21819084882736206,
"learning_rate": 1.19e-05,
"loss": 1.0236,
"step": 120
},
{
"grad_norm": 0.20377595722675323,
"learning_rate": 1.29e-05,
"loss": 1.0237,
"step": 130
},
{
"grad_norm": 0.20572194457054138,
"learning_rate": 1.3900000000000002e-05,
"loss": 1.0228,
"step": 140
},
{
"grad_norm": 0.20157840847969055,
"learning_rate": 1.49e-05,
"loss": 1.0217,
"step": 150
},
{
"grad_norm": 0.23459017276763916,
"learning_rate": 1.59e-05,
"loss": 1.0192,
"step": 160
},
{
"grad_norm": 0.32469043135643005,
"learning_rate": 1.69e-05,
"loss": 1.0063,
"step": 170
},
{
"grad_norm": 0.36008527874946594,
"learning_rate": 1.79e-05,
"loss": 0.9873,
"step": 180
},
{
"grad_norm": 0.5633573532104492,
"learning_rate": 1.8900000000000002e-05,
"loss": 0.9672,
"step": 190
},
{
"grad_norm": 0.7019369006156921,
"learning_rate": 1.9900000000000003e-05,
"loss": 0.9315,
"step": 200
},
{
"grad_norm": 0.5538105964660645,
"learning_rate": 2.09e-05,
"loss": 0.8958,
"step": 210
},
{
"grad_norm": 0.5306029319763184,
"learning_rate": 2.19e-05,
"loss": 0.8707,
"step": 220
},
{
"grad_norm": 0.6606974005699158,
"learning_rate": 2.29e-05,
"loss": 0.8479,
"step": 230
},
{
"grad_norm": 0.8058410882949829,
"learning_rate": 2.39e-05,
"loss": 0.8169,
"step": 240
},
{
"grad_norm": 0.7277475595474243,
"learning_rate": 2.4900000000000002e-05,
"loss": 0.77,
"step": 250
},
{
"grad_norm": 0.6617355942726135,
"learning_rate": 2.5900000000000003e-05,
"loss": 0.7456,
"step": 260
},
{
"grad_norm": 0.8156651258468628,
"learning_rate": 2.6900000000000003e-05,
"loss": 0.6984,
"step": 270
},
{
"grad_norm": 0.7090954780578613,
"learning_rate": 2.7900000000000004e-05,
"loss": 0.6774,
"step": 280
},
{
"grad_norm": 0.8667084574699402,
"learning_rate": 2.8899999999999998e-05,
"loss": 0.6429,
"step": 290
},
{
"grad_norm": 0.946596622467041,
"learning_rate": 2.9900000000000002e-05,
"loss": 0.6052,
"step": 300
},
{
"grad_norm": 0.8120863437652588,
"learning_rate": 3.09e-05,
"loss": 0.5681,
"step": 310
},
{
"grad_norm": 0.9630921483039856,
"learning_rate": 3.19e-05,
"loss": 0.5267,
"step": 320
},
{
"grad_norm": 0.9185823798179626,
"learning_rate": 3.29e-05,
"loss": 0.497,
"step": 330
},
{
"grad_norm": 0.9909350872039795,
"learning_rate": 3.3900000000000004e-05,
"loss": 0.4704,
"step": 340
},
{
"grad_norm": 0.7408623695373535,
"learning_rate": 3.49e-05,
"loss": 0.4463,
"step": 350
},
{
"grad_norm": 0.8417967557907104,
"learning_rate": 3.59e-05,
"loss": 0.4515,
"step": 360
},
{
"grad_norm": 0.9200495481491089,
"learning_rate": 3.69e-05,
"loss": 0.417,
"step": 370
},
{
"grad_norm": 1.146302342414856,
"learning_rate": 3.79e-05,
"loss": 0.3937,
"step": 380
},
{
"grad_norm": 1.0057293176651,
"learning_rate": 3.8900000000000004e-05,
"loss": 0.3773,
"step": 390
},
{
"grad_norm": 1.112216591835022,
"learning_rate": 3.99e-05,
"loss": 0.348,
"step": 400
},
{
"grad_norm": 1.0176512002944946,
"learning_rate": 4.09e-05,
"loss": 0.3392,
"step": 410
},
{
"grad_norm": 1.0310163497924805,
"learning_rate": 4.19e-05,
"loss": 0.3065,
"step": 420
},
{
"grad_norm": 1.022374153137207,
"learning_rate": 4.29e-05,
"loss": 0.2808,
"step": 430
},
{
"grad_norm": 1.368080735206604,
"learning_rate": 4.39e-05,
"loss": 0.2624,
"step": 440
},
{
"grad_norm": 1.1092591285705566,
"learning_rate": 4.49e-05,
"loss": 0.2405,
"step": 450
},
{
"grad_norm": 0.9738430380821228,
"learning_rate": 4.5900000000000004e-05,
"loss": 0.2254,
"step": 460
},
{
"grad_norm": 1.033246636390686,
"learning_rate": 4.69e-05,
"loss": 0.2162,
"step": 470
},
{
"grad_norm": 0.9855560064315796,
"learning_rate": 4.79e-05,
"loss": 0.2088,
"step": 480
},
{
"grad_norm": 1.0313360691070557,
"learning_rate": 4.89e-05,
"loss": 0.2188,
"step": 490
},
{
"grad_norm": 1.100176215171814,
"learning_rate": 4.99e-05,
"loss": 0.2007,
"step": 500
},
{
"grad_norm": 1.0784265995025635,
"learning_rate": 5.0900000000000004e-05,
"loss": 0.2016,
"step": 510
},
{
"grad_norm": 1.0822303295135498,
"learning_rate": 5.19e-05,
"loss": 0.1961,
"step": 520
},
{
"grad_norm": 1.067589282989502,
"learning_rate": 5.2900000000000005e-05,
"loss": 0.1801,
"step": 530
},
{
"grad_norm": 1.1917147636413574,
"learning_rate": 5.390000000000001e-05,
"loss": 0.1705,
"step": 540
},
{
"grad_norm": 1.3141072988510132,
"learning_rate": 5.4900000000000006e-05,
"loss": 0.1851,
"step": 550
},
{
"grad_norm": 1.002855658531189,
"learning_rate": 5.590000000000001e-05,
"loss": 0.1663,
"step": 560
},
{
"grad_norm": 1.167011022567749,
"learning_rate": 5.69e-05,
"loss": 0.1741,
"step": 570
},
{
"grad_norm": 1.0936863422393799,
"learning_rate": 5.79e-05,
"loss": 0.1661,
"step": 580
},
{
"grad_norm": 0.9669778347015381,
"learning_rate": 5.89e-05,
"loss": 0.1648,
"step": 590
},
{
"grad_norm": 0.9405611753463745,
"learning_rate": 5.99e-05,
"loss": 0.1627,
"step": 600
},
{
"grad_norm": 1.0284767150878906,
"learning_rate": 6.09e-05,
"loss": 0.1496,
"step": 610
},
{
"grad_norm": 1.1097605228424072,
"learning_rate": 6.19e-05,
"loss": 0.1628,
"step": 620
},
{
"grad_norm": 0.9104214310646057,
"learning_rate": 6.29e-05,
"loss": 0.1302,
"step": 630
},
{
"grad_norm": 0.8578998446464539,
"learning_rate": 6.390000000000001e-05,
"loss": 0.1326,
"step": 640
},
{
"grad_norm": 1.1287304162979126,
"learning_rate": 6.49e-05,
"loss": 0.1127,
"step": 650
},
{
"grad_norm": 0.8655268549919128,
"learning_rate": 6.59e-05,
"loss": 0.1202,
"step": 660
},
{
"grad_norm": 0.9937160015106201,
"learning_rate": 6.690000000000001e-05,
"loss": 0.1198,
"step": 670
},
{
"grad_norm": 0.9691420197486877,
"learning_rate": 6.790000000000001e-05,
"loss": 0.1096,
"step": 680
},
{
"grad_norm": 1.0945252180099487,
"learning_rate": 6.89e-05,
"loss": 0.105,
"step": 690
},
{
"grad_norm": 1.0388752222061157,
"learning_rate": 6.99e-05,
"loss": 0.1027,
"step": 700
},
{
"grad_norm": 0.881949245929718,
"learning_rate": 7.09e-05,
"loss": 0.1044,
"step": 710
},
{
"grad_norm": 0.8678519129753113,
"learning_rate": 7.19e-05,
"loss": 0.0842,
"step": 720
},
{
"grad_norm": 1.2314260005950928,
"learning_rate": 7.29e-05,
"loss": 0.0841,
"step": 730
},
{
"grad_norm": 0.7337191700935364,
"learning_rate": 7.390000000000001e-05,
"loss": 0.0771,
"step": 740
},
{
"grad_norm": 1.194354772567749,
"learning_rate": 7.49e-05,
"loss": 0.0791,
"step": 750
},
{
"grad_norm": 1.0703870058059692,
"learning_rate": 7.59e-05,
"loss": 0.0697,
"step": 760
},
{
"grad_norm": 0.9820927977561951,
"learning_rate": 7.69e-05,
"loss": 0.0798,
"step": 770
},
{
"grad_norm": 1.099042534828186,
"learning_rate": 7.790000000000001e-05,
"loss": 0.0736,
"step": 780
},
{
"grad_norm": 0.9056155681610107,
"learning_rate": 7.890000000000001e-05,
"loss": 0.0756,
"step": 790
},
{
"grad_norm": 0.8292648792266846,
"learning_rate": 7.99e-05,
"loss": 0.0796,
"step": 800
},
{
"grad_norm": 0.9507290720939636,
"learning_rate": 8.090000000000001e-05,
"loss": 0.0829,
"step": 810
},
{
"grad_norm": 0.9466397762298584,
"learning_rate": 8.19e-05,
"loss": 0.0688,
"step": 820
},
{
"grad_norm": 0.7956731915473938,
"learning_rate": 8.29e-05,
"loss": 0.0747,
"step": 830
},
{
"grad_norm": 0.7995853424072266,
"learning_rate": 8.39e-05,
"loss": 0.0634,
"step": 840
},
{
"grad_norm": 0.7665478587150574,
"learning_rate": 8.49e-05,
"loss": 0.0661,
"step": 850
},
{
"grad_norm": 0.9283880591392517,
"learning_rate": 8.59e-05,
"loss": 0.0702,
"step": 860
},
{
"grad_norm": 1.126967191696167,
"learning_rate": 8.69e-05,
"loss": 0.0716,
"step": 870
},
{
"grad_norm": 0.8662194609642029,
"learning_rate": 8.790000000000001e-05,
"loss": 0.0667,
"step": 880
},
{
"grad_norm": 0.9572857022285461,
"learning_rate": 8.89e-05,
"loss": 0.0791,
"step": 890
},
{
"grad_norm": 0.9036967158317566,
"learning_rate": 8.99e-05,
"loss": 0.0745,
"step": 900
},
{
"grad_norm": 0.7550048828125,
"learning_rate": 9.090000000000001e-05,
"loss": 0.0746,
"step": 910
},
{
"grad_norm": 0.9990408420562744,
"learning_rate": 9.190000000000001e-05,
"loss": 0.0648,
"step": 920
},
{
"grad_norm": 0.8286410570144653,
"learning_rate": 9.290000000000001e-05,
"loss": 0.0697,
"step": 930
},
{
"grad_norm": 0.9783310890197754,
"learning_rate": 9.39e-05,
"loss": 0.0749,
"step": 940
},
{
"grad_norm": 0.9899768233299255,
"learning_rate": 9.49e-05,
"loss": 0.0722,
"step": 950
},
{
"grad_norm": 0.7450554370880127,
"learning_rate": 9.59e-05,
"loss": 0.0599,
"step": 960
},
{
"grad_norm": 0.7791635394096375,
"learning_rate": 9.69e-05,
"loss": 0.0654,
"step": 970
},
{
"grad_norm": 0.7614015340805054,
"learning_rate": 9.790000000000001e-05,
"loss": 0.0558,
"step": 980
},
{
"grad_norm": 0.9096309542655945,
"learning_rate": 9.89e-05,
"loss": 0.0581,
"step": 990
},
{
"grad_norm": 0.668950080871582,
"learning_rate": 9.99e-05,
"loss": 0.0652,
"step": 1000
},
{
"grad_norm": 0.8658283948898315,
"learning_rate": 9.999994463727085e-05,
"loss": 0.0529,
"step": 1010
},
{
"grad_norm": 0.7495288848876953,
"learning_rate": 9.999975326009292e-05,
"loss": 0.059,
"step": 1020
},
{
"grad_norm": 0.9980189204216003,
"learning_rate": 9.999942518549879e-05,
"loss": 0.0638,
"step": 1030
},
{
"grad_norm": 0.7826606035232544,
"learning_rate": 9.999896041438544e-05,
"loss": 0.0546,
"step": 1040
},
{
"grad_norm": 0.6360778212547302,
"learning_rate": 9.999835894802353e-05,
"loss": 0.054,
"step": 1050
},
{
"grad_norm": 0.7757160067558289,
"learning_rate": 9.999762078805743e-05,
"loss": 0.0591,
"step": 1060
},
{
"grad_norm": 0.7390689849853516,
"learning_rate": 9.999674593650526e-05,
"loss": 0.0595,
"step": 1070
},
{
"grad_norm": 0.6460424065589905,
"learning_rate": 9.99957343957588e-05,
"loss": 0.0658,
"step": 1080
},
{
"grad_norm": 0.8082983493804932,
"learning_rate": 9.99945861685836e-05,
"loss": 0.0596,
"step": 1090
},
{
"grad_norm": 0.7415626645088196,
"learning_rate": 9.999330125811884e-05,
"loss": 0.0483,
"step": 1100
},
{
"grad_norm": 0.8829818367958069,
"learning_rate": 9.999187966787744e-05,
"loss": 0.0619,
"step": 1110
},
{
"grad_norm": 0.8239393830299377,
"learning_rate": 9.999032140174595e-05,
"loss": 0.0528,
"step": 1120
},
{
"grad_norm": 0.8529507517814636,
"learning_rate": 9.998862646398464e-05,
"loss": 0.0654,
"step": 1130
},
{
"grad_norm": 0.7502208948135376,
"learning_rate": 9.998679485922739e-05,
"loss": 0.0526,
"step": 1140
},
{
"grad_norm": 0.6970030069351196,
"learning_rate": 9.998482659248174e-05,
"loss": 0.0547,
"step": 1150
},
{
"grad_norm": 0.9376399517059326,
"learning_rate": 9.998272166912883e-05,
"loss": 0.0557,
"step": 1160
},
{
"grad_norm": 0.7249330282211304,
"learning_rate": 9.998048009492347e-05,
"loss": 0.0504,
"step": 1170
},
{
"grad_norm": 0.8968970775604248,
"learning_rate": 9.997810187599403e-05,
"loss": 0.0526,
"step": 1180
},
{
"grad_norm": 0.7676458358764648,
"learning_rate": 9.997558701884249e-05,
"loss": 0.0506,
"step": 1190
},
{
"grad_norm": 0.6501711010932922,
"learning_rate": 9.997293553034433e-05,
"loss": 0.061,
"step": 1200
},
{
"grad_norm": 0.677116870880127,
"learning_rate": 9.997014741774866e-05,
"loss": 0.0462,
"step": 1210
},
{
"grad_norm": 0.8147766590118408,
"learning_rate": 9.996722268867803e-05,
"loss": 0.0486,
"step": 1220
},
{
"grad_norm": 0.706069827079773,
"learning_rate": 9.996416135112858e-05,
"loss": 0.0511,
"step": 1230
},
{
"grad_norm": 0.6159539818763733,
"learning_rate": 9.996096341346988e-05,
"loss": 0.0492,
"step": 1240
},
{
"grad_norm": 0.6369336843490601,
"learning_rate": 9.995762888444495e-05,
"loss": 0.0479,
"step": 1250
},
{
"grad_norm": 0.7543830275535583,
"learning_rate": 9.995415777317027e-05,
"loss": 0.0493,
"step": 1260
},
{
"grad_norm": 0.7505154609680176,
"learning_rate": 9.995055008913574e-05,
"loss": 0.053,
"step": 1270
},
{
"grad_norm": 0.5397493243217468,
"learning_rate": 9.994680584220463e-05,
"loss": 0.0432,
"step": 1280
},
{
"grad_norm": 0.6707198619842529,
"learning_rate": 9.994292504261355e-05,
"loss": 0.0472,
"step": 1290
},
{
"grad_norm": 0.8792182803153992,
"learning_rate": 9.993890770097247e-05,
"loss": 0.0453,
"step": 1300
},
{
"grad_norm": 0.7324561476707458,
"learning_rate": 9.993475382826467e-05,
"loss": 0.0479,
"step": 1310
},
{
"grad_norm": 0.8385289907455444,
"learning_rate": 9.993046343584664e-05,
"loss": 0.0549,
"step": 1320
},
{
"grad_norm": 0.5908923745155334,
"learning_rate": 9.992603653544816e-05,
"loss": 0.0483,
"step": 1330
},
{
"grad_norm": 0.63700932264328,
"learning_rate": 9.992147313917222e-05,
"loss": 0.0485,
"step": 1340
},
{
"grad_norm": 0.7525864839553833,
"learning_rate": 9.991677325949497e-05,
"loss": 0.0469,
"step": 1350
},
{
"grad_norm": 0.5628486275672913,
"learning_rate": 9.991193690926568e-05,
"loss": 0.0459,
"step": 1360
},
{
"grad_norm": 0.795554518699646,
"learning_rate": 9.990696410170678e-05,
"loss": 0.0467,
"step": 1370
},
{
"grad_norm": 0.7957155704498291,
"learning_rate": 9.990185485041371e-05,
"loss": 0.0481,
"step": 1380
},
{
"grad_norm": 0.5773254632949829,
"learning_rate": 9.989660916935498e-05,
"loss": 0.0471,
"step": 1390
},
{
"grad_norm": 0.6150880455970764,
"learning_rate": 9.989122707287208e-05,
"loss": 0.0426,
"step": 1400
},
{
"grad_norm": 0.7106145620346069,
"learning_rate": 9.988570857567945e-05,
"loss": 0.0537,
"step": 1410
},
{
"grad_norm": 0.9491516947746277,
"learning_rate": 9.988005369286446e-05,
"loss": 0.0525,
"step": 1420
},
{
"grad_norm": 0.6860232353210449,
"learning_rate": 9.987426243988734e-05,
"loss": 0.0429,
"step": 1430
},
{
"grad_norm": 0.7841853499412537,
"learning_rate": 9.986833483258114e-05,
"loss": 0.0524,
"step": 1440
},
{
"grad_norm": 0.6175568103790283,
"learning_rate": 9.986227088715173e-05,
"loss": 0.0385,
"step": 1450
},
{
"grad_norm": 0.5932314991950989,
"learning_rate": 9.98560706201777e-05,
"loss": 0.0408,
"step": 1460
},
{
"grad_norm": 0.7410153150558472,
"learning_rate": 9.984973404861036e-05,
"loss": 0.043,
"step": 1470
},
{
"grad_norm": 0.8330276608467102,
"learning_rate": 9.984326118977361e-05,
"loss": 0.051,
"step": 1480
},
{
"grad_norm": 0.7202706933021545,
"learning_rate": 9.983665206136406e-05,
"loss": 0.0493,
"step": 1490
},
{
"grad_norm": 0.574433445930481,
"learning_rate": 9.982990668145075e-05,
"loss": 0.0466,
"step": 1500
},
{
"grad_norm": 0.7351802587509155,
"learning_rate": 9.982302506847534e-05,
"loss": 0.057,
"step": 1510
},
{
"grad_norm": 0.819564163684845,
"learning_rate": 9.981600724125189e-05,
"loss": 0.0555,
"step": 1520
},
{
"grad_norm": 0.6065496206283569,
"learning_rate": 9.980885321896685e-05,
"loss": 0.0509,
"step": 1530
},
{
"grad_norm": 0.6572223901748657,
"learning_rate": 9.980156302117905e-05,
"loss": 0.044,
"step": 1540
},
{
"grad_norm": 0.6978927254676819,
"learning_rate": 9.979413666781963e-05,
"loss": 0.0465,
"step": 1550
},
{
"grad_norm": 0.5508580803871155,
"learning_rate": 9.978657417919193e-05,
"loss": 0.0452,
"step": 1560
},
{
"grad_norm": 0.5769541263580322,
"learning_rate": 9.977887557597153e-05,
"loss": 0.0475,
"step": 1570
},
{
"grad_norm": 0.5610742568969727,
"learning_rate": 9.97710408792061e-05,
"loss": 0.0469,
"step": 1580
},
{
"grad_norm": 0.5692776441574097,
"learning_rate": 9.976307011031542e-05,
"loss": 0.0449,
"step": 1590
},
{
"grad_norm": 0.5226185321807861,
"learning_rate": 9.975496329109126e-05,
"loss": 0.0476,
"step": 1600
},
{
"grad_norm": 0.7111744284629822,
"learning_rate": 9.974672044369732e-05,
"loss": 0.047,
"step": 1610
},
{
"grad_norm": 0.514858067035675,
"learning_rate": 9.97383415906693e-05,
"loss": 0.043,
"step": 1620
},
{
"grad_norm": 0.5856963396072388,
"learning_rate": 9.97298267549146e-05,
"loss": 0.0471,
"step": 1630
},
{
"grad_norm": 0.6191436052322388,
"learning_rate": 9.972117595971249e-05,
"loss": 0.0422,
"step": 1640
},
{
"grad_norm": 0.5670982599258423,
"learning_rate": 9.971238922871391e-05,
"loss": 0.0419,
"step": 1650
},
{
"grad_norm": 0.7190003991127014,
"learning_rate": 9.970346658594142e-05,
"loss": 0.0453,
"step": 1660
},
{
"grad_norm": 0.6552428007125854,
"learning_rate": 9.969440805578923e-05,
"loss": 0.046,
"step": 1670
},
{
"grad_norm": 0.578118622303009,
"learning_rate": 9.968521366302298e-05,
"loss": 0.0392,
"step": 1680
},
{
"grad_norm": 0.7054030895233154,
"learning_rate": 9.967588343277981e-05,
"loss": 0.0455,
"step": 1690
},
{
"grad_norm": 0.6531293392181396,
"learning_rate": 9.966641739056818e-05,
"loss": 0.0421,
"step": 1700
},
{
"grad_norm": 0.6111751198768616,
"learning_rate": 9.965681556226793e-05,
"loss": 0.0517,
"step": 1710
},
{
"grad_norm": 0.4928556978702545,
"learning_rate": 9.964707797413006e-05,
"loss": 0.044,
"step": 1720
},
{
"grad_norm": 0.6597058773040771,
"learning_rate": 9.963720465277679e-05,
"loss": 0.047,
"step": 1730
},
{
"grad_norm": 0.6202155351638794,
"learning_rate": 9.96271956252014e-05,
"loss": 0.0384,
"step": 1740
},
{
"grad_norm": 0.5262959599494934,
"learning_rate": 9.961705091876816e-05,
"loss": 0.0425,
"step": 1750
},
{
"grad_norm": 0.6935763955116272,
"learning_rate": 9.960677056121235e-05,
"loss": 0.0409,
"step": 1760
},
{
"grad_norm": 0.6149827837944031,
"learning_rate": 9.959635458064005e-05,
"loss": 0.0383,
"step": 1770
},
{
"grad_norm": 0.5901826024055481,
"learning_rate": 9.958580300552815e-05,
"loss": 0.0426,
"step": 1780
},
{
"grad_norm": 0.5597098469734192,
"learning_rate": 9.957511586472426e-05,
"loss": 0.0352,
"step": 1790
},
{
"grad_norm": 0.5581690073013306,
"learning_rate": 9.956429318744662e-05,
"loss": 0.0366,
"step": 1800
},
{
"grad_norm": 0.5969916582107544,
"learning_rate": 9.955333500328404e-05,
"loss": 0.0355,
"step": 1810
},
{
"grad_norm": 0.5474916696548462,
"learning_rate": 9.95422413421957e-05,
"loss": 0.0376,
"step": 1820
},
{
"grad_norm": 0.5651562809944153,
"learning_rate": 9.953101223451133e-05,
"loss": 0.0359,
"step": 1830
},
{
"grad_norm": 0.6243921518325806,
"learning_rate": 9.951964771093085e-05,
"loss": 0.0373,
"step": 1840
},
{
"grad_norm": 0.4624647796154022,
"learning_rate": 9.950814780252442e-05,
"loss": 0.0347,
"step": 1850
},
{
"grad_norm": 0.5893751382827759,
"learning_rate": 9.949651254073236e-05,
"loss": 0.0408,
"step": 1860
},
{
"grad_norm": 0.526287317276001,
"learning_rate": 9.948474195736504e-05,
"loss": 0.0388,
"step": 1870
},
{
"grad_norm": 0.6111840605735779,
"learning_rate": 9.947283608460277e-05,
"loss": 0.0346,
"step": 1880
},
{
"grad_norm": 0.46461328864097595,
"learning_rate": 9.946079495499577e-05,
"loss": 0.0411,
"step": 1890
},
{
"grad_norm": 0.610548734664917,
"learning_rate": 9.944861860146401e-05,
"loss": 0.0407,
"step": 1900
},
{
"grad_norm": 0.5339504480361938,
"learning_rate": 9.943630705729719e-05,
"loss": 0.0398,
"step": 1910
},
{
"grad_norm": 0.46559029817581177,
"learning_rate": 9.942386035615459e-05,
"loss": 0.039,
"step": 1920
},
{
"grad_norm": 0.7745798826217651,
"learning_rate": 9.941127853206503e-05,
"loss": 0.04,
"step": 1930
},
{
"grad_norm": 0.5811882019042969,
"learning_rate": 9.939856161942673e-05,
"loss": 0.0425,
"step": 1940
},
{
"grad_norm": 0.4856541156768799,
"learning_rate": 9.938570965300724e-05,
"loss": 0.0363,
"step": 1950
},
{
"grad_norm": 0.5952467918395996,
"learning_rate": 9.937272266794335e-05,
"loss": 0.0439,
"step": 1960
},
{
"grad_norm": 0.5669976472854614,
"learning_rate": 9.935960069974096e-05,
"loss": 0.05,
"step": 1970
},
{
"grad_norm": 0.5959198474884033,
"learning_rate": 9.934634378427506e-05,
"loss": 0.0382,
"step": 1980
},
{
"grad_norm": 0.520875096321106,
"learning_rate": 9.933295195778954e-05,
"loss": 0.0386,
"step": 1990
},
{
"grad_norm": 0.4351758360862732,
"learning_rate": 9.931942525689715e-05,
"loss": 0.0488,
"step": 2000
},
{
"grad_norm": 0.6345981359481812,
"learning_rate": 9.930576371857936e-05,
"loss": 0.0391,
"step": 2010
},
{
"grad_norm": 0.6230748295783997,
"learning_rate": 9.929196738018629e-05,
"loss": 0.0388,
"step": 2020
},
{
"grad_norm": 0.5425089001655579,
"learning_rate": 9.927803627943662e-05,
"loss": 0.0395,
"step": 2030
},
{
"grad_norm": 0.49332770705223083,
"learning_rate": 9.926397045441744e-05,
"loss": 0.039,
"step": 2040
},
{
"grad_norm": 0.6731558442115784,
"learning_rate": 9.924976994358417e-05,
"loss": 0.0427,
"step": 2050
},
{
"grad_norm": 0.5310463309288025,
"learning_rate": 9.923543478576048e-05,
"loss": 0.0474,
"step": 2060
},
{
"grad_norm": 0.548930823802948,
"learning_rate": 9.922096502013813e-05,
"loss": 0.0423,
"step": 2070
},
{
"grad_norm": 0.5744786262512207,
"learning_rate": 9.92063606862769e-05,
"loss": 0.0372,
"step": 2080
},
{
"grad_norm": 0.6390929222106934,
"learning_rate": 9.919162182410453e-05,
"loss": 0.0368,
"step": 2090
},
{
"grad_norm": 0.5252511501312256,
"learning_rate": 9.917674847391645e-05,
"loss": 0.038,
"step": 2100
},
{
"grad_norm": 0.5656434297561646,
"learning_rate": 9.916174067637584e-05,
"loss": 0.0333,
"step": 2110
},
{
"grad_norm": 0.5288258790969849,
"learning_rate": 9.914659847251348e-05,
"loss": 0.0406,
"step": 2120
},
{
"grad_norm": 0.5040147304534912,
"learning_rate": 9.913132190372753e-05,
"loss": 0.0369,
"step": 2130
},
{
"grad_norm": 0.5128138661384583,
"learning_rate": 9.911591101178359e-05,
"loss": 0.0368,
"step": 2140
},
{
"grad_norm": 0.4942684769630432,
"learning_rate": 9.910036583881443e-05,
"loss": 0.0334,
"step": 2150
},
{
"grad_norm": 0.5318565368652344,
"learning_rate": 9.908468642731995e-05,
"loss": 0.0325,
"step": 2160
},
{
"grad_norm": 0.5772367715835571,
"learning_rate": 9.906887282016707e-05,
"loss": 0.0344,
"step": 2170
},
{
"grad_norm": 0.5957911014556885,
"learning_rate": 9.90529250605896e-05,
"loss": 0.0368,
"step": 2180
},
{
"grad_norm": 0.6259480714797974,
"learning_rate": 9.903684319218809e-05,
"loss": 0.0375,
"step": 2190
},
{
"grad_norm": 0.691277801990509,
"learning_rate": 9.902062725892976e-05,
"loss": 0.0402,
"step": 2200
},
{
"grad_norm": 0.624859094619751,
"learning_rate": 9.900427730514834e-05,
"loss": 0.0316,
"step": 2210
},
{
"grad_norm": 0.46915674209594727,
"learning_rate": 9.8987793375544e-05,
"loss": 0.0352,
"step": 2220
},
{
"grad_norm": 0.5559591054916382,
"learning_rate": 9.897117551518318e-05,
"loss": 0.0353,
"step": 2230
},
{
"grad_norm": 0.47577548027038574,
"learning_rate": 9.895442376949844e-05,
"loss": 0.0395,
"step": 2240
},
{
"grad_norm": 0.7231595516204834,
"learning_rate": 9.893753818428845e-05,
"loss": 0.0442,
"step": 2250
},
{
"grad_norm": 0.4607575535774231,
"learning_rate": 9.892051880571773e-05,
"loss": 0.037,
"step": 2260
},
{
"grad_norm": 0.4901242256164551,
"learning_rate": 9.890336568031663e-05,
"loss": 0.0342,
"step": 2270
},
{
"grad_norm": 0.46413323283195496,
"learning_rate": 9.888607885498113e-05,
"loss": 0.0386,
"step": 2280
},
{
"grad_norm": 0.5028432607650757,
"learning_rate": 9.886865837697275e-05,
"loss": 0.0384,
"step": 2290
},
{
"grad_norm": 0.6079827547073364,
"learning_rate": 9.88511042939184e-05,
"loss": 0.0416,
"step": 2300
},
{
"grad_norm": 0.6189248561859131,
"learning_rate": 9.883341665381028e-05,
"loss": 0.0372,
"step": 2310
},
{
"grad_norm": 0.569456160068512,
"learning_rate": 9.881559550500575e-05,
"loss": 0.0317,
"step": 2320
},
{
"grad_norm": 0.5782006978988647,
"learning_rate": 9.879764089622712e-05,
"loss": 0.0363,
"step": 2330
},
{
"grad_norm": 0.6612024307250977,
"learning_rate": 9.87795528765616e-05,
"loss": 0.0386,
"step": 2340
},
{
"grad_norm": 0.45619797706604004,
"learning_rate": 9.876133149546118e-05,
"loss": 0.0385,
"step": 2350
},
{
"grad_norm": 0.4743977189064026,
"learning_rate": 9.874297680274238e-05,
"loss": 0.0384,
"step": 2360
},
{
"grad_norm": 0.5303918719291687,
"learning_rate": 9.872448884858624e-05,
"loss": 0.0364,
"step": 2370
},
{
"grad_norm": 0.5923212766647339,
"learning_rate": 9.870586768353815e-05,
"loss": 0.0366,
"step": 2380
},
{
"grad_norm": 0.5156052112579346,
"learning_rate": 9.868711335850764e-05,
"loss": 0.0412,
"step": 2390
},
{
"grad_norm": 0.4702778458595276,
"learning_rate": 9.866822592476833e-05,
"loss": 0.0353,
"step": 2400
},
{
"grad_norm": 0.4955006241798401,
"learning_rate": 9.86492054339577e-05,
"loss": 0.0356,
"step": 2410
},
{
"grad_norm": 0.4722374677658081,
"learning_rate": 9.863005193807711e-05,
"loss": 0.0328,
"step": 2420
},
{
"grad_norm": 0.5261074900627136,
"learning_rate": 9.861076548949143e-05,
"loss": 0.0314,
"step": 2430
},
{
"grad_norm": 0.43109720945358276,
"learning_rate": 9.859134614092912e-05,
"loss": 0.0306,
"step": 2440
},
{
"grad_norm": 0.5150691270828247,
"learning_rate": 9.857179394548191e-05,
"loss": 0.0331,
"step": 2450
},
{
"grad_norm": 0.413881778717041,
"learning_rate": 9.855210895660477e-05,
"loss": 0.0313,
"step": 2460
},
{
"grad_norm": 0.5778813362121582,
"learning_rate": 9.853229122811568e-05,
"loss": 0.0327,
"step": 2470
},
{
"grad_norm": 0.5499809980392456,
"learning_rate": 9.851234081419559e-05,
"loss": 0.0371,
"step": 2480
},
{
"grad_norm": 0.533755898475647,
"learning_rate": 9.849225776938814e-05,
"loss": 0.0347,
"step": 2490
},
{
"grad_norm": 0.5036794543266296,
"learning_rate": 9.847204214859964e-05,
"loss": 0.0365,
"step": 2500
},
{
"grad_norm": 0.4547636806964874,
"learning_rate": 9.845169400709879e-05,
"loss": 0.0284,
"step": 2510
},
{
"grad_norm": 0.4148177206516266,
"learning_rate": 9.843121340051664e-05,
"loss": 0.0338,
"step": 2520
},
{
"grad_norm": 0.4307814836502075,
"learning_rate": 9.841060038484641e-05,
"loss": 0.0401,
"step": 2530
},
{
"grad_norm": 0.5055217146873474,
"learning_rate": 9.838985501644328e-05,
"loss": 0.0413,
"step": 2540
},
{
"grad_norm": 0.5252987742424011,
"learning_rate": 9.83689773520243e-05,
"loss": 0.0334,
"step": 2550
},
{
"grad_norm": 0.5325053334236145,
"learning_rate": 9.834796744866819e-05,
"loss": 0.0339,
"step": 2560
},
{
"grad_norm": 0.5485632419586182,
"learning_rate": 9.832682536381525e-05,
"loss": 0.0354,
"step": 2570
},
{
"grad_norm": 0.5406777262687683,
"learning_rate": 9.830555115526711e-05,
"loss": 0.0368,
"step": 2580
},
{
"grad_norm": 0.37698280811309814,
"learning_rate": 9.828414488118667e-05,
"loss": 0.0336,
"step": 2590
},
{
"grad_norm": 0.5253736972808838,
"learning_rate": 9.826260660009785e-05,
"loss": 0.0337,
"step": 2600
},
{
"grad_norm": 0.482319176197052,
"learning_rate": 9.824093637088547e-05,
"loss": 0.0299,
"step": 2610
},
{
"grad_norm": 0.43845584988594055,
"learning_rate": 9.821913425279514e-05,
"loss": 0.032,
"step": 2620
},
{
"grad_norm": 0.4526597559452057,
"learning_rate": 9.8197200305433e-05,
"loss": 0.034,
"step": 2630
},
{
"grad_norm": 0.45589521527290344,
"learning_rate": 9.817513458876564e-05,
"loss": 0.0464,
"step": 2640
},
{
"grad_norm": 0.5381149649620056,
"learning_rate": 9.815293716311987e-05,
"loss": 0.0334,
"step": 2650
},
{
"grad_norm": 0.5279123187065125,
"learning_rate": 9.813060808918262e-05,
"loss": 0.0318,
"step": 2660
},
{
"grad_norm": 0.3532435894012451,
"learning_rate": 9.810814742800069e-05,
"loss": 0.0285,
"step": 2670
},
{
"grad_norm": 0.3765302896499634,
"learning_rate": 9.808555524098074e-05,
"loss": 0.0289,
"step": 2680
},
{
"grad_norm": 0.46037837862968445,
"learning_rate": 9.806283158988887e-05,
"loss": 0.0291,
"step": 2690
},
{
"grad_norm": 0.483735591173172,
"learning_rate": 9.803997653685072e-05,
"loss": 0.0392,
"step": 2700
},
{
"grad_norm": 0.45865148305892944,
"learning_rate": 9.801699014435112e-05,
"loss": 0.0393,
"step": 2710
},
{
"grad_norm": 0.4620376229286194,
"learning_rate": 9.799387247523398e-05,
"loss": 0.0352,
"step": 2720
},
{
"grad_norm": 0.41832435131073,
"learning_rate": 9.797062359270215e-05,
"loss": 0.0319,
"step": 2730
},
{
"grad_norm": 0.4439375400543213,
"learning_rate": 9.794724356031715e-05,
"loss": 0.0307,
"step": 2740
},
{
"grad_norm": 0.5037664771080017,
"learning_rate": 9.792373244199913e-05,
"loss": 0.0306,
"step": 2750
},
{
"grad_norm": 0.378164678812027,
"learning_rate": 9.790009030202658e-05,
"loss": 0.0313,
"step": 2760
},
{
"grad_norm": 0.5053073763847351,
"learning_rate": 9.78763172050362e-05,
"loss": 0.0295,
"step": 2770
},
{
"grad_norm": 0.4680381119251251,
"learning_rate": 9.785241321602274e-05,
"loss": 0.0277,
"step": 2780
},
{
"grad_norm": 0.4624013304710388,
"learning_rate": 9.782837840033879e-05,
"loss": 0.0288,
"step": 2790
},
{
"grad_norm": 0.5074241757392883,
"learning_rate": 9.780421282369461e-05,
"loss": 0.0292,
"step": 2800
},
{
"grad_norm": 0.4835506081581116,
"learning_rate": 9.777991655215797e-05,
"loss": 0.0294,
"step": 2810
},
{
"grad_norm": 0.5738292336463928,
"learning_rate": 9.775548965215394e-05,
"loss": 0.0295,
"step": 2820
},
{
"grad_norm": 0.5334445238113403,
"learning_rate": 9.773093219046474e-05,
"loss": 0.0293,
"step": 2830
},
{
"grad_norm": 0.4011390507221222,
"learning_rate": 9.770624423422954e-05,
"loss": 0.0291,
"step": 2840
},
{
"grad_norm": 0.41171419620513916,
"learning_rate": 9.768142585094426e-05,
"loss": 0.0302,
"step": 2850
},
{
"grad_norm": 0.46391263604164124,
"learning_rate": 9.765647710846142e-05,
"loss": 0.0405,
"step": 2860
},
{
"grad_norm": 0.5071845650672913,
"learning_rate": 9.763139807498991e-05,
"loss": 0.0285,
"step": 2870
},
{
"grad_norm": 0.4814237058162689,
"learning_rate": 9.760618881909487e-05,
"loss": 0.0317,
"step": 2880
},
{
"grad_norm": 0.5396919846534729,
"learning_rate": 9.758084940969744e-05,
"loss": 0.0316,
"step": 2890
},
{
"grad_norm": 0.5363779664039612,
"learning_rate": 9.755537991607459e-05,
"loss": 0.027,
"step": 2900
},
{
"grad_norm": 0.505138099193573,
"learning_rate": 9.752978040785895e-05,
"loss": 0.0354,
"step": 2910
},
{
"grad_norm": 0.5476271510124207,
"learning_rate": 9.750405095503859e-05,
"loss": 0.0299,
"step": 2920
},
{
"grad_norm": 0.5189036130905151,
"learning_rate": 9.747819162795686e-05,
"loss": 0.0331,
"step": 2930
},
{
"grad_norm": 0.45717042684555054,
"learning_rate": 9.745220249731217e-05,
"loss": 0.026,
"step": 2940
},
{
"grad_norm": 0.4337165355682373,
"learning_rate": 9.742608363415781e-05,
"loss": 0.0272,
"step": 2950
},
{
"grad_norm": 0.4811023771762848,
"learning_rate": 9.739983510990176e-05,
"loss": 0.0288,
"step": 2960
},
{
"grad_norm": 0.3455168902873993,
"learning_rate": 9.737345699630647e-05,
"loss": 0.0298,
"step": 2970
},
{
"grad_norm": 0.5057815313339233,
"learning_rate": 9.734694936548869e-05,
"loss": 0.0332,
"step": 2980
},
{
"grad_norm": 0.38619765639305115,
"learning_rate": 9.732031228991932e-05,
"loss": 0.0256,
"step": 2990
},
{
"grad_norm": 0.3297816514968872,
"learning_rate": 9.729354584242302e-05,
"loss": 0.0355,
"step": 3000
},
{
"grad_norm": 0.5174765586853027,
"learning_rate": 9.726665009617832e-05,
"loss": 0.0309,
"step": 3010
},
{
"grad_norm": 0.43245866894721985,
"learning_rate": 9.723962512471714e-05,
"loss": 0.033,
"step": 3020
},
{
"grad_norm": 0.516598105430603,
"learning_rate": 9.72124710019247e-05,
"loss": 0.03,
"step": 3030
},
{
"grad_norm": 0.48712822794914246,
"learning_rate": 9.718518780203934e-05,
"loss": 0.0322,
"step": 3040
},
{
"grad_norm": 0.3674415946006775,
"learning_rate": 9.715777559965228e-05,
"loss": 0.0319,
"step": 3050
},
{
"grad_norm": 0.4218079149723053,
"learning_rate": 9.713023446970746e-05,
"loss": 0.0255,
"step": 3060
},
{
"grad_norm": 0.4967867136001587,
"learning_rate": 9.710256448750126e-05,
"loss": 0.0311,
"step": 3070
},
{
"grad_norm": 0.497653067111969,
"learning_rate": 9.707476572868235e-05,
"loss": 0.0341,
"step": 3080
},
{
"grad_norm": 0.4222137928009033,
"learning_rate": 9.704683826925149e-05,
"loss": 0.0273,
"step": 3090
},
{
"grad_norm": 0.37705838680267334,
"learning_rate": 9.701878218556129e-05,
"loss": 0.036,
"step": 3100
},
{
"grad_norm": 0.5626199841499329,
"learning_rate": 9.699059755431598e-05,
"loss": 0.0331,
"step": 3110
},
{
"grad_norm": 0.46293774247169495,
"learning_rate": 9.696228445257132e-05,
"loss": 0.0277,
"step": 3120
},
{
"grad_norm": 0.42764750123023987,
"learning_rate": 9.693384295773419e-05,
"loss": 0.0327,
"step": 3130
},
{
"grad_norm": 0.4717363715171814,
"learning_rate": 9.690527314756259e-05,
"loss": 0.0339,
"step": 3140
},
{
"grad_norm": 0.458967387676239,
"learning_rate": 9.687657510016527e-05,
"loss": 0.0261,
"step": 3150
},
{
"grad_norm": 0.45871081948280334,
"learning_rate": 9.684774889400161e-05,
"loss": 0.0309,
"step": 3160
},
{
"grad_norm": 0.5132860541343689,
"learning_rate": 9.681879460788135e-05,
"loss": 0.0264,
"step": 3170
},
{
"grad_norm": 0.4729975461959839,
"learning_rate": 9.67897123209644e-05,
"loss": 0.0315,
"step": 3180
},
{
"grad_norm": 0.4921012818813324,
"learning_rate": 9.676050211276062e-05,
"loss": 0.035,
"step": 3190
},
{
"grad_norm": 0.4574073255062103,
"learning_rate": 9.673116406312962e-05,
"loss": 0.0284,
"step": 3200
},
{
"grad_norm": 0.48541590571403503,
"learning_rate": 9.67016982522805e-05,
"loss": 0.028,
"step": 3210
},
{
"grad_norm": 0.4924331307411194,
"learning_rate": 9.667210476077164e-05,
"loss": 0.028,
"step": 3220
},
{
"grad_norm": 0.5730510950088501,
"learning_rate": 9.664238366951055e-05,
"loss": 0.0288,
"step": 3230
},
{
"grad_norm": 0.5551027059555054,
"learning_rate": 9.661253505975355e-05,
"loss": 0.0269,
"step": 3240
},
{
"grad_norm": 0.4366356134414673,
"learning_rate": 9.658255901310557e-05,
"loss": 0.0301,
"step": 3250
},
{
"grad_norm": 0.5327138304710388,
"learning_rate": 9.655245561152e-05,
"loss": 0.0278,
"step": 3260
},
{
"grad_norm": 0.4516207277774811,
"learning_rate": 9.65222249372984e-05,
"loss": 0.0266,
"step": 3270
},
{
"grad_norm": 0.4709407687187195,
"learning_rate": 9.649186707309026e-05,
"loss": 0.0325,
"step": 3280
},
{
"grad_norm": 0.36673372983932495,
"learning_rate": 9.646138210189283e-05,
"loss": 0.0285,
"step": 3290
},
{
"grad_norm": 0.5308244824409485,
"learning_rate": 9.643077010705087e-05,
"loss": 0.0281,
"step": 3300
},
{
"grad_norm": 0.45568153262138367,
"learning_rate": 9.640003117225637e-05,
"loss": 0.0286,
"step": 3310
},
{
"grad_norm": 0.4082559049129486,
"learning_rate": 9.636916538154846e-05,
"loss": 0.0241,
"step": 3320
},
{
"grad_norm": 0.48012563586235046,
"learning_rate": 9.633817281931296e-05,
"loss": 0.0297,
"step": 3330
},
{
"grad_norm": 0.4177444875240326,
"learning_rate": 9.630705357028242e-05,
"loss": 0.032,
"step": 3340
},
{
"grad_norm": 0.48793429136276245,
"learning_rate": 9.627580771953563e-05,
"loss": 0.0285,
"step": 3350
},
{
"grad_norm": 0.4371464252471924,
"learning_rate": 9.624443535249759e-05,
"loss": 0.0275,
"step": 3360
},
{
"grad_norm": 0.4983312487602234,
"learning_rate": 9.621293655493913e-05,
"loss": 0.0254,
"step": 3370
},
{
"grad_norm": 0.5624396204948425,
"learning_rate": 9.618131141297675e-05,
"loss": 0.027,
"step": 3380
},
{
"grad_norm": 0.43570947647094727,
"learning_rate": 9.614956001307242e-05,
"loss": 0.0301,
"step": 3390
},
{
"grad_norm": 0.4448493719100952,
"learning_rate": 9.611768244203321e-05,
"loss": 0.0351,
"step": 3400
},
{
"grad_norm": 0.4213621914386749,
"learning_rate": 9.60856787870112e-05,
"loss": 0.0292,
"step": 3410
},
{
"grad_norm": 0.4154338836669922,
"learning_rate": 9.605354913550318e-05,
"loss": 0.0262,
"step": 3420
},
{
"grad_norm": 0.45102718472480774,
"learning_rate": 9.602129357535037e-05,
"loss": 0.0313,
"step": 3430
},
{
"grad_norm": 0.38145503401756287,
"learning_rate": 9.598891219473825e-05,
"loss": 0.027,
"step": 3440
},
{
"grad_norm": 0.41790488362312317,
"learning_rate": 9.595640508219625e-05,
"loss": 0.0291,
"step": 3450
},
{
"grad_norm": 0.4644753336906433,
"learning_rate": 9.592377232659761e-05,
"loss": 0.0249,
"step": 3460
},
{
"grad_norm": 0.4731713533401489,
"learning_rate": 9.589101401715904e-05,
"loss": 0.0263,
"step": 3470
},
{
"grad_norm": 0.42398542165756226,
"learning_rate": 9.585813024344045e-05,
"loss": 0.026,
"step": 3480
},
{
"grad_norm": 0.5419644117355347,
"learning_rate": 9.58251210953449e-05,
"loss": 0.0296,
"step": 3490
},
{
"grad_norm": 0.463670939207077,
"learning_rate": 9.579198666311809e-05,
"loss": 0.0238,
"step": 3500
},
{
"grad_norm": 0.39643239974975586,
"learning_rate": 9.575872703734832e-05,
"loss": 0.0292,
"step": 3510
},
{
"grad_norm": 0.3542700409889221,
"learning_rate": 9.572534230896611e-05,
"loss": 0.0231,
"step": 3520
},
{
"grad_norm": 0.43060752749443054,
"learning_rate": 9.569183256924403e-05,
"loss": 0.025,
"step": 3530
},
{
"grad_norm": 0.40233463048934937,
"learning_rate": 9.565819790979646e-05,
"loss": 0.0422,
"step": 3540
},
{
"grad_norm": 0.4497774839401245,
"learning_rate": 9.562443842257925e-05,
"loss": 0.029,
"step": 3550
},
{
"grad_norm": 0.5018470287322998,
"learning_rate": 9.559055419988956e-05,
"loss": 0.0283,
"step": 3560
},
{
"grad_norm": 0.47868454456329346,
"learning_rate": 9.555654533436557e-05,
"loss": 0.0349,
"step": 3570
},
{
"grad_norm": 0.4413691759109497,
"learning_rate": 9.552241191898621e-05,
"loss": 0.0238,
"step": 3580
},
{
"grad_norm": 0.40998080372810364,
"learning_rate": 9.548815404707092e-05,
"loss": 0.03,
"step": 3590
},
{
"grad_norm": 0.43824273347854614,
"learning_rate": 9.545377181227942e-05,
"loss": 0.0284,
"step": 3600
},
{
"grad_norm": 0.4570449888706207,
"learning_rate": 9.541926530861145e-05,
"loss": 0.0266,
"step": 3610
},
{
"grad_norm": 0.44766074419021606,
"learning_rate": 9.538463463040645e-05,
"loss": 0.0278,
"step": 3620
},
{
"grad_norm": 0.481611967086792,
"learning_rate": 9.534987987234337e-05,
"loss": 0.0277,
"step": 3630
},
{
"grad_norm": 0.4858357608318329,
"learning_rate": 9.53150011294404e-05,
"loss": 0.0265,
"step": 3640
},
{
"grad_norm": 0.40574368834495544,
"learning_rate": 9.527999849705471e-05,
"loss": 0.0297,
"step": 3650
},
{
"grad_norm": 0.4581122100353241,
"learning_rate": 9.524487207088213e-05,
"loss": 0.0224,
"step": 3660
},
{
"grad_norm": 0.4100882411003113,
"learning_rate": 9.520962194695698e-05,
"loss": 0.0239,
"step": 3670
},
{
"grad_norm": 0.40333643555641174,
"learning_rate": 9.517424822165175e-05,
"loss": 0.0238,
"step": 3680
},
{
"grad_norm": 0.5596145987510681,
"learning_rate": 9.513875099167685e-05,
"loss": 0.0245,
"step": 3690
},
{
"grad_norm": 0.5230712890625,
"learning_rate": 9.510313035408035e-05,
"loss": 0.0262,
"step": 3700
},
{
"grad_norm": 0.39155617356300354,
"learning_rate": 9.506738640624775e-05,
"loss": 0.0264,
"step": 3710
},
{
"grad_norm": 0.4129464328289032,
"learning_rate": 9.50315192459016e-05,
"loss": 0.0208,
"step": 3720
},
{
"grad_norm": 0.5159543752670288,
"learning_rate": 9.499552897110136e-05,
"loss": 0.0239,
"step": 3730
},
{
"grad_norm": 0.5178094506263733,
"learning_rate": 9.495941568024304e-05,
"loss": 0.0253,
"step": 3740
},
{
"grad_norm": 0.43580612540245056,
"learning_rate": 9.492317947205904e-05,
"loss": 0.0268,
"step": 3750
},
{
"grad_norm": 0.4596274495124817,
"learning_rate": 9.488682044561775e-05,
"loss": 0.0256,
"step": 3760
},
{
"grad_norm": 0.41573286056518555,
"learning_rate": 9.485033870032335e-05,
"loss": 0.0243,
"step": 3770
},
{
"grad_norm": 0.47876912355422974,
"learning_rate": 9.481373433591556e-05,
"loss": 0.0215,
"step": 3780
},
{
"grad_norm": 0.4741547703742981,
"learning_rate": 9.47770074524693e-05,
"loss": 0.027,
"step": 3790
},
{
"grad_norm": 0.4306631088256836,
"learning_rate": 9.474015815039446e-05,
"loss": 0.0277,
"step": 3800
},
{
"grad_norm": 0.46127429604530334,
"learning_rate": 9.470318653043565e-05,
"loss": 0.0273,
"step": 3810
},
{
"grad_norm": 0.5021414160728455,
"learning_rate": 9.466609269367185e-05,
"loss": 0.0263,
"step": 3820
},
{
"grad_norm": 0.5333779454231262,
"learning_rate": 9.46288767415162e-05,
"loss": 0.0234,
"step": 3830
},
{
"grad_norm": 0.4366990625858307,
"learning_rate": 9.459153877571567e-05,
"loss": 0.0225,
"step": 3840
},
{
"grad_norm": 0.4819251298904419,
"learning_rate": 9.455407889835087e-05,
"loss": 0.0238,
"step": 3850
},
{
"grad_norm": 0.3999616503715515,
"learning_rate": 9.451649721183564e-05,
"loss": 0.0234,
"step": 3860
},
{
"grad_norm": 0.37807697057724,
"learning_rate": 9.447879381891692e-05,
"loss": 0.0258,
"step": 3870
},
{
"grad_norm": 0.5266739130020142,
"learning_rate": 9.444096882267428e-05,
"loss": 0.0329,
"step": 3880
},
{
"grad_norm": 0.3961910903453827,
"learning_rate": 9.440302232651988e-05,
"loss": 0.0226,
"step": 3890
},
{
"grad_norm": 0.3786242604255676,
"learning_rate": 9.436495443419795e-05,
"loss": 0.024,
"step": 3900
},
{
"grad_norm": 0.4175941050052643,
"learning_rate": 9.432676524978466e-05,
"loss": 0.0219,
"step": 3910
},
{
"grad_norm": 0.44096827507019043,
"learning_rate": 9.42884548776878e-05,
"loss": 0.0253,
"step": 3920
},
{
"grad_norm": 0.41201087832450867,
"learning_rate": 9.425002342264646e-05,
"loss": 0.0223,
"step": 3930
},
{
"grad_norm": 0.5009353160858154,
"learning_rate": 9.421147098973077e-05,
"loss": 0.0266,
"step": 3940
},
{
"grad_norm": 0.5505723357200623,
"learning_rate": 9.41727976843416e-05,
"loss": 0.0258,
"step": 3950
},
{
"grad_norm": 0.45981982350349426,
"learning_rate": 9.413400361221029e-05,
"loss": 0.0279,
"step": 3960
},
{
"grad_norm": 0.4804719388484955,
"learning_rate": 9.409508887939835e-05,
"loss": 0.022,
"step": 3970
},
{
"grad_norm": 0.4238436222076416,
"learning_rate": 9.40560535922972e-05,
"loss": 0.0212,
"step": 3980
},
{
"grad_norm": 0.403974324464798,
"learning_rate": 9.40168978576278e-05,
"loss": 0.0189,
"step": 3990
},
{
"grad_norm": 0.48837044835090637,
"learning_rate": 9.397762178244043e-05,
"loss": 0.0244,
"step": 4000
},
{
"grad_norm": 0.48128196597099304,
"learning_rate": 9.393822547411439e-05,
"loss": 0.0217,
"step": 4010
},
{
"grad_norm": 0.3272818624973297,
"learning_rate": 9.389870904035769e-05,
"loss": 0.0242,
"step": 4020
},
{
"grad_norm": 0.36953118443489075,
"learning_rate": 9.385907258920672e-05,
"loss": 0.0246,
"step": 4030
},
{
"grad_norm": 0.41161492466926575,
"learning_rate": 9.381931622902607e-05,
"loss": 0.021,
"step": 4040
},
{
"grad_norm": 0.4544064998626709,
"learning_rate": 9.377944006850807e-05,
"loss": 0.0193,
"step": 4050
},
{
"grad_norm": 0.47396498918533325,
"learning_rate": 9.373944421667265e-05,
"loss": 0.0213,
"step": 4060
},
{
"grad_norm": 0.4621795117855072,
"learning_rate": 9.369932878286691e-05,
"loss": 0.0266,
"step": 4070
},
{
"grad_norm": 0.5184421539306641,
"learning_rate": 9.365909387676494e-05,
"loss": 0.0196,
"step": 4080
},
{
"grad_norm": 0.4004800319671631,
"learning_rate": 9.361873960836744e-05,
"loss": 0.0263,
"step": 4090
},
{
"grad_norm": 0.3737598657608032,
"learning_rate": 9.357826608800142e-05,
"loss": 0.0196,
"step": 4100
},
{
"grad_norm": 0.4000731110572815,
"learning_rate": 9.353767342631994e-05,
"loss": 0.0203,
"step": 4110
},
{
"grad_norm": 0.3826330006122589,
"learning_rate": 9.34969617343018e-05,
"loss": 0.0219,
"step": 4120
},
{
"grad_norm": 0.5988262891769409,
"learning_rate": 9.345613112325122e-05,
"loss": 0.0204,
"step": 4130
},
{
"grad_norm": 0.4280189275741577,
"learning_rate": 9.34151817047975e-05,
"loss": 0.0224,
"step": 4140
},
{
"grad_norm": 0.3716961145401001,
"learning_rate": 9.33741135908948e-05,
"loss": 0.0262,
"step": 4150
},
{
"grad_norm": 0.4295980930328369,
"learning_rate": 9.33329268938218e-05,
"loss": 0.0207,
"step": 4160
},
{
"grad_norm": 0.425942063331604,
"learning_rate": 9.329162172618132e-05,
"loss": 0.0238,
"step": 4170
},
{
"grad_norm": 0.416522741317749,
"learning_rate": 9.325019820090013e-05,
"loss": 0.0226,
"step": 4180
},
{
"grad_norm": 0.5610533952713013,
"learning_rate": 9.320865643122855e-05,
"loss": 0.0208,
"step": 4190
},
{
"grad_norm": 0.379802942276001,
"learning_rate": 9.316699653074023e-05,
"loss": 0.022,
"step": 4200
},
{
"grad_norm": 0.4576219618320465,
"learning_rate": 9.312521861333172e-05,
"loss": 0.0166,
"step": 4210
},
{
"grad_norm": 0.45310190320014954,
"learning_rate": 9.308332279322224e-05,
"loss": 0.0242,
"step": 4220
},
{
"grad_norm": 0.4080248177051544,
"learning_rate": 9.304130918495338e-05,
"loss": 0.0224,
"step": 4230
},
{
"grad_norm": 0.33399489521980286,
"learning_rate": 9.299917790338874e-05,
"loss": 0.0187,
"step": 4240
},
{
"grad_norm": 0.356057733297348,
"learning_rate": 9.295692906371363e-05,
"loss": 0.0173,
"step": 4250
},
{
"grad_norm": 0.42619287967681885,
"learning_rate": 9.291456278143476e-05,
"loss": 0.0264,
"step": 4260
},
{
"grad_norm": 0.3479536175727844,
"learning_rate": 9.287207917237994e-05,
"loss": 0.0213,
"step": 4270
},
{
"grad_norm": 0.3362795114517212,
"learning_rate": 9.282947835269773e-05,
"loss": 0.0206,
"step": 4280
},
{
"grad_norm": 0.43236204981803894,
"learning_rate": 9.278676043885715e-05,
"loss": 0.0191,
"step": 4290
},
{
"grad_norm": 0.32585880160331726,
"learning_rate": 9.274392554764733e-05,
"loss": 0.0194,
"step": 4300
},
{
"grad_norm": 0.4723697900772095,
"learning_rate": 9.270097379617723e-05,
"loss": 0.016,
"step": 4310
},
{
"grad_norm": 0.42713454365730286,
"learning_rate": 9.26579053018753e-05,
"loss": 0.0154,
"step": 4320
},
{
"grad_norm": 0.33830246329307556,
"learning_rate": 9.261472018248918e-05,
"loss": 0.0146,
"step": 4330
},
{
"grad_norm": 0.4066753387451172,
"learning_rate": 9.25714185560853e-05,
"loss": 0.0259,
"step": 4340
},
{
"grad_norm": 0.448772668838501,
"learning_rate": 9.252800054104868e-05,
"loss": 0.0187,
"step": 4350
},
{
"grad_norm": 0.4219300448894501,
"learning_rate": 9.248446625608252e-05,
"loss": 0.0208,
"step": 4360
},
{
"grad_norm": 0.39920371770858765,
"learning_rate": 9.244081582020789e-05,
"loss": 0.0175,
"step": 4370
},
{
"grad_norm": 0.42131638526916504,
"learning_rate": 9.239704935276339e-05,
"loss": 0.0182,
"step": 4380
},
{
"grad_norm": 0.45648935437202454,
"learning_rate": 9.235316697340489e-05,
"loss": 0.0158,
"step": 4390
},
{
"grad_norm": 0.42188429832458496,
"learning_rate": 9.230916880210512e-05,
"loss": 0.0183,
"step": 4400
},
{
"grad_norm": 0.36581969261169434,
"learning_rate": 9.226505495915342e-05,
"loss": 0.0147,
"step": 4410
},
{
"grad_norm": 0.42502549290657043,
"learning_rate": 9.222082556515536e-05,
"loss": 0.0198,
"step": 4420
},
{
"grad_norm": 0.35229989886283875,
"learning_rate": 9.217648074103242e-05,
"loss": 0.0153,
"step": 4430
},
{
"grad_norm": 0.4085313379764557,
"learning_rate": 9.213202060802161e-05,
"loss": 0.0192,
"step": 4440
},
{
"grad_norm": 0.4650028645992279,
"learning_rate": 9.208744528767528e-05,
"loss": 0.0173,
"step": 4450
},
{
"grad_norm": 0.4048616886138916,
"learning_rate": 9.204275490186064e-05,
"loss": 0.0204,
"step": 4460
},
{
"grad_norm": 0.4178619980812073,
"learning_rate": 9.199794957275949e-05,
"loss": 0.0204,
"step": 4470
},
{
"grad_norm": 0.46256691217422485,
"learning_rate": 9.19530294228679e-05,
"loss": 0.0177,
"step": 4480
},
{
"grad_norm": 0.35352519154548645,
"learning_rate": 9.190799457499583e-05,
"loss": 0.028,
"step": 4490
},
{
"grad_norm": 0.4470050632953644,
"learning_rate": 9.186284515226686e-05,
"loss": 0.0194,
"step": 4500
},
{
"grad_norm": 0.3508913815021515,
"learning_rate": 9.181758127811777e-05,
"loss": 0.0241,
"step": 4510
},
{
"grad_norm": 0.411702424287796,
"learning_rate": 9.177220307629825e-05,
"loss": 0.0204,
"step": 4520
},
{
"grad_norm": 0.4468960762023926,
"learning_rate": 9.172671067087059e-05,
"loss": 0.0194,
"step": 4530
},
{
"grad_norm": 0.4807928204536438,
"learning_rate": 9.16811041862093e-05,
"loss": 0.0256,
"step": 4540
},
{
"grad_norm": 0.39205247163772583,
"learning_rate": 9.163538374700076e-05,
"loss": 0.0185,
"step": 4550
},
{
"grad_norm": 0.44329723715782166,
"learning_rate": 9.158954947824287e-05,
"loss": 0.0178,
"step": 4560
},
{
"grad_norm": 0.47283023595809937,
"learning_rate": 9.154360150524482e-05,
"loss": 0.0174,
"step": 4570
},
{
"grad_norm": 0.38849857449531555,
"learning_rate": 9.14975399536266e-05,
"loss": 0.0143,
"step": 4580
},
{
"grad_norm": 0.3656264543533325,
"learning_rate": 9.14513649493187e-05,
"loss": 0.0212,
"step": 4590
},
{
"grad_norm": 0.4674840271472931,
"learning_rate": 9.140507661856187e-05,
"loss": 0.0153,
"step": 4600
},
{
"grad_norm": 0.4313472509384155,
"learning_rate": 9.135867508790661e-05,
"loss": 0.0214,
"step": 4610
},
{
"grad_norm": 0.3471619486808777,
"learning_rate": 9.131216048421291e-05,
"loss": 0.0165,
"step": 4620
},
{
"grad_norm": 0.4542539715766907,
"learning_rate": 9.126553293464998e-05,
"loss": 0.0189,
"step": 4630
},
{
"grad_norm": 0.47608688473701477,
"learning_rate": 9.121879256669572e-05,
"loss": 0.017,
"step": 4640
},
{
"grad_norm": 0.3959465026855469,
"learning_rate": 9.117193950813652e-05,
"loss": 0.0164,
"step": 4650
},
{
"grad_norm": 0.408431738615036,
"learning_rate": 9.112497388706685e-05,
"loss": 0.0255,
"step": 4660
},
{
"grad_norm": 0.4116475582122803,
"learning_rate": 9.10778958318889e-05,
"loss": 0.0174,
"step": 4670
},
{
"grad_norm": 0.3917919993400574,
"learning_rate": 9.103070547131232e-05,
"loss": 0.0199,
"step": 4680
},
{
"grad_norm": 0.3482106029987335,
"learning_rate": 9.098340293435375e-05,
"loss": 0.0179,
"step": 4690
},
{
"grad_norm": 0.34646838903427124,
"learning_rate": 9.093598835033649e-05,
"loss": 0.0174,
"step": 4700
},
{
"grad_norm": 0.39419376850128174,
"learning_rate": 9.088846184889021e-05,
"loss": 0.0191,
"step": 4710
},
{
"grad_norm": 0.4543268084526062,
"learning_rate": 9.084082355995057e-05,
"loss": 0.0213,
"step": 4720
},
{
"grad_norm": 0.4212946891784668,
"learning_rate": 9.079307361375882e-05,
"loss": 0.0181,
"step": 4730
},
{
"grad_norm": 0.3014923334121704,
"learning_rate": 9.074521214086149e-05,
"loss": 0.019,
"step": 4740
},
{
"grad_norm": 0.36527299880981445,
"learning_rate": 9.069723927211001e-05,
"loss": 0.0179,
"step": 4750
},
{
"grad_norm": 0.3752840757369995,
"learning_rate": 9.064915513866037e-05,
"loss": 0.0183,
"step": 4760
},
{
"grad_norm": 0.42201003432273865,
"learning_rate": 9.060095987197279e-05,
"loss": 0.0162,
"step": 4770
},
{
"grad_norm": 0.3307137191295624,
"learning_rate": 9.055265360381126e-05,
"loss": 0.0206,
"step": 4780
},
{
"grad_norm": 0.33322593569755554,
"learning_rate": 9.050423646624326e-05,
"loss": 0.016,
"step": 4790
},
{
"grad_norm": 0.35324618220329285,
"learning_rate": 9.045570859163943e-05,
"loss": 0.0194,
"step": 4800
},
{
"grad_norm": 0.427572637796402,
"learning_rate": 9.04070701126731e-05,
"loss": 0.015,
"step": 4810
},
{
"grad_norm": 0.3561609983444214,
"learning_rate": 9.035832116232001e-05,
"loss": 0.0145,
"step": 4820
},
{
"grad_norm": 0.37716561555862427,
"learning_rate": 9.030946187385796e-05,
"loss": 0.016,
"step": 4830
},
{
"grad_norm": 0.39859738945961,
"learning_rate": 9.026049238086635e-05,
"loss": 0.0178,
"step": 4840
},
{
"grad_norm": 0.4500395655632019,
"learning_rate": 9.021141281722591e-05,
"loss": 0.0202,
"step": 4850
},
{
"grad_norm": 0.34830138087272644,
"learning_rate": 9.01622233171183e-05,
"loss": 0.0169,
"step": 4860
},
{
"grad_norm": 0.3729107677936554,
"learning_rate": 9.011292401502574e-05,
"loss": 0.0212,
"step": 4870
},
{
"grad_norm": 0.3912448585033417,
"learning_rate": 9.006351504573063e-05,
"loss": 0.0146,
"step": 4880
},
{
"grad_norm": 0.4137353003025055,
"learning_rate": 9.001399654431519e-05,
"loss": 0.0171,
"step": 4890
},
{
"grad_norm": 0.4444160759449005,
"learning_rate": 8.996436864616116e-05,
"loss": 0.0162,
"step": 4900
},
{
"grad_norm": 0.3148241639137268,
"learning_rate": 8.991463148694925e-05,
"loss": 0.0191,
"step": 4910
},
{
"grad_norm": 0.4391416907310486,
"learning_rate": 8.986478520265902e-05,
"loss": 0.0187,
"step": 4920
},
{
"grad_norm": 0.4296688139438629,
"learning_rate": 8.981482992956827e-05,
"loss": 0.0143,
"step": 4930
},
{
"grad_norm": 0.29728299379348755,
"learning_rate": 8.976476580425282e-05,
"loss": 0.0148,
"step": 4940
},
{
"grad_norm": 0.4356195032596588,
"learning_rate": 8.971459296358606e-05,
"loss": 0.0287,
"step": 4950
},
{
"grad_norm": 0.4179481565952301,
"learning_rate": 8.966431154473864e-05,
"loss": 0.0157,
"step": 4960
},
{
"grad_norm": 0.3610477149486542,
"learning_rate": 8.961392168517803e-05,
"loss": 0.0159,
"step": 4970
},
{
"grad_norm": 0.34345686435699463,
"learning_rate": 8.956342352266821e-05,
"loss": 0.016,
"step": 4980
},
{
"grad_norm": 0.3698787987232208,
"learning_rate": 8.95128171952692e-05,
"loss": 0.0214,
"step": 4990
},
{
"grad_norm": 0.327648788690567,
"learning_rate": 8.946210284133676e-05,
"loss": 0.0173,
"step": 5000
},
{
"grad_norm": 0.2809329330921173,
"learning_rate": 8.941128059952201e-05,
"loss": 0.0132,
"step": 5010
},
{
"grad_norm": 0.31239569187164307,
"learning_rate": 8.936035060877102e-05,
"loss": 0.0244,
"step": 5020
},
{
"grad_norm": 0.40824398398399353,
"learning_rate": 8.930931300832443e-05,
"loss": 0.0181,
"step": 5030
},
{
"grad_norm": 0.36586353182792664,
"learning_rate": 8.925816793771711e-05,
"loss": 0.0148,
"step": 5040
},
{
"grad_norm": 0.2970711886882782,
"learning_rate": 8.92069155367777e-05,
"loss": 0.0139,
"step": 5050
},
{
"grad_norm": 0.3478129506111145,
"learning_rate": 8.915555594562834e-05,
"loss": 0.0172,
"step": 5060
},
{
"grad_norm": 0.3716062903404236,
"learning_rate": 8.910408930468416e-05,
"loss": 0.016,
"step": 5070
},
{
"grad_norm": 0.35524141788482666,
"learning_rate": 8.905251575465303e-05,
"loss": 0.0128,
"step": 5080
},
{
"grad_norm": 0.46534451842308044,
"learning_rate": 8.900083543653502e-05,
"loss": 0.0192,
"step": 5090
},
{
"grad_norm": 0.4329080283641815,
"learning_rate": 8.894904849162218e-05,
"loss": 0.0176,
"step": 5100
},
{
"grad_norm": 0.33941879868507385,
"learning_rate": 8.889715506149802e-05,
"loss": 0.0161,
"step": 5110
},
{
"grad_norm": 0.37832191586494446,
"learning_rate": 8.884515528803722e-05,
"loss": 0.0157,
"step": 5120
},
{
"grad_norm": 0.36182844638824463,
"learning_rate": 8.879304931340517e-05,
"loss": 0.0148,
"step": 5130
},
{
"grad_norm": 0.3270893096923828,
"learning_rate": 8.874083728005759e-05,
"loss": 0.014,
"step": 5140
},
{
"grad_norm": 0.3815666437149048,
"learning_rate": 8.868851933074021e-05,
"loss": 0.0207,
"step": 5150
},
{
"grad_norm": 0.4211690425872803,
"learning_rate": 8.863609560848829e-05,
"loss": 0.0147,
"step": 5160
},
{
"grad_norm": 0.38520973920822144,
"learning_rate": 8.85835662566263e-05,
"loss": 0.0167,
"step": 5170
},
{
"grad_norm": 0.4238542914390564,
"learning_rate": 8.853093141876747e-05,
"loss": 0.0123,
"step": 5180
},
{
"grad_norm": 0.4630364179611206,
"learning_rate": 8.847819123881343e-05,
"loss": 0.0157,
"step": 5190
},
{
"grad_norm": 0.3982260227203369,
"learning_rate": 8.842534586095383e-05,
"loss": 0.0156,
"step": 5200
},
{
"grad_norm": 0.418029248714447,
"learning_rate": 8.837239542966593e-05,
"loss": 0.0147,
"step": 5210
},
{
"grad_norm": 0.42096608877182007,
"learning_rate": 8.831934008971417e-05,
"loss": 0.0155,
"step": 5220
},
{
"grad_norm": 0.40651246905326843,
"learning_rate": 8.826617998614982e-05,
"loss": 0.0134,
"step": 5230
},
{
"grad_norm": 0.35442405939102173,
"learning_rate": 8.821291526431056e-05,
"loss": 0.0149,
"step": 5240
},
{
"grad_norm": 0.4477267563343048,
"learning_rate": 8.815954606982015e-05,
"loss": 0.017,
"step": 5250
},
{
"grad_norm": 0.42138704657554626,
"learning_rate": 8.810607254858789e-05,
"loss": 0.0161,
"step": 5260
},
{
"grad_norm": 0.3887590169906616,
"learning_rate": 8.805249484680838e-05,
"loss": 0.0177,
"step": 5270
},
{
"grad_norm": 0.3340283930301666,
"learning_rate": 8.799881311096096e-05,
"loss": 0.0132,
"step": 5280
},
{
"grad_norm": 0.37869396805763245,
"learning_rate": 8.794502748780949e-05,
"loss": 0.0158,
"step": 5290
},
{
"grad_norm": 0.38290390372276306,
"learning_rate": 8.78911381244018e-05,
"loss": 0.0142,
"step": 5300
},
{
"grad_norm": 0.3591060936450958,
"learning_rate": 8.783714516806933e-05,
"loss": 0.0151,
"step": 5310
},
{
"grad_norm": 0.4037642478942871,
"learning_rate": 8.77830487664268e-05,
"loss": 0.0128,
"step": 5320
},
{
"grad_norm": 0.25061705708503723,
"learning_rate": 8.772884906737167e-05,
"loss": 0.0162,
"step": 5330
},
{
"grad_norm": 0.3075624108314514,
"learning_rate": 8.767454621908387e-05,
"loss": 0.0136,
"step": 5340
},
{
"grad_norm": 0.38026294112205505,
"learning_rate": 8.76201403700253e-05,
"loss": 0.0136,
"step": 5350
},
{
"grad_norm": 0.48398804664611816,
"learning_rate": 8.756563166893949e-05,
"loss": 0.0132,
"step": 5360
},
{
"grad_norm": 0.3468417823314667,
"learning_rate": 8.751102026485113e-05,
"loss": 0.0151,
"step": 5370
},
{
"grad_norm": 0.3630031943321228,
"learning_rate": 8.745630630706571e-05,
"loss": 0.0158,
"step": 5380
},
{
"grad_norm": 0.5174519419670105,
"learning_rate": 8.740148994516912e-05,
"loss": 0.0165,
"step": 5390
},
{
"grad_norm": 0.3353045582771301,
"learning_rate": 8.73465713290272e-05,
"loss": 0.0133,
"step": 5400
},
{
"grad_norm": 0.3345337212085724,
"learning_rate": 8.729155060878533e-05,
"loss": 0.0186,
"step": 5410
},
{
"grad_norm": 0.3502795100212097,
"learning_rate": 8.723642793486809e-05,
"loss": 0.017,
"step": 5420
},
{
"grad_norm": 0.33117562532424927,
"learning_rate": 8.718120345797873e-05,
"loss": 0.0156,
"step": 5430
},
{
"grad_norm": 0.3348385989665985,
"learning_rate": 8.712587732909889e-05,
"loss": 0.0131,
"step": 5440
},
{
"grad_norm": 0.31218665838241577,
"learning_rate": 8.707044969948806e-05,
"loss": 0.0139,
"step": 5450
},
{
"grad_norm": 0.36143720149993896,
"learning_rate": 8.701492072068329e-05,
"loss": 0.0162,
"step": 5460
},
{
"grad_norm": 0.397625207901001,
"learning_rate": 8.695929054449869e-05,
"loss": 0.0162,
"step": 5470
},
{
"grad_norm": 0.35156044363975525,
"learning_rate": 8.690355932302501e-05,
"loss": 0.0149,
"step": 5480
},
{
"grad_norm": 0.3862064778804779,
"learning_rate": 8.684772720862931e-05,
"loss": 0.0134,
"step": 5490
},
{
"grad_norm": 0.33415740728378296,
"learning_rate": 8.679179435395446e-05,
"loss": 0.0156,
"step": 5500
},
{
"grad_norm": 0.33985161781311035,
"learning_rate": 8.673576091191874e-05,
"loss": 0.0152,
"step": 5510
},
{
"grad_norm": 0.43412765860557556,
"learning_rate": 8.667962703571541e-05,
"loss": 0.0153,
"step": 5520
},
{
"grad_norm": 0.29241663217544556,
"learning_rate": 8.662339287881238e-05,
"loss": 0.0145,
"step": 5530
},
{
"grad_norm": 0.26985955238342285,
"learning_rate": 8.656705859495169e-05,
"loss": 0.0095,
"step": 5540
},
{
"grad_norm": 0.3288934528827667,
"learning_rate": 8.651062433814912e-05,
"loss": 0.0148,
"step": 5550
},
{
"grad_norm": 0.32042691111564636,
"learning_rate": 8.645409026269375e-05,
"loss": 0.0178,
"step": 5560
},
{
"grad_norm": 0.29201775789260864,
"learning_rate": 8.639745652314759e-05,
"loss": 0.0136,
"step": 5570
},
{
"grad_norm": 0.33705347776412964,
"learning_rate": 8.634072327434515e-05,
"loss": 0.0199,
"step": 5580
},
{
"grad_norm": 0.43964189291000366,
"learning_rate": 8.628389067139294e-05,
"loss": 0.0153,
"step": 5590
},
{
"grad_norm": 0.3852575421333313,
"learning_rate": 8.622695886966911e-05,
"loss": 0.0124,
"step": 5600
},
{
"grad_norm": 0.3601333200931549,
"learning_rate": 8.616992802482308e-05,
"loss": 0.0115,
"step": 5610
},
{
"grad_norm": 0.3712993562221527,
"learning_rate": 8.611279829277496e-05,
"loss": 0.0129,
"step": 5620
},
{
"grad_norm": 0.3430801033973694,
"learning_rate": 8.605556982971528e-05,
"loss": 0.0119,
"step": 5630
},
{
"grad_norm": 0.2783951163291931,
"learning_rate": 8.599824279210447e-05,
"loss": 0.0113,
"step": 5640
},
{
"grad_norm": 0.3604603111743927,
"learning_rate": 8.594081733667243e-05,
"loss": 0.016,
"step": 5650
},
{
"grad_norm": 0.4052552282810211,
"learning_rate": 8.58832936204182e-05,
"loss": 0.0141,
"step": 5660
},
{
"grad_norm": 0.2946913242340088,
"learning_rate": 8.582567180060942e-05,
"loss": 0.0188,
"step": 5670
},
{
"grad_norm": 0.29554295539855957,
"learning_rate": 8.576795203478194e-05,
"loss": 0.0147,
"step": 5680
},
{
"grad_norm": 0.30913200974464417,
"learning_rate": 8.571013448073939e-05,
"loss": 0.0155,
"step": 5690
},
{
"grad_norm": 0.3153333067893982,
"learning_rate": 8.565221929655275e-05,
"loss": 0.0116,
"step": 5700
},
{
"grad_norm": 0.26914530992507935,
"learning_rate": 8.559420664055992e-05,
"loss": 0.0125,
"step": 5710
},
{
"grad_norm": 0.3266845941543579,
"learning_rate": 8.553609667136532e-05,
"loss": 0.0127,
"step": 5720
},
{
"grad_norm": 0.36770594120025635,
"learning_rate": 8.547788954783936e-05,
"loss": 0.0132,
"step": 5730
},
{
"grad_norm": 0.3868075907230377,
"learning_rate": 8.541958542911808e-05,
"loss": 0.0137,
"step": 5740
},
{
"grad_norm": 0.3873762786388397,
"learning_rate": 8.536118447460275e-05,
"loss": 0.016,
"step": 5750
},
{
"grad_norm": 0.34997740387916565,
"learning_rate": 8.530268684395932e-05,
"loss": 0.012,
"step": 5760
},
{
"grad_norm": 0.36314913630485535,
"learning_rate": 8.524409269711807e-05,
"loss": 0.014,
"step": 5770
},
{
"grad_norm": 0.2800992727279663,
"learning_rate": 8.51854021942732e-05,
"loss": 0.0111,
"step": 5780
},
{
"grad_norm": 0.3715326488018036,
"learning_rate": 8.512661549588227e-05,
"loss": 0.0128,
"step": 5790
},
{
"grad_norm": 0.3508760631084442,
"learning_rate": 8.506773276266588e-05,
"loss": 0.0123,
"step": 5800
},
{
"grad_norm": 0.31156125664711,
"learning_rate": 8.500875415560721e-05,
"loss": 0.0104,
"step": 5810
},
{
"grad_norm": 0.28672730922698975,
"learning_rate": 8.494967983595144e-05,
"loss": 0.0138,
"step": 5820
},
{
"grad_norm": 0.2949328124523163,
"learning_rate": 8.489050996520558e-05,
"loss": 0.0111,
"step": 5830
},
{
"grad_norm": 0.3339660167694092,
"learning_rate": 8.483124470513775e-05,
"loss": 0.0125,
"step": 5840
},
{
"grad_norm": 0.37675192952156067,
"learning_rate": 8.477188421777692e-05,
"loss": 0.013,
"step": 5850
},
{
"grad_norm": 0.4156615436077118,
"learning_rate": 8.47124286654124e-05,
"loss": 0.0188,
"step": 5860
},
{
"grad_norm": 0.3377411365509033,
"learning_rate": 8.465287821059341e-05,
"loss": 0.0197,
"step": 5870
},
{
"grad_norm": 0.31554165482521057,
"learning_rate": 8.45932330161286e-05,
"loss": 0.0125,
"step": 5880
},
{
"grad_norm": 0.3891998529434204,
"learning_rate": 8.453349324508567e-05,
"loss": 0.0169,
"step": 5890
},
{
"grad_norm": 0.2835284173488617,
"learning_rate": 8.447365906079088e-05,
"loss": 0.0172,
"step": 5900
},
{
"grad_norm": 0.3825901746749878,
"learning_rate": 8.441373062682856e-05,
"loss": 0.0146,
"step": 5910
},
{
"grad_norm": 0.3294428884983063,
"learning_rate": 8.43537081070408e-05,
"loss": 0.0218,
"step": 5920
},
{
"grad_norm": 0.3541003167629242,
"learning_rate": 8.429359166552689e-05,
"loss": 0.0132,
"step": 5930
},
{
"grad_norm": 0.3192877173423767,
"learning_rate": 8.423338146664284e-05,
"loss": 0.014,
"step": 5940
},
{
"grad_norm": 0.3500727713108063,
"learning_rate": 8.417307767500107e-05,
"loss": 0.0115,
"step": 5950
},
{
"grad_norm": 0.3229285478591919,
"learning_rate": 8.411268045546983e-05,
"loss": 0.0121,
"step": 5960
},
{
"grad_norm": 0.4392866790294647,
"learning_rate": 8.405218997317281e-05,
"loss": 0.0125,
"step": 5970
},
{
"grad_norm": 0.3409421145915985,
"learning_rate": 8.399160639348869e-05,
"loss": 0.0115,
"step": 5980
},
{
"grad_norm": 0.3397701680660248,
"learning_rate": 8.393092988205065e-05,
"loss": 0.0144,
"step": 5990
},
{
"grad_norm": 0.2932409346103668,
"learning_rate": 8.387016060474597e-05,
"loss": 0.0155,
"step": 6000
},
{
"grad_norm": 0.30588042736053467,
"learning_rate": 8.380929872771551e-05,
"loss": 0.018,
"step": 6010
},
{
"grad_norm": 0.2569223642349243,
"learning_rate": 8.374834441735335e-05,
"loss": 0.0139,
"step": 6020
},
{
"grad_norm": 0.37832796573638916,
"learning_rate": 8.368729784030622e-05,
"loss": 0.012,
"step": 6030
},
{
"grad_norm": 0.3126446604728699,
"learning_rate": 8.362615916347315e-05,
"loss": 0.0138,
"step": 6040
},
{
"grad_norm": 0.2546840310096741,
"learning_rate": 8.356492855400493e-05,
"loss": 0.0122,
"step": 6050
},
{
"grad_norm": 0.3276226818561554,
"learning_rate": 8.350360617930371e-05,
"loss": 0.0118,
"step": 6060
},
{
"grad_norm": 0.3676457405090332,
"learning_rate": 8.344219220702255e-05,
"loss": 0.0117,
"step": 6070
},
{
"grad_norm": 0.35417577624320984,
"learning_rate": 8.338068680506485e-05,
"loss": 0.0104,
"step": 6080
},
{
"grad_norm": 0.26148155331611633,
"learning_rate": 8.33190901415841e-05,
"loss": 0.0142,
"step": 6090
},
{
"grad_norm": 0.29308485984802246,
"learning_rate": 8.325740238498317e-05,
"loss": 0.0111,
"step": 6100
},
{
"grad_norm": 0.41223078966140747,
"learning_rate": 8.319562370391406e-05,
"loss": 0.0135,
"step": 6110
},
{
"grad_norm": 0.38607892394065857,
"learning_rate": 8.31337542672773e-05,
"loss": 0.0131,
"step": 6120
},
{
"grad_norm": 0.3013926148414612,
"learning_rate": 8.307179424422158e-05,
"loss": 0.0115,
"step": 6130
},
{
"grad_norm": 0.34792882204055786,
"learning_rate": 8.300974380414327e-05,
"loss": 0.0165,
"step": 6140
},
{
"grad_norm": 0.3160726726055145,
"learning_rate": 8.294760311668586e-05,
"loss": 0.0133,
"step": 6150
},
{
"grad_norm": 0.3607368469238281,
"learning_rate": 8.288537235173961e-05,
"loss": 0.0152,
"step": 6160
},
{
"grad_norm": 0.2962538003921509,
"learning_rate": 8.282305167944108e-05,
"loss": 0.0146,
"step": 6170
},
{
"grad_norm": 0.3576897978782654,
"learning_rate": 8.276064127017262e-05,
"loss": 0.0132,
"step": 6180
},
{
"grad_norm": 0.3158738911151886,
"learning_rate": 8.269814129456189e-05,
"loss": 0.0165,
"step": 6190
},
{
"grad_norm": 0.3886716663837433,
"learning_rate": 8.263555192348143e-05,
"loss": 0.0164,
"step": 6200
},
{
"grad_norm": 0.33075031638145447,
"learning_rate": 8.257287332804819e-05,
"loss": 0.024,
"step": 6210
},
{
"grad_norm": 0.40429654717445374,
"learning_rate": 8.251010567962307e-05,
"loss": 0.0142,
"step": 6220
},
{
"grad_norm": 0.26305797696113586,
"learning_rate": 8.244724914981041e-05,
"loss": 0.0127,
"step": 6230
},
{
"grad_norm": 0.26651525497436523,
"learning_rate": 8.238430391045757e-05,
"loss": 0.0099,
"step": 6240
},
{
"grad_norm": 0.22140610218048096,
"learning_rate": 8.232127013365445e-05,
"loss": 0.0162,
"step": 6250
},
{
"grad_norm": 0.3048286736011505,
"learning_rate": 8.225814799173295e-05,
"loss": 0.0219,
"step": 6260
},
{
"grad_norm": 0.27734512090682983,
"learning_rate": 8.219493765726663e-05,
"loss": 0.012,
"step": 6270
},
{
"grad_norm": 0.3474031090736389,
"learning_rate": 8.21316393030701e-05,
"loss": 0.0122,
"step": 6280
},
{
"grad_norm": 0.3461661636829376,
"learning_rate": 8.206825310219865e-05,
"loss": 0.0142,
"step": 6290
},
{
"grad_norm": 0.3480895161628723,
"learning_rate": 8.200477922794776e-05,
"loss": 0.0155,
"step": 6300
},
{
"grad_norm": 0.29770898818969727,
"learning_rate": 8.194121785385256e-05,
"loss": 0.0122,
"step": 6310
},
{
"grad_norm": 0.27077922224998474,
"learning_rate": 8.187756915368741e-05,
"loss": 0.0114,
"step": 6320
},
{
"grad_norm": 0.37504443526268005,
"learning_rate": 8.181383330146544e-05,
"loss": 0.0123,
"step": 6330
},
{
"grad_norm": 0.27733808755874634,
"learning_rate": 8.175001047143804e-05,
"loss": 0.0152,
"step": 6340
},
{
"grad_norm": 0.2905326187610626,
"learning_rate": 8.168610083809438e-05,
"loss": 0.014,
"step": 6350
},
{
"grad_norm": 0.3680465519428253,
"learning_rate": 8.162210457616095e-05,
"loss": 0.0128,
"step": 6360
},
{
"grad_norm": 0.41437506675720215,
"learning_rate": 8.155802186060109e-05,
"loss": 0.0168,
"step": 6370
},
{
"grad_norm": 0.2748274505138397,
"learning_rate": 8.149385286661453e-05,
"loss": 0.014,
"step": 6380
},
{
"grad_norm": 0.27940356731414795,
"learning_rate": 8.14295977696368e-05,
"loss": 0.0106,
"step": 6390
},
{
"grad_norm": 0.3001856505870819,
"learning_rate": 8.13652567453389e-05,
"loss": 0.0161,
"step": 6400
},
{
"grad_norm": 0.3228931725025177,
"learning_rate": 8.130082996962676e-05,
"loss": 0.0108,
"step": 6410
},
{
"grad_norm": 0.37547504901885986,
"learning_rate": 8.123631761864068e-05,
"loss": 0.0108,
"step": 6420
},
{
"grad_norm": 0.3306344449520111,
"learning_rate": 8.1171719868755e-05,
"loss": 0.012,
"step": 6430
},
{
"grad_norm": 0.30303868651390076,
"learning_rate": 8.110703689657748e-05,
"loss": 0.0132,
"step": 6440
},
{
"grad_norm": 0.3219710886478424,
"learning_rate": 8.104226887894892e-05,
"loss": 0.0247,
"step": 6450
},
{
"grad_norm": 0.25483033061027527,
"learning_rate": 8.097741599294257e-05,
"loss": 0.0137,
"step": 6460
},
{
"grad_norm": 0.34695756435394287,
"learning_rate": 8.091247841586378e-05,
"loss": 0.0118,
"step": 6470
},
{
"grad_norm": 0.31917816400527954,
"learning_rate": 8.084745632524939e-05,
"loss": 0.0153,
"step": 6480
},
{
"grad_norm": 0.298793762922287,
"learning_rate": 8.07823498988673e-05,
"loss": 0.013,
"step": 6490
},
{
"grad_norm": 0.29808494448661804,
"learning_rate": 8.071715931471602e-05,
"loss": 0.0141,
"step": 6500
},
{
"grad_norm": 0.2866530120372772,
"learning_rate": 8.06518847510241e-05,
"loss": 0.013,
"step": 6510
},
{
"grad_norm": 0.2647181451320648,
"learning_rate": 8.058652638624971e-05,
"loss": 0.013,
"step": 6520
},
{
"grad_norm": 0.2521056830883026,
"learning_rate": 8.052108439908013e-05,
"loss": 0.0117,
"step": 6530
},
{
"grad_norm": 0.30903002619743347,
"learning_rate": 8.045555896843125e-05,
"loss": 0.0127,
"step": 6540
},
{
"grad_norm": 0.3295891284942627,
"learning_rate": 8.03899502734471e-05,
"loss": 0.0116,
"step": 6550
},
{
"grad_norm": 0.3196203410625458,
"learning_rate": 8.032425849349931e-05,
"loss": 0.0096,
"step": 6560
},
{
"grad_norm": 0.2777307331562042,
"learning_rate": 8.025848380818674e-05,
"loss": 0.0137,
"step": 6570
},
{
"grad_norm": 0.2528461217880249,
"learning_rate": 8.019262639733487e-05,
"loss": 0.012,
"step": 6580
},
{
"grad_norm": 0.29351767897605896,
"learning_rate": 8.012668644099531e-05,
"loss": 0.0116,
"step": 6590
},
{
"grad_norm": 0.3499806821346283,
"learning_rate": 8.006066411944542e-05,
"loss": 0.0145,
"step": 6600
},
{
"grad_norm": 0.3001391589641571,
"learning_rate": 7.999455961318769e-05,
"loss": 0.0107,
"step": 6610
},
{
"grad_norm": 0.34684231877326965,
"learning_rate": 7.992837310294932e-05,
"loss": 0.0174,
"step": 6620
},
{
"grad_norm": 0.29393184185028076,
"learning_rate": 7.986210476968167e-05,
"loss": 0.0139,
"step": 6630
},
{
"grad_norm": 0.29339519143104553,
"learning_rate": 7.97957547945599e-05,
"loss": 0.0174,
"step": 6640
},
{
"grad_norm": 0.4209054112434387,
"learning_rate": 7.972932335898226e-05,
"loss": 0.0115,
"step": 6650
},
{
"grad_norm": 0.28844037652015686,
"learning_rate": 7.966281064456975e-05,
"loss": 0.0121,
"step": 6660
},
{
"grad_norm": 0.3526553511619568,
"learning_rate": 7.959621683316563e-05,
"loss": 0.0119,
"step": 6670
},
{
"grad_norm": 0.42336228489875793,
"learning_rate": 7.952954210683481e-05,
"loss": 0.016,
"step": 6680
},
{
"grad_norm": 0.3053176701068878,
"learning_rate": 7.946278664786345e-05,
"loss": 0.0143,
"step": 6690
},
{
"grad_norm": 0.25422704219818115,
"learning_rate": 7.939595063875842e-05,
"loss": 0.0129,
"step": 6700
},
{
"grad_norm": 0.2565918266773224,
"learning_rate": 7.932903426224683e-05,
"loss": 0.0118,
"step": 6710
},
{
"grad_norm": 0.29806122183799744,
"learning_rate": 7.926203770127552e-05,
"loss": 0.0112,
"step": 6720
},
{
"grad_norm": 0.29177016019821167,
"learning_rate": 7.919496113901046e-05,
"loss": 0.0138,
"step": 6730
},
{
"grad_norm": 0.24790076911449432,
"learning_rate": 7.912780475883649e-05,
"loss": 0.0104,
"step": 6740
},
{
"grad_norm": 0.3190149664878845,
"learning_rate": 7.906056874435652e-05,
"loss": 0.0127,
"step": 6750
},
{
"grad_norm": 0.28794440627098083,
"learning_rate": 7.899325327939131e-05,
"loss": 0.0094,
"step": 6760
},
{
"grad_norm": 0.24909764528274536,
"learning_rate": 7.892585854797872e-05,
"loss": 0.0135,
"step": 6770
},
{
"grad_norm": 0.358511745929718,
"learning_rate": 7.88583847343734e-05,
"loss": 0.0119,
"step": 6780
},
{
"grad_norm": 0.2664150893688202,
"learning_rate": 7.879083202304616e-05,
"loss": 0.0137,
"step": 6790
},
{
"grad_norm": 0.24875850975513458,
"learning_rate": 7.872320059868355e-05,
"loss": 0.0144,
"step": 6800
},
{
"grad_norm": 0.2646051347255707,
"learning_rate": 7.865549064618729e-05,
"loss": 0.0101,
"step": 6810
},
{
"grad_norm": 0.3005359172821045,
"learning_rate": 7.858770235067381e-05,
"loss": 0.0189,
"step": 6820
},
{
"grad_norm": 0.30753788352012634,
"learning_rate": 7.851983589747374e-05,
"loss": 0.0128,
"step": 6830
},
{
"grad_norm": 0.3018791675567627,
"learning_rate": 7.845189147213133e-05,
"loss": 0.0103,
"step": 6840
},
{
"grad_norm": 0.27601730823516846,
"learning_rate": 7.838386926040407e-05,
"loss": 0.012,
"step": 6850
},
{
"grad_norm": 0.18579219281673431,
"learning_rate": 7.83157694482621e-05,
"loss": 0.0088,
"step": 6860
},
{
"grad_norm": 0.3077498972415924,
"learning_rate": 7.824759222188768e-05,
"loss": 0.0135,
"step": 6870
},
{
"grad_norm": 0.3342336118221283,
"learning_rate": 7.817933776767478e-05,
"loss": 0.0107,
"step": 6880
},
{
"grad_norm": 0.2645319998264313,
"learning_rate": 7.811100627222842e-05,
"loss": 0.0079,
"step": 6890
},
{
"grad_norm": 0.34547173976898193,
"learning_rate": 7.804259792236435e-05,
"loss": 0.0106,
"step": 6900
},
{
"grad_norm": 0.3139336407184601,
"learning_rate": 7.797411290510835e-05,
"loss": 0.0143,
"step": 6910
},
{
"grad_norm": 0.32347217202186584,
"learning_rate": 7.790555140769586e-05,
"loss": 0.0116,
"step": 6920
},
{
"grad_norm": 0.2938658595085144,
"learning_rate": 7.78369136175714e-05,
"loss": 0.0151,
"step": 6930
},
{
"grad_norm": 0.3143678605556488,
"learning_rate": 7.776819972238806e-05,
"loss": 0.0157,
"step": 6940
},
{
"grad_norm": 0.3636862337589264,
"learning_rate": 7.7699409910007e-05,
"loss": 0.012,
"step": 6950
},
{
"grad_norm": 0.2957841157913208,
"learning_rate": 7.763054436849694e-05,
"loss": 0.0137,
"step": 6960
},
{
"grad_norm": 0.3176383078098297,
"learning_rate": 7.756160328613364e-05,
"loss": 0.0138,
"step": 6970
},
{
"grad_norm": 0.38332003355026245,
"learning_rate": 7.749258685139942e-05,
"loss": 0.0103,
"step": 6980
},
{
"grad_norm": 0.3329831659793854,
"learning_rate": 7.742349525298253e-05,
"loss": 0.0118,
"step": 6990
},
{
"grad_norm": 0.373206228017807,
"learning_rate": 7.735432867977679e-05,
"loss": 0.0096,
"step": 7000
},
{
"grad_norm": 0.22941261529922485,
"learning_rate": 7.728508732088096e-05,
"loss": 0.0141,
"step": 7010
},
{
"grad_norm": 0.3203655183315277,
"learning_rate": 7.721577136559825e-05,
"loss": 0.0142,
"step": 7020
},
{
"grad_norm": 0.2580220103263855,
"learning_rate": 7.714638100343588e-05,
"loss": 0.0119,
"step": 7030
},
{
"grad_norm": 0.23703204095363617,
"learning_rate": 7.707691642410444e-05,
"loss": 0.0114,
"step": 7040
},
{
"grad_norm": 0.2933865189552307,
"learning_rate": 7.70073778175174e-05,
"loss": 0.0125,
"step": 7050
},
{
"grad_norm": 0.3587990403175354,
"learning_rate": 7.69377653737907e-05,
"loss": 0.0103,
"step": 7060
},
{
"grad_norm": 0.3031073212623596,
"learning_rate": 7.686807928324209e-05,
"loss": 0.0119,
"step": 7070
},
{
"grad_norm": 0.24994587898254395,
"learning_rate": 7.679831973639065e-05,
"loss": 0.0108,
"step": 7080
},
{
"grad_norm": 0.2641933262348175,
"learning_rate": 7.672848692395637e-05,
"loss": 0.0124,
"step": 7090
},
{
"grad_norm": 0.3555925488471985,
"learning_rate": 7.665858103685944e-05,
"loss": 0.0106,
"step": 7100
},
{
"grad_norm": 0.32579174637794495,
"learning_rate": 7.658860226621991e-05,
"loss": 0.0129,
"step": 7110
},
{
"grad_norm": 0.2926287353038788,
"learning_rate": 7.651855080335708e-05,
"loss": 0.0127,
"step": 7120
},
{
"grad_norm": 0.32705703377723694,
"learning_rate": 7.644842683978896e-05,
"loss": 0.0103,
"step": 7130
},
{
"grad_norm": 0.3596729636192322,
"learning_rate": 7.63782305672318e-05,
"loss": 0.0153,
"step": 7140
},
{
"grad_norm": 0.3586880564689636,
"learning_rate": 7.63079621775995e-05,
"loss": 0.0132,
"step": 7150
},
{
"grad_norm": 0.23977535963058472,
"learning_rate": 7.623762186300319e-05,
"loss": 0.0125,
"step": 7160
},
{
"grad_norm": 0.3540763854980469,
"learning_rate": 7.616720981575057e-05,
"loss": 0.0133,
"step": 7170
},
{
"grad_norm": 0.33983471989631653,
"learning_rate": 7.609672622834552e-05,
"loss": 0.0102,
"step": 7180
},
{
"grad_norm": 0.30762892961502075,
"learning_rate": 7.602617129348747e-05,
"loss": 0.0108,
"step": 7190
},
{
"grad_norm": 0.3010900020599365,
"learning_rate": 7.595554520407088e-05,
"loss": 0.0111,
"step": 7200
},
{
"grad_norm": 0.2769547402858734,
"learning_rate": 7.588484815318484e-05,
"loss": 0.0126,
"step": 7210
},
{
"grad_norm": 0.28370919823646545,
"learning_rate": 7.581408033411234e-05,
"loss": 0.0101,
"step": 7220
},
{
"grad_norm": 0.32081568241119385,
"learning_rate": 7.574324194032995e-05,
"loss": 0.009,
"step": 7230
},
{
"grad_norm": 0.2977130711078644,
"learning_rate": 7.567233316550705e-05,
"loss": 0.012,
"step": 7240
},
{
"grad_norm": 0.22645479440689087,
"learning_rate": 7.560135420350562e-05,
"loss": 0.0087,
"step": 7250
},
{
"grad_norm": 0.3486950397491455,
"learning_rate": 7.553030524837935e-05,
"loss": 0.0184,
"step": 7260
},
{
"grad_norm": 0.3366019129753113,
"learning_rate": 7.545918649437341e-05,
"loss": 0.0109,
"step": 7270
},
{
"grad_norm": 0.3164430856704712,
"learning_rate": 7.538799813592377e-05,
"loss": 0.0121,
"step": 7280
},
{
"grad_norm": 0.30747735500335693,
"learning_rate": 7.531674036765662e-05,
"loss": 0.0127,
"step": 7290
},
{
"grad_norm": 0.2376401573419571,
"learning_rate": 7.524541338438807e-05,
"loss": 0.0085,
"step": 7300
},
{
"grad_norm": 0.263528436422348,
"learning_rate": 7.517401738112328e-05,
"loss": 0.0116,
"step": 7310
},
{
"grad_norm": 0.35192421078681946,
"learning_rate": 7.510255255305628e-05,
"loss": 0.0097,
"step": 7320
},
{
"grad_norm": 0.3433808386325836,
"learning_rate": 7.503101909556911e-05,
"loss": 0.0106,
"step": 7330
},
{
"grad_norm": 0.2994978129863739,
"learning_rate": 7.495941720423154e-05,
"loss": 0.0101,
"step": 7340
},
{
"grad_norm": 0.2600700557231903,
"learning_rate": 7.488774707480042e-05,
"loss": 0.0116,
"step": 7350
},
{
"grad_norm": 0.3085675537586212,
"learning_rate": 7.481600890321911e-05,
"loss": 0.0088,
"step": 7360
},
{
"grad_norm": 0.2523610591888428,
"learning_rate": 7.474420288561708e-05,
"loss": 0.0149,
"step": 7370
},
{
"grad_norm": 0.3608851432800293,
"learning_rate": 7.467232921830921e-05,
"loss": 0.015,
"step": 7380
},
{
"grad_norm": 0.27163684368133545,
"learning_rate": 7.460038809779537e-05,
"loss": 0.0092,
"step": 7390
},
{
"grad_norm": 0.3165534436702728,
"learning_rate": 7.452837972075983e-05,
"loss": 0.0094,
"step": 7400
},
{
"grad_norm": 0.2729543149471283,
"learning_rate": 7.445630428407074e-05,
"loss": 0.0093,
"step": 7410
},
{
"grad_norm": 0.35123759508132935,
"learning_rate": 7.43841619847796e-05,
"loss": 0.0106,
"step": 7420
},
{
"grad_norm": 0.2810732424259186,
"learning_rate": 7.431195302012072e-05,
"loss": 0.0097,
"step": 7430
},
{
"grad_norm": 0.3775671720504761,
"learning_rate": 7.423967758751061e-05,
"loss": 0.0119,
"step": 7440
},
{
"grad_norm": 0.2957786023616791,
"learning_rate": 7.416733588454758e-05,
"loss": 0.0129,
"step": 7450
},
{
"grad_norm": 0.3434288799762726,
"learning_rate": 7.409492810901106e-05,
"loss": 0.0132,
"step": 7460
},
{
"grad_norm": 0.35347649455070496,
"learning_rate": 7.402245445886116e-05,
"loss": 0.0147,
"step": 7470
},
{
"grad_norm": 0.3285079598426819,
"learning_rate": 7.394991513223806e-05,
"loss": 0.011,
"step": 7480
},
{
"grad_norm": 0.29947003722190857,
"learning_rate": 7.38773103274615e-05,
"loss": 0.0132,
"step": 7490
},
{
"grad_norm": 0.28526970744132996,
"learning_rate": 7.380464024303028e-05,
"loss": 0.0105,
"step": 7500
},
{
"grad_norm": 0.2930798828601837,
"learning_rate": 7.373190507762162e-05,
"loss": 0.0127,
"step": 7510
},
{
"grad_norm": 0.2921172082424164,
"learning_rate": 7.365910503009066e-05,
"loss": 0.0156,
"step": 7520
},
{
"grad_norm": 0.3323417007923126,
"learning_rate": 7.358624029946996e-05,
"loss": 0.0088,
"step": 7530
},
{
"grad_norm": 0.29670819640159607,
"learning_rate": 7.351331108496893e-05,
"loss": 0.0095,
"step": 7540
},
{
"grad_norm": 0.32244303822517395,
"learning_rate": 7.344031758597325e-05,
"loss": 0.0137,
"step": 7550
},
{
"grad_norm": 0.25546425580978394,
"learning_rate": 7.336726000204435e-05,
"loss": 0.0101,
"step": 7560
},
{
"grad_norm": 0.30756881833076477,
"learning_rate": 7.32941385329189e-05,
"loss": 0.0129,
"step": 7570
},
{
"grad_norm": 0.2749859690666199,
"learning_rate": 7.322095337850816e-05,
"loss": 0.0095,
"step": 7580
},
{
"grad_norm": 0.41934898495674133,
"learning_rate": 7.314770473889758e-05,
"loss": 0.0154,
"step": 7590
},
{
"grad_norm": 0.2750692665576935,
"learning_rate": 7.307439281434615e-05,
"loss": 0.0089,
"step": 7600
},
{
"grad_norm": 0.28263887763023376,
"learning_rate": 7.300101780528585e-05,
"loss": 0.0127,
"step": 7610
},
{
"grad_norm": 0.2647198438644409,
"learning_rate": 7.292757991232117e-05,
"loss": 0.0155,
"step": 7620
},
{
"grad_norm": 0.30357304215431213,
"learning_rate": 7.285407933622848e-05,
"loss": 0.0122,
"step": 7630
},
{
"grad_norm": 0.2601131796836853,
"learning_rate": 7.278051627795557e-05,
"loss": 0.0173,
"step": 7640
},
{
"grad_norm": 0.2693704664707184,
"learning_rate": 7.270689093862105e-05,
"loss": 0.0123,
"step": 7650
},
{
"grad_norm": 0.3310806453227997,
"learning_rate": 7.263320351951374e-05,
"loss": 0.009,
"step": 7660
},
{
"grad_norm": 0.2853841483592987,
"learning_rate": 7.255945422209227e-05,
"loss": 0.0104,
"step": 7670
},
{
"grad_norm": 0.19990304112434387,
"learning_rate": 7.248564324798437e-05,
"loss": 0.0105,
"step": 7680
},
{
"grad_norm": 0.20870745182037354,
"learning_rate": 7.241177079898644e-05,
"loss": 0.0126,
"step": 7690
},
{
"grad_norm": 0.29264724254608154,
"learning_rate": 7.233783707706295e-05,
"loss": 0.0108,
"step": 7700
},
{
"grad_norm": 0.26418036222457886,
"learning_rate": 7.226384228434586e-05,
"loss": 0.0121,
"step": 7710
},
{
"grad_norm": 0.19485041499137878,
"learning_rate": 7.21897866231341e-05,
"loss": 0.0136,
"step": 7720
},
{
"grad_norm": 0.21631906926631927,
"learning_rate": 7.211567029589303e-05,
"loss": 0.0103,
"step": 7730
},
{
"grad_norm": 0.2685507833957672,
"learning_rate": 7.204149350525387e-05,
"loss": 0.0088,
"step": 7740
},
{
"grad_norm": 0.21937017142772675,
"learning_rate": 7.196725645401309e-05,
"loss": 0.0118,
"step": 7750
},
{
"grad_norm": 0.40596023201942444,
"learning_rate": 7.1892959345132e-05,
"loss": 0.0106,
"step": 7760
},
{
"grad_norm": 0.3677843511104584,
"learning_rate": 7.181860238173605e-05,
"loss": 0.0155,
"step": 7770
},
{
"grad_norm": 0.3440069854259491,
"learning_rate": 7.174418576711432e-05,
"loss": 0.012,
"step": 7780
},
{
"grad_norm": 0.3115421533584595,
"learning_rate": 7.1669709704719e-05,
"loss": 0.0129,
"step": 7790
},
{
"grad_norm": 0.2567780315876007,
"learning_rate": 7.159517439816481e-05,
"loss": 0.0186,
"step": 7800
},
{
"grad_norm": 0.26565659046173096,
"learning_rate": 7.152058005122842e-05,
"loss": 0.0117,
"step": 7810
},
{
"grad_norm": 0.2598077654838562,
"learning_rate": 7.144592686784793e-05,
"loss": 0.0105,
"step": 7820
},
{
"grad_norm": 0.3031388819217682,
"learning_rate": 7.137121505212229e-05,
"loss": 0.0103,
"step": 7830
},
{
"grad_norm": 0.334942102432251,
"learning_rate": 7.129644480831077e-05,
"loss": 0.0198,
"step": 7840
},
{
"grad_norm": 0.28533506393432617,
"learning_rate": 7.122161634083234e-05,
"loss": 0.0103,
"step": 7850
},
{
"grad_norm": 0.27979883551597595,
"learning_rate": 7.114672985426516e-05,
"loss": 0.0097,
"step": 7860
},
{
"grad_norm": 0.21115346252918243,
"learning_rate": 7.107178555334606e-05,
"loss": 0.0151,
"step": 7870
},
{
"grad_norm": 0.36340436339378357,
"learning_rate": 7.099678364296989e-05,
"loss": 0.0115,
"step": 7880
},
{
"grad_norm": 0.21899573504924774,
"learning_rate": 7.0921724328189e-05,
"loss": 0.0102,
"step": 7890
},
{
"grad_norm": 0.3320227861404419,
"learning_rate": 7.084660781421268e-05,
"loss": 0.0132,
"step": 7900
},
{
"grad_norm": 0.27939140796661377,
"learning_rate": 7.077143430640662e-05,
"loss": 0.0119,
"step": 7910
},
{
"grad_norm": 0.36499500274658203,
"learning_rate": 7.069620401029232e-05,
"loss": 0.0099,
"step": 7920
},
{
"grad_norm": 0.3523150682449341,
"learning_rate": 7.062091713154655e-05,
"loss": 0.0093,
"step": 7930
},
{
"grad_norm": 0.2645350396633148,
"learning_rate": 7.054557387600075e-05,
"loss": 0.0097,
"step": 7940
},
{
"grad_norm": 0.20298346877098083,
"learning_rate": 7.04701744496405e-05,
"loss": 0.0086,
"step": 7950
},
{
"grad_norm": 0.20196221768856049,
"learning_rate": 7.039471905860495e-05,
"loss": 0.0096,
"step": 7960
},
{
"grad_norm": 0.25855553150177,
"learning_rate": 7.031920790918628e-05,
"loss": 0.0097,
"step": 7970
},
{
"grad_norm": 0.28350019454956055,
"learning_rate": 7.024364120782906e-05,
"loss": 0.0105,
"step": 7980
},
{
"grad_norm": 0.28542742133140564,
"learning_rate": 7.016801916112978e-05,
"loss": 0.0137,
"step": 7990
},
{
"grad_norm": 0.24646037817001343,
"learning_rate": 7.009234197583623e-05,
"loss": 0.0103,
"step": 8000
},
{
"grad_norm": 0.26659995317459106,
"learning_rate": 7.001660985884692e-05,
"loss": 0.0088,
"step": 8010
},
{
"grad_norm": 0.3342621624469757,
"learning_rate": 6.994082301721063e-05,
"loss": 0.0114,
"step": 8020
},
{
"grad_norm": 0.2519735097885132,
"learning_rate": 6.986498165812563e-05,
"loss": 0.009,
"step": 8030
},
{
"grad_norm": 0.25942492485046387,
"learning_rate": 6.978908598893932e-05,
"loss": 0.0089,
"step": 8040
},
{
"grad_norm": 0.2715606391429901,
"learning_rate": 6.971313621714756e-05,
"loss": 0.0129,
"step": 8050
},
{
"grad_norm": 0.28495460748672485,
"learning_rate": 6.96371325503941e-05,
"loss": 0.0086,
"step": 8060
},
{
"grad_norm": 0.2748803198337555,
"learning_rate": 6.956107519647014e-05,
"loss": 0.0079,
"step": 8070
},
{
"grad_norm": 0.19981186091899872,
"learning_rate": 6.94849643633135e-05,
"loss": 0.011,
"step": 8080
},
{
"grad_norm": 0.25709986686706543,
"learning_rate": 6.940880025900834e-05,
"loss": 0.0091,
"step": 8090
},
{
"grad_norm": 0.31318721175193787,
"learning_rate": 6.933258309178438e-05,
"loss": 0.0128,
"step": 8100
},
{
"grad_norm": 0.2338728904724121,
"learning_rate": 6.925631307001646e-05,
"loss": 0.01,
"step": 8110
},
{
"grad_norm": 0.2807973027229309,
"learning_rate": 6.91799904022239e-05,
"loss": 0.0117,
"step": 8120
},
{
"grad_norm": 0.16702820360660553,
"learning_rate": 6.910361529706997e-05,
"loss": 0.0081,
"step": 8130
},
{
"grad_norm": 0.2894277572631836,
"learning_rate": 6.902718796336131e-05,
"loss": 0.0086,
"step": 8140
},
{
"grad_norm": 0.3167431950569153,
"learning_rate": 6.895070861004729e-05,
"loss": 0.0099,
"step": 8150
},
{
"grad_norm": 0.35957372188568115,
"learning_rate": 6.887417744621956e-05,
"loss": 0.0103,
"step": 8160
},
{
"grad_norm": 0.2503855526447296,
"learning_rate": 6.87975946811114e-05,
"loss": 0.0088,
"step": 8170
},
{
"grad_norm": 0.24923115968704224,
"learning_rate": 6.872096052409718e-05,
"loss": 0.0101,
"step": 8180
},
{
"grad_norm": 0.27979594469070435,
"learning_rate": 6.864427518469174e-05,
"loss": 0.0088,
"step": 8190
},
{
"grad_norm": 0.22051957249641418,
"learning_rate": 6.856753887254986e-05,
"loss": 0.009,
"step": 8200
},
{
"grad_norm": 0.2485746592283249,
"learning_rate": 6.849075179746572e-05,
"loss": 0.0132,
"step": 8210
},
{
"grad_norm": 0.29662564396858215,
"learning_rate": 6.841391416937221e-05,
"loss": 0.0109,
"step": 8220
},
{
"grad_norm": 0.22791483998298645,
"learning_rate": 6.833702619834053e-05,
"loss": 0.0119,
"step": 8230
},
{
"grad_norm": 0.2398047000169754,
"learning_rate": 6.82600880945794e-05,
"loss": 0.0114,
"step": 8240
},
{
"grad_norm": 0.28467315435409546,
"learning_rate": 6.818310006843468e-05,
"loss": 0.0109,
"step": 8250
},
{
"grad_norm": 0.39719972014427185,
"learning_rate": 6.810606233038868e-05,
"loss": 0.0125,
"step": 8260
},
{
"grad_norm": 0.367841899394989,
"learning_rate": 6.802897509105966e-05,
"loss": 0.0093,
"step": 8270
},
{
"grad_norm": 0.28418073058128357,
"learning_rate": 6.79518385612012e-05,
"loss": 0.011,
"step": 8280
},
{
"grad_norm": 0.33649322390556335,
"learning_rate": 6.787465295170157e-05,
"loss": 0.0111,
"step": 8290
},
{
"grad_norm": 0.20563358068466187,
"learning_rate": 6.779741847358332e-05,
"loss": 0.0101,
"step": 8300
},
{
"grad_norm": 0.2554636299610138,
"learning_rate": 6.772013533800256e-05,
"loss": 0.0078,
"step": 8310
},
{
"grad_norm": 0.35546278953552246,
"learning_rate": 6.764280375624843e-05,
"loss": 0.0112,
"step": 8320
},
{
"grad_norm": 0.33014950156211853,
"learning_rate": 6.756542393974252e-05,
"loss": 0.0136,
"step": 8330
},
{
"grad_norm": 0.34563302993774414,
"learning_rate": 6.748799610003828e-05,
"loss": 0.0111,
"step": 8340
},
{
"grad_norm": 0.28476187586784363,
"learning_rate": 6.741052044882048e-05,
"loss": 0.0133,
"step": 8350
},
{
"grad_norm": 0.21680086851119995,
"learning_rate": 6.73329971979046e-05,
"loss": 0.01,
"step": 8360
},
{
"grad_norm": 0.24411574006080627,
"learning_rate": 6.725542655923625e-05,
"loss": 0.0101,
"step": 8370
},
{
"grad_norm": 0.2673936188220978,
"learning_rate": 6.717780874489057e-05,
"loss": 0.0142,
"step": 8380
},
{
"grad_norm": 0.22214150428771973,
"learning_rate": 6.710014396707172e-05,
"loss": 0.0081,
"step": 8390
},
{
"grad_norm": 0.21424426138401031,
"learning_rate": 6.702243243811221e-05,
"loss": 0.0094,
"step": 8400
},
{
"grad_norm": 0.326800137758255,
"learning_rate": 6.694467437047244e-05,
"loss": 0.0109,
"step": 8410
},
{
"grad_norm": 0.33343732357025146,
"learning_rate": 6.686686997673997e-05,
"loss": 0.0073,
"step": 8420
},
{
"grad_norm": 0.2529543936252594,
"learning_rate": 6.678901946962903e-05,
"loss": 0.0091,
"step": 8430
},
{
"grad_norm": 0.33415693044662476,
"learning_rate": 6.671112306197996e-05,
"loss": 0.0093,
"step": 8440
},
{
"grad_norm": 0.21618202328681946,
"learning_rate": 6.663318096675854e-05,
"loss": 0.0109,
"step": 8450
},
{
"grad_norm": 0.17679205536842346,
"learning_rate": 6.655519339705552e-05,
"loss": 0.0081,
"step": 8460
},
{
"grad_norm": 0.31180447340011597,
"learning_rate": 6.647716056608588e-05,
"loss": 0.0075,
"step": 8470
},
{
"grad_norm": 0.3059850335121155,
"learning_rate": 6.639908268718843e-05,
"loss": 0.0104,
"step": 8480
},
{
"grad_norm": 0.2953120768070221,
"learning_rate": 6.632095997382514e-05,
"loss": 0.0136,
"step": 8490
},
{
"grad_norm": 0.26256608963012695,
"learning_rate": 6.624279263958047e-05,
"loss": 0.0093,
"step": 8500
},
{
"grad_norm": 0.31494539976119995,
"learning_rate": 6.616458089816097e-05,
"loss": 0.0106,
"step": 8510
},
{
"grad_norm": 0.376949280500412,
"learning_rate": 6.608632496339454e-05,
"loss": 0.0099,
"step": 8520
},
{
"grad_norm": 0.2433367669582367,
"learning_rate": 6.600802504922988e-05,
"loss": 0.0102,
"step": 8530
},
{
"grad_norm": 0.37388041615486145,
"learning_rate": 6.592968136973604e-05,
"loss": 0.0105,
"step": 8540
},
{
"grad_norm": 0.28184112906455994,
"learning_rate": 6.585129413910159e-05,
"loss": 0.0097,
"step": 8550
},
{
"grad_norm": 0.2922300696372986,
"learning_rate": 6.577286357163424e-05,
"loss": 0.01,
"step": 8560
},
{
"grad_norm": 0.2782540023326874,
"learning_rate": 6.569438988176018e-05,
"loss": 0.0082,
"step": 8570
},
{
"grad_norm": 0.27248328924179077,
"learning_rate": 6.561587328402347e-05,
"loss": 0.0088,
"step": 8580
},
{
"grad_norm": 0.24880534410476685,
"learning_rate": 6.553731399308549e-05,
"loss": 0.0079,
"step": 8590
},
{
"grad_norm": 0.2515351176261902,
"learning_rate": 6.545871222372436e-05,
"loss": 0.0073,
"step": 8600
},
{
"grad_norm": 0.18304027616977692,
"learning_rate": 6.538006819083426e-05,
"loss": 0.0071,
"step": 8610
},
{
"grad_norm": 0.3344535231590271,
"learning_rate": 6.530138210942505e-05,
"loss": 0.0101,
"step": 8620
},
{
"grad_norm": 0.2146245390176773,
"learning_rate": 6.522265419462141e-05,
"loss": 0.0121,
"step": 8630
},
{
"grad_norm": 0.24218611419200897,
"learning_rate": 6.514388466166248e-05,
"loss": 0.0096,
"step": 8640
},
{
"grad_norm": 0.16334691643714905,
"learning_rate": 6.506507372590119e-05,
"loss": 0.0079,
"step": 8650
},
{
"grad_norm": 0.33363232016563416,
"learning_rate": 6.498622160280355e-05,
"loss": 0.0098,
"step": 8660
},
{
"grad_norm": 0.35225972533226013,
"learning_rate": 6.490732850794832e-05,
"loss": 0.0103,
"step": 8670
},
{
"grad_norm": 0.3227727711200714,
"learning_rate": 6.482839465702616e-05,
"loss": 0.0107,
"step": 8680
},
{
"grad_norm": 0.2620507776737213,
"learning_rate": 6.474942026583923e-05,
"loss": 0.0104,
"step": 8690
},
{
"grad_norm": 0.2854481041431427,
"learning_rate": 6.467040555030052e-05,
"loss": 0.0153,
"step": 8700
},
{
"grad_norm": 0.24487437307834625,
"learning_rate": 6.459135072643321e-05,
"loss": 0.0099,
"step": 8710
},
{
"grad_norm": 0.23461481928825378,
"learning_rate": 6.451225601037019e-05,
"loss": 0.0109,
"step": 8720
},
{
"grad_norm": 0.2263409048318863,
"learning_rate": 6.443312161835338e-05,
"loss": 0.0105,
"step": 8730
},
{
"grad_norm": 0.3112694025039673,
"learning_rate": 6.43539477667332e-05,
"loss": 0.0116,
"step": 8740
},
{
"grad_norm": 0.2899706959724426,
"learning_rate": 6.427473467196793e-05,
"loss": 0.0109,
"step": 8750
},
{
"grad_norm": 0.319865882396698,
"learning_rate": 6.419548255062315e-05,
"loss": 0.0142,
"step": 8760
},
{
"grad_norm": 0.2569391429424286,
"learning_rate": 6.411619161937112e-05,
"loss": 0.0185,
"step": 8770
},
{
"grad_norm": 0.2740200161933899,
"learning_rate": 6.403686209499022e-05,
"loss": 0.0101,
"step": 8780
},
{
"grad_norm": 0.29832252860069275,
"learning_rate": 6.395749419436437e-05,
"loss": 0.0094,
"step": 8790
},
{
"grad_norm": 0.275097519159317,
"learning_rate": 6.387808813448234e-05,
"loss": 0.0106,
"step": 8800
},
{
"grad_norm": 0.32685816287994385,
"learning_rate": 6.37986441324373e-05,
"loss": 0.0086,
"step": 8810
},
{
"grad_norm": 0.28648194670677185,
"learning_rate": 6.37191624054261e-05,
"loss": 0.0117,
"step": 8820
},
{
"grad_norm": 0.2401561737060547,
"learning_rate": 6.363964317074872e-05,
"loss": 0.0081,
"step": 8830
},
{
"grad_norm": 0.2832534909248352,
"learning_rate": 6.356008664580776e-05,
"loss": 0.0136,
"step": 8840
},
{
"grad_norm": 0.2052382379770279,
"learning_rate": 6.348049304810771e-05,
"loss": 0.0097,
"step": 8850
},
{
"grad_norm": 0.3278440833091736,
"learning_rate": 6.340086259525442e-05,
"loss": 0.0083,
"step": 8860
},
{
"grad_norm": 0.34554144740104675,
"learning_rate": 6.332119550495448e-05,
"loss": 0.0098,
"step": 8870
},
{
"grad_norm": 0.2610031068325043,
"learning_rate": 6.324149199501473e-05,
"loss": 0.01,
"step": 8880
},
{
"grad_norm": 0.22511707246303558,
"learning_rate": 6.316175228334146e-05,
"loss": 0.0092,
"step": 8890
},
{
"grad_norm": 0.2637081742286682,
"learning_rate": 6.308197658794003e-05,
"loss": 0.0128,
"step": 8900
},
{
"grad_norm": 0.31135818362236023,
"learning_rate": 6.300216512691417e-05,
"loss": 0.008,
"step": 8910
},
{
"grad_norm": 0.23880526423454285,
"learning_rate": 6.292231811846532e-05,
"loss": 0.0101,
"step": 8920
},
{
"grad_norm": 0.25867730379104614,
"learning_rate": 6.284243578089217e-05,
"loss": 0.0088,
"step": 8930
},
{
"grad_norm": 0.26295626163482666,
"learning_rate": 6.276251833258999e-05,
"loss": 0.0081,
"step": 8940
},
{
"grad_norm": 0.28615128993988037,
"learning_rate": 6.268256599205003e-05,
"loss": 0.0104,
"step": 8950
},
{
"grad_norm": 0.292758971452713,
"learning_rate": 6.260257897785892e-05,
"loss": 0.009,
"step": 8960
},
{
"grad_norm": 0.2305050492286682,
"learning_rate": 6.252255750869811e-05,
"loss": 0.01,
"step": 8970
},
{
"grad_norm": 0.2941057085990906,
"learning_rate": 6.244250180334325e-05,
"loss": 0.0136,
"step": 8980
},
{
"grad_norm": 0.3271690607070923,
"learning_rate": 6.236241208066356e-05,
"loss": 0.0111,
"step": 8990
},
{
"grad_norm": 0.2185642123222351,
"learning_rate": 6.228228855962133e-05,
"loss": 0.0074,
"step": 9000
},
{
"grad_norm": 0.26744088530540466,
"learning_rate": 6.220213145927115e-05,
"loss": 0.0067,
"step": 9010
},
{
"grad_norm": 0.20686663687229156,
"learning_rate": 6.212194099875951e-05,
"loss": 0.0094,
"step": 9020
},
{
"grad_norm": 0.2725589871406555,
"learning_rate": 6.204171739732405e-05,
"loss": 0.0083,
"step": 9030
},
{
"grad_norm": 0.3449211120605469,
"learning_rate": 6.196146087429303e-05,
"loss": 0.0085,
"step": 9040
},
{
"grad_norm": 0.24136734008789062,
"learning_rate": 6.188117164908474e-05,
"loss": 0.0094,
"step": 9050
},
{
"grad_norm": 0.2191270887851715,
"learning_rate": 6.180084994120684e-05,
"loss": 0.011,
"step": 9060
},
{
"grad_norm": 0.4127255082130432,
"learning_rate": 6.17204959702558e-05,
"loss": 0.0115,
"step": 9070
},
{
"grad_norm": 0.25778821110725403,
"learning_rate": 6.164010995591635e-05,
"loss": 0.0073,
"step": 9080
},
{
"grad_norm": 0.19585411250591278,
"learning_rate": 6.155969211796076e-05,
"loss": 0.0093,
"step": 9090
},
{
"grad_norm": 0.1971423476934433,
"learning_rate": 6.147924267624829e-05,
"loss": 0.0065,
"step": 9100
},
{
"grad_norm": 0.18513402342796326,
"learning_rate": 6.13987618507247e-05,
"loss": 0.0082,
"step": 9110
},
{
"grad_norm": 0.30468112230300903,
"learning_rate": 6.131824986142147e-05,
"loss": 0.0108,
"step": 9120
},
{
"grad_norm": 0.2643079161643982,
"learning_rate": 6.123770692845529e-05,
"loss": 0.0081,
"step": 9130
},
{
"grad_norm": 0.35840684175491333,
"learning_rate": 6.11571332720275e-05,
"loss": 0.0081,
"step": 9140
},
{
"grad_norm": 0.34239524602890015,
"learning_rate": 6.107652911242336e-05,
"loss": 0.0118,
"step": 9150
},
{
"grad_norm": 0.3136473596096039,
"learning_rate": 6.0995894670011586e-05,
"loss": 0.0128,
"step": 9160
},
{
"grad_norm": 0.3185141980648041,
"learning_rate": 6.091523016524368e-05,
"loss": 0.0133,
"step": 9170
},
{
"grad_norm": 0.2437521517276764,
"learning_rate": 6.083453581865328e-05,
"loss": 0.0116,
"step": 9180
},
{
"grad_norm": 0.24852575361728668,
"learning_rate": 6.075381185085568e-05,
"loss": 0.0101,
"step": 9190
},
{
"grad_norm": 0.21035079658031464,
"learning_rate": 6.067305848254709e-05,
"loss": 0.0089,
"step": 9200
},
{
"grad_norm": 0.19124074280261993,
"learning_rate": 6.059227593450418e-05,
"loss": 0.01,
"step": 9210
},
{
"grad_norm": 0.22028234601020813,
"learning_rate": 6.051146442758333e-05,
"loss": 0.0128,
"step": 9220
},
{
"grad_norm": 0.2855907082557678,
"learning_rate": 6.043062418272012e-05,
"loss": 0.0103,
"step": 9230
},
{
"grad_norm": 0.23253290355205536,
"learning_rate": 6.0349755420928666e-05,
"loss": 0.0075,
"step": 9240
},
{
"grad_norm": 0.22832125425338745,
"learning_rate": 6.0268858363301105e-05,
"loss": 0.0074,
"step": 9250
},
{
"grad_norm": 0.22071580588817596,
"learning_rate": 6.018793323100689e-05,
"loss": 0.0106,
"step": 9260
},
{
"grad_norm": 0.3454406261444092,
"learning_rate": 6.0106980245292255e-05,
"loss": 0.011,
"step": 9270
},
{
"grad_norm": 0.3467009663581848,
"learning_rate": 6.002599962747957e-05,
"loss": 0.0087,
"step": 9280
},
{
"grad_norm": 0.2289619743824005,
"learning_rate": 5.994499159896673e-05,
"loss": 0.0068,
"step": 9290
},
{
"grad_norm": 0.2502879202365875,
"learning_rate": 5.9863956381226607e-05,
"loss": 0.0138,
"step": 9300
},
{
"grad_norm": 0.23016954958438873,
"learning_rate": 5.9782894195806394e-05,
"loss": 0.0088,
"step": 9310
},
{
"grad_norm": 0.3265341520309448,
"learning_rate": 5.9701805264327004e-05,
"loss": 0.0122,
"step": 9320
},
{
"grad_norm": 0.2779223322868347,
"learning_rate": 5.96206898084825e-05,
"loss": 0.0073,
"step": 9330
},
{
"grad_norm": 0.20430560410022736,
"learning_rate": 5.953954805003942e-05,
"loss": 0.0106,
"step": 9340
},
{
"grad_norm": 0.22642415761947632,
"learning_rate": 5.945838021083623e-05,
"loss": 0.0083,
"step": 9350
},
{
"grad_norm": 0.22150662541389465,
"learning_rate": 5.9377186512782714e-05,
"loss": 0.0065,
"step": 9360
},
{
"grad_norm": 0.364218145608902,
"learning_rate": 5.929596717785935e-05,
"loss": 0.0089,
"step": 9370
},
{
"grad_norm": 0.24294275045394897,
"learning_rate": 5.921472242811668e-05,
"loss": 0.0079,
"step": 9380
},
{
"grad_norm": 0.23857471346855164,
"learning_rate": 5.913345248567475e-05,
"loss": 0.0125,
"step": 9390
},
{
"grad_norm": 0.17391999065876007,
"learning_rate": 5.905215757272248e-05,
"loss": 0.0162,
"step": 9400
},
{
"grad_norm": 0.2742446959018707,
"learning_rate": 5.897083791151706e-05,
"loss": 0.009,
"step": 9410
},
{
"grad_norm": 0.27407529950141907,
"learning_rate": 5.888949372438336e-05,
"loss": 0.0104,
"step": 9420
},
{
"grad_norm": 0.1971682459115982,
"learning_rate": 5.8808125233713255e-05,
"loss": 0.008,
"step": 9430
},
{
"grad_norm": 0.17385496199131012,
"learning_rate": 5.872673266196509e-05,
"loss": 0.007,
"step": 9440
},
{
"grad_norm": 0.2608735263347626,
"learning_rate": 5.864531623166305e-05,
"loss": 0.0083,
"step": 9450
},
{
"grad_norm": 0.2396305799484253,
"learning_rate": 5.856387616539656e-05,
"loss": 0.0082,
"step": 9460
},
{
"grad_norm": 0.26889148354530334,
"learning_rate": 5.848241268581967e-05,
"loss": 0.0085,
"step": 9470
},
{
"grad_norm": 0.26665395498275757,
"learning_rate": 5.840092601565037e-05,
"loss": 0.0094,
"step": 9480
},
{
"grad_norm": 0.23885580897331238,
"learning_rate": 5.8319416377670144e-05,
"loss": 0.008,
"step": 9490
},
{
"grad_norm": 0.2632520794868469,
"learning_rate": 5.82378839947232e-05,
"loss": 0.0098,
"step": 9500
},
{
"grad_norm": 0.3209339678287506,
"learning_rate": 5.815632908971599e-05,
"loss": 0.0106,
"step": 9510
},
{
"grad_norm": 0.282398521900177,
"learning_rate": 5.80747518856165e-05,
"loss": 0.0099,
"step": 9520
},
{
"grad_norm": 0.3100825250148773,
"learning_rate": 5.799315260545367e-05,
"loss": 0.0134,
"step": 9530
},
{
"grad_norm": 0.2550257444381714,
"learning_rate": 5.791153147231686e-05,
"loss": 0.0135,
"step": 9540
},
{
"grad_norm": 0.3137185275554657,
"learning_rate": 5.782988870935509e-05,
"loss": 0.008,
"step": 9550
},
{
"grad_norm": 0.23910042643547058,
"learning_rate": 5.774822453977657e-05,
"loss": 0.0087,
"step": 9560
},
{
"grad_norm": 0.21105986833572388,
"learning_rate": 5.7666539186848036e-05,
"loss": 0.009,
"step": 9570
},
{
"grad_norm": 0.2725152373313904,
"learning_rate": 5.758483287389411e-05,
"loss": 0.0143,
"step": 9580
},
{
"grad_norm": 0.2424250990152359,
"learning_rate": 5.7503105824296735e-05,
"loss": 0.0122,
"step": 9590
},
{
"grad_norm": 0.20699156820774078,
"learning_rate": 5.742135826149453e-05,
"loss": 0.0092,
"step": 9600
},
{
"grad_norm": 0.19423116743564606,
"learning_rate": 5.7339590408982223e-05,
"loss": 0.0065,
"step": 9610
},
{
"grad_norm": 0.2932196259498596,
"learning_rate": 5.725780249031e-05,
"loss": 0.0091,
"step": 9620
},
{
"grad_norm": 0.3803527057170868,
"learning_rate": 5.717599472908292e-05,
"loss": 0.0109,
"step": 9630
},
{
"grad_norm": 0.3079898953437805,
"learning_rate": 5.7094167348960237e-05,
"loss": 0.0084,
"step": 9640
},
{
"grad_norm": 0.2345152646303177,
"learning_rate": 5.7012320573654945e-05,
"loss": 0.0093,
"step": 9650
},
{
"grad_norm": 0.19556953012943268,
"learning_rate": 5.693045462693295e-05,
"loss": 0.008,
"step": 9660
},
{
"grad_norm": 0.22584684193134308,
"learning_rate": 5.684856973261266e-05,
"loss": 0.0073,
"step": 9670
},
{
"grad_norm": 0.2571251094341278,
"learning_rate": 5.6766666114564215e-05,
"loss": 0.0099,
"step": 9680
},
{
"grad_norm": 0.3253817856311798,
"learning_rate": 5.668474399670899e-05,
"loss": 0.0089,
"step": 9690
},
{
"grad_norm": 0.23601661622524261,
"learning_rate": 5.660280360301896e-05,
"loss": 0.0085,
"step": 9700
},
{
"grad_norm": 0.18850122392177582,
"learning_rate": 5.652084515751599e-05,
"loss": 0.0063,
"step": 9710
},
{
"grad_norm": 0.2345300316810608,
"learning_rate": 5.643886888427137e-05,
"loss": 0.009,
"step": 9720
},
{
"grad_norm": 0.21616721153259277,
"learning_rate": 5.6356875007405074e-05,
"loss": 0.0105,
"step": 9730
},
{
"grad_norm": 0.23095466196537018,
"learning_rate": 5.627486375108525e-05,
"loss": 0.0113,
"step": 9740
},
{
"grad_norm": 0.20307700335979462,
"learning_rate": 5.619283533952754e-05,
"loss": 0.0098,
"step": 9750
},
{
"grad_norm": 0.3022615313529968,
"learning_rate": 5.6110789996994474e-05,
"loss": 0.0087,
"step": 9760
},
{
"grad_norm": 0.2918921411037445,
"learning_rate": 5.602872794779491e-05,
"loss": 0.0115,
"step": 9770
},
{
"grad_norm": 0.3074280023574829,
"learning_rate": 5.594664941628334e-05,
"loss": 0.0087,
"step": 9780
},
{
"grad_norm": 0.2652987241744995,
"learning_rate": 5.5864554626859324e-05,
"loss": 0.0102,
"step": 9790
},
{
"grad_norm": 0.2364000827074051,
"learning_rate": 5.578244380396691e-05,
"loss": 0.0071,
"step": 9800
},
{
"grad_norm": 0.25912168622016907,
"learning_rate": 5.570031717209394e-05,
"loss": 0.0073,
"step": 9810
},
{
"grad_norm": 0.2132926732301712,
"learning_rate": 5.561817495577147e-05,
"loss": 0.0081,
"step": 9820
},
{
"grad_norm": 0.3217675983905792,
"learning_rate": 5.5536017379573215e-05,
"loss": 0.0095,
"step": 9830
},
{
"grad_norm": 0.21416251361370087,
"learning_rate": 5.545384466811483e-05,
"loss": 0.0088,
"step": 9840
},
{
"grad_norm": 0.30158206820487976,
"learning_rate": 5.5371657046053384e-05,
"loss": 0.0127,
"step": 9850
},
{
"grad_norm": 0.17979058623313904,
"learning_rate": 5.528945473808669e-05,
"loss": 0.0083,
"step": 9860
},
{
"grad_norm": 0.1995510756969452,
"learning_rate": 5.520723796895272e-05,
"loss": 0.0063,
"step": 9870
},
{
"grad_norm": 0.21220991015434265,
"learning_rate": 5.512500696342897e-05,
"loss": 0.0077,
"step": 9880
},
{
"grad_norm": 0.3290112316608429,
"learning_rate": 5.504276194633188e-05,
"loss": 0.0107,
"step": 9890
},
{
"grad_norm": 0.28488659858703613,
"learning_rate": 5.49605031425162e-05,
"loss": 0.0081,
"step": 9900
},
{
"grad_norm": 0.32763949036598206,
"learning_rate": 5.487823077687434e-05,
"loss": 0.0135,
"step": 9910
},
{
"grad_norm": 0.23580661416053772,
"learning_rate": 5.4795945074335806e-05,
"loss": 0.0094,
"step": 9920
},
{
"grad_norm": 0.22253672778606415,
"learning_rate": 5.471364625986657e-05,
"loss": 0.008,
"step": 9930
},
{
"grad_norm": 0.20649607479572296,
"learning_rate": 5.463133455846845e-05,
"loss": 0.0064,
"step": 9940
},
{
"grad_norm": 0.21485736966133118,
"learning_rate": 5.4549010195178505e-05,
"loss": 0.0142,
"step": 9950
},
{
"grad_norm": 0.22330300509929657,
"learning_rate": 5.446667339506838e-05,
"loss": 0.008,
"step": 9960
},
{
"grad_norm": 0.2618495523929596,
"learning_rate": 5.4384324383243756e-05,
"loss": 0.0099,
"step": 9970
},
{
"grad_norm": 0.28015658259391785,
"learning_rate": 5.430196338484368e-05,
"loss": 0.011,
"step": 9980
},
{
"grad_norm": 0.20648691058158875,
"learning_rate": 5.4219590625039975e-05,
"loss": 0.0083,
"step": 9990
},
{
"grad_norm": 0.22049671411514282,
"learning_rate": 5.413720632903664e-05,
"loss": 0.008,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}