e-zorzi's picture
Add files using upload-large-folder tool
4d172a3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"grad_norm": 1.3594739437103271,
"learning_rate": 9e-07,
"loss": 1.1913,
"step": 10
},
{
"grad_norm": 1.0572824478149414,
"learning_rate": 1.9e-06,
"loss": 1.1841,
"step": 20
},
{
"grad_norm": 0.5717663764953613,
"learning_rate": 2.9e-06,
"loss": 1.1508,
"step": 30
},
{
"grad_norm": 0.3898443877696991,
"learning_rate": 3.9e-06,
"loss": 1.1205,
"step": 40
},
{
"grad_norm": 0.28664326667785645,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.0888,
"step": 50
},
{
"grad_norm": 0.1729290783405304,
"learning_rate": 5.9e-06,
"loss": 1.0782,
"step": 60
},
{
"grad_norm": 0.17002208530902863,
"learning_rate": 6.900000000000001e-06,
"loss": 1.0691,
"step": 70
},
{
"grad_norm": 0.2152942717075348,
"learning_rate": 7.9e-06,
"loss": 1.0562,
"step": 80
},
{
"grad_norm": 0.19103780388832092,
"learning_rate": 8.9e-06,
"loss": 1.0479,
"step": 90
},
{
"grad_norm": 0.3243984878063202,
"learning_rate": 9.900000000000002e-06,
"loss": 1.0372,
"step": 100
},
{
"grad_norm": 0.1820673942565918,
"learning_rate": 1.09e-05,
"loss": 1.0272,
"step": 110
},
{
"grad_norm": 0.21819084882736206,
"learning_rate": 1.19e-05,
"loss": 1.0236,
"step": 120
},
{
"grad_norm": 0.20377595722675323,
"learning_rate": 1.29e-05,
"loss": 1.0237,
"step": 130
},
{
"grad_norm": 0.20572194457054138,
"learning_rate": 1.3900000000000002e-05,
"loss": 1.0228,
"step": 140
},
{
"grad_norm": 0.20157840847969055,
"learning_rate": 1.49e-05,
"loss": 1.0217,
"step": 150
},
{
"grad_norm": 0.23459017276763916,
"learning_rate": 1.59e-05,
"loss": 1.0192,
"step": 160
},
{
"grad_norm": 0.32469043135643005,
"learning_rate": 1.69e-05,
"loss": 1.0063,
"step": 170
},
{
"grad_norm": 0.36008527874946594,
"learning_rate": 1.79e-05,
"loss": 0.9873,
"step": 180
},
{
"grad_norm": 0.5633573532104492,
"learning_rate": 1.8900000000000002e-05,
"loss": 0.9672,
"step": 190
},
{
"grad_norm": 0.7019369006156921,
"learning_rate": 1.9900000000000003e-05,
"loss": 0.9315,
"step": 200
},
{
"grad_norm": 0.5538105964660645,
"learning_rate": 2.09e-05,
"loss": 0.8958,
"step": 210
},
{
"grad_norm": 0.5306029319763184,
"learning_rate": 2.19e-05,
"loss": 0.8707,
"step": 220
},
{
"grad_norm": 0.6606974005699158,
"learning_rate": 2.29e-05,
"loss": 0.8479,
"step": 230
},
{
"grad_norm": 0.8058410882949829,
"learning_rate": 2.39e-05,
"loss": 0.8169,
"step": 240
},
{
"grad_norm": 0.7277475595474243,
"learning_rate": 2.4900000000000002e-05,
"loss": 0.77,
"step": 250
},
{
"grad_norm": 0.6617355942726135,
"learning_rate": 2.5900000000000003e-05,
"loss": 0.7456,
"step": 260
},
{
"grad_norm": 0.8156651258468628,
"learning_rate": 2.6900000000000003e-05,
"loss": 0.6984,
"step": 270
},
{
"grad_norm": 0.7090954780578613,
"learning_rate": 2.7900000000000004e-05,
"loss": 0.6774,
"step": 280
},
{
"grad_norm": 0.8667084574699402,
"learning_rate": 2.8899999999999998e-05,
"loss": 0.6429,
"step": 290
},
{
"grad_norm": 0.946596622467041,
"learning_rate": 2.9900000000000002e-05,
"loss": 0.6052,
"step": 300
},
{
"grad_norm": 0.8120863437652588,
"learning_rate": 3.09e-05,
"loss": 0.5681,
"step": 310
},
{
"grad_norm": 0.9630921483039856,
"learning_rate": 3.19e-05,
"loss": 0.5267,
"step": 320
},
{
"grad_norm": 0.9185823798179626,
"learning_rate": 3.29e-05,
"loss": 0.497,
"step": 330
},
{
"grad_norm": 0.9909350872039795,
"learning_rate": 3.3900000000000004e-05,
"loss": 0.4704,
"step": 340
},
{
"grad_norm": 0.7408623695373535,
"learning_rate": 3.49e-05,
"loss": 0.4463,
"step": 350
},
{
"grad_norm": 0.8417967557907104,
"learning_rate": 3.59e-05,
"loss": 0.4515,
"step": 360
},
{
"grad_norm": 0.9200495481491089,
"learning_rate": 3.69e-05,
"loss": 0.417,
"step": 370
},
{
"grad_norm": 1.146302342414856,
"learning_rate": 3.79e-05,
"loss": 0.3937,
"step": 380
},
{
"grad_norm": 1.0057293176651,
"learning_rate": 3.8900000000000004e-05,
"loss": 0.3773,
"step": 390
},
{
"grad_norm": 1.112216591835022,
"learning_rate": 3.99e-05,
"loss": 0.348,
"step": 400
},
{
"grad_norm": 1.0176512002944946,
"learning_rate": 4.09e-05,
"loss": 0.3392,
"step": 410
},
{
"grad_norm": 1.0310163497924805,
"learning_rate": 4.19e-05,
"loss": 0.3065,
"step": 420
},
{
"grad_norm": 1.022374153137207,
"learning_rate": 4.29e-05,
"loss": 0.2808,
"step": 430
},
{
"grad_norm": 1.368080735206604,
"learning_rate": 4.39e-05,
"loss": 0.2624,
"step": 440
},
{
"grad_norm": 1.1092591285705566,
"learning_rate": 4.49e-05,
"loss": 0.2405,
"step": 450
},
{
"grad_norm": 0.9738430380821228,
"learning_rate": 4.5900000000000004e-05,
"loss": 0.2254,
"step": 460
},
{
"grad_norm": 1.033246636390686,
"learning_rate": 4.69e-05,
"loss": 0.2162,
"step": 470
},
{
"grad_norm": 0.9855560064315796,
"learning_rate": 4.79e-05,
"loss": 0.2088,
"step": 480
},
{
"grad_norm": 1.0313360691070557,
"learning_rate": 4.89e-05,
"loss": 0.2188,
"step": 490
},
{
"grad_norm": 1.100176215171814,
"learning_rate": 4.99e-05,
"loss": 0.2007,
"step": 500
},
{
"grad_norm": 1.0784265995025635,
"learning_rate": 5.0900000000000004e-05,
"loss": 0.2016,
"step": 510
},
{
"grad_norm": 1.0822303295135498,
"learning_rate": 5.19e-05,
"loss": 0.1961,
"step": 520
},
{
"grad_norm": 1.067589282989502,
"learning_rate": 5.2900000000000005e-05,
"loss": 0.1801,
"step": 530
},
{
"grad_norm": 1.1917147636413574,
"learning_rate": 5.390000000000001e-05,
"loss": 0.1705,
"step": 540
},
{
"grad_norm": 1.3141072988510132,
"learning_rate": 5.4900000000000006e-05,
"loss": 0.1851,
"step": 550
},
{
"grad_norm": 1.002855658531189,
"learning_rate": 5.590000000000001e-05,
"loss": 0.1663,
"step": 560
},
{
"grad_norm": 1.167011022567749,
"learning_rate": 5.69e-05,
"loss": 0.1741,
"step": 570
},
{
"grad_norm": 1.0936863422393799,
"learning_rate": 5.79e-05,
"loss": 0.1661,
"step": 580
},
{
"grad_norm": 0.9669778347015381,
"learning_rate": 5.89e-05,
"loss": 0.1648,
"step": 590
},
{
"grad_norm": 0.9405611753463745,
"learning_rate": 5.99e-05,
"loss": 0.1627,
"step": 600
},
{
"grad_norm": 1.0284767150878906,
"learning_rate": 6.09e-05,
"loss": 0.1496,
"step": 610
},
{
"grad_norm": 1.1097605228424072,
"learning_rate": 6.19e-05,
"loss": 0.1628,
"step": 620
},
{
"grad_norm": 0.9104214310646057,
"learning_rate": 6.29e-05,
"loss": 0.1302,
"step": 630
},
{
"grad_norm": 0.8578998446464539,
"learning_rate": 6.390000000000001e-05,
"loss": 0.1326,
"step": 640
},
{
"grad_norm": 1.1287304162979126,
"learning_rate": 6.49e-05,
"loss": 0.1127,
"step": 650
},
{
"grad_norm": 0.8655268549919128,
"learning_rate": 6.59e-05,
"loss": 0.1202,
"step": 660
},
{
"grad_norm": 0.9937160015106201,
"learning_rate": 6.690000000000001e-05,
"loss": 0.1198,
"step": 670
},
{
"grad_norm": 0.9691420197486877,
"learning_rate": 6.790000000000001e-05,
"loss": 0.1096,
"step": 680
},
{
"grad_norm": 1.0945252180099487,
"learning_rate": 6.89e-05,
"loss": 0.105,
"step": 690
},
{
"grad_norm": 1.0388752222061157,
"learning_rate": 6.99e-05,
"loss": 0.1027,
"step": 700
},
{
"grad_norm": 0.881949245929718,
"learning_rate": 7.09e-05,
"loss": 0.1044,
"step": 710
},
{
"grad_norm": 0.8678519129753113,
"learning_rate": 7.19e-05,
"loss": 0.0842,
"step": 720
},
{
"grad_norm": 1.2314260005950928,
"learning_rate": 7.29e-05,
"loss": 0.0841,
"step": 730
},
{
"grad_norm": 0.7337191700935364,
"learning_rate": 7.390000000000001e-05,
"loss": 0.0771,
"step": 740
},
{
"grad_norm": 1.194354772567749,
"learning_rate": 7.49e-05,
"loss": 0.0791,
"step": 750
},
{
"grad_norm": 1.0703870058059692,
"learning_rate": 7.59e-05,
"loss": 0.0697,
"step": 760
},
{
"grad_norm": 0.9820927977561951,
"learning_rate": 7.69e-05,
"loss": 0.0798,
"step": 770
},
{
"grad_norm": 1.099042534828186,
"learning_rate": 7.790000000000001e-05,
"loss": 0.0736,
"step": 780
},
{
"grad_norm": 0.9056155681610107,
"learning_rate": 7.890000000000001e-05,
"loss": 0.0756,
"step": 790
},
{
"grad_norm": 0.8292648792266846,
"learning_rate": 7.99e-05,
"loss": 0.0796,
"step": 800
},
{
"grad_norm": 0.9507290720939636,
"learning_rate": 8.090000000000001e-05,
"loss": 0.0829,
"step": 810
},
{
"grad_norm": 0.9466397762298584,
"learning_rate": 8.19e-05,
"loss": 0.0688,
"step": 820
},
{
"grad_norm": 0.7956731915473938,
"learning_rate": 8.29e-05,
"loss": 0.0747,
"step": 830
},
{
"grad_norm": 0.7995853424072266,
"learning_rate": 8.39e-05,
"loss": 0.0634,
"step": 840
},
{
"grad_norm": 0.7665478587150574,
"learning_rate": 8.49e-05,
"loss": 0.0661,
"step": 850
},
{
"grad_norm": 0.9283880591392517,
"learning_rate": 8.59e-05,
"loss": 0.0702,
"step": 860
},
{
"grad_norm": 1.126967191696167,
"learning_rate": 8.69e-05,
"loss": 0.0716,
"step": 870
},
{
"grad_norm": 0.8662194609642029,
"learning_rate": 8.790000000000001e-05,
"loss": 0.0667,
"step": 880
},
{
"grad_norm": 0.9572857022285461,
"learning_rate": 8.89e-05,
"loss": 0.0791,
"step": 890
},
{
"grad_norm": 0.9036967158317566,
"learning_rate": 8.99e-05,
"loss": 0.0745,
"step": 900
},
{
"grad_norm": 0.7550048828125,
"learning_rate": 9.090000000000001e-05,
"loss": 0.0746,
"step": 910
},
{
"grad_norm": 0.9990408420562744,
"learning_rate": 9.190000000000001e-05,
"loss": 0.0648,
"step": 920
},
{
"grad_norm": 0.8286410570144653,
"learning_rate": 9.290000000000001e-05,
"loss": 0.0697,
"step": 930
},
{
"grad_norm": 0.9783310890197754,
"learning_rate": 9.39e-05,
"loss": 0.0749,
"step": 940
},
{
"grad_norm": 0.9899768233299255,
"learning_rate": 9.49e-05,
"loss": 0.0722,
"step": 950
},
{
"grad_norm": 0.7450554370880127,
"learning_rate": 9.59e-05,
"loss": 0.0599,
"step": 960
},
{
"grad_norm": 0.7791635394096375,
"learning_rate": 9.69e-05,
"loss": 0.0654,
"step": 970
},
{
"grad_norm": 0.7614015340805054,
"learning_rate": 9.790000000000001e-05,
"loss": 0.0558,
"step": 980
},
{
"grad_norm": 0.9096309542655945,
"learning_rate": 9.89e-05,
"loss": 0.0581,
"step": 990
},
{
"grad_norm": 0.668950080871582,
"learning_rate": 9.99e-05,
"loss": 0.0652,
"step": 1000
},
{
"grad_norm": 0.8658283948898315,
"learning_rate": 9.999994463727085e-05,
"loss": 0.0529,
"step": 1010
},
{
"grad_norm": 0.7495288848876953,
"learning_rate": 9.999975326009292e-05,
"loss": 0.059,
"step": 1020
},
{
"grad_norm": 0.9980189204216003,
"learning_rate": 9.999942518549879e-05,
"loss": 0.0638,
"step": 1030
},
{
"grad_norm": 0.7826606035232544,
"learning_rate": 9.999896041438544e-05,
"loss": 0.0546,
"step": 1040
},
{
"grad_norm": 0.6360778212547302,
"learning_rate": 9.999835894802353e-05,
"loss": 0.054,
"step": 1050
},
{
"grad_norm": 0.7757160067558289,
"learning_rate": 9.999762078805743e-05,
"loss": 0.0591,
"step": 1060
},
{
"grad_norm": 0.7390689849853516,
"learning_rate": 9.999674593650526e-05,
"loss": 0.0595,
"step": 1070
},
{
"grad_norm": 0.6460424065589905,
"learning_rate": 9.99957343957588e-05,
"loss": 0.0658,
"step": 1080
},
{
"grad_norm": 0.8082983493804932,
"learning_rate": 9.99945861685836e-05,
"loss": 0.0596,
"step": 1090
},
{
"grad_norm": 0.7415626645088196,
"learning_rate": 9.999330125811884e-05,
"loss": 0.0483,
"step": 1100
},
{
"grad_norm": 0.8829818367958069,
"learning_rate": 9.999187966787744e-05,
"loss": 0.0619,
"step": 1110
},
{
"grad_norm": 0.8239393830299377,
"learning_rate": 9.999032140174595e-05,
"loss": 0.0528,
"step": 1120
},
{
"grad_norm": 0.8529507517814636,
"learning_rate": 9.998862646398464e-05,
"loss": 0.0654,
"step": 1130
},
{
"grad_norm": 0.7502208948135376,
"learning_rate": 9.998679485922739e-05,
"loss": 0.0526,
"step": 1140
},
{
"grad_norm": 0.6970030069351196,
"learning_rate": 9.998482659248174e-05,
"loss": 0.0547,
"step": 1150
},
{
"grad_norm": 0.9376399517059326,
"learning_rate": 9.998272166912883e-05,
"loss": 0.0557,
"step": 1160
},
{
"grad_norm": 0.7249330282211304,
"learning_rate": 9.998048009492347e-05,
"loss": 0.0504,
"step": 1170
},
{
"grad_norm": 0.8968970775604248,
"learning_rate": 9.997810187599403e-05,
"loss": 0.0526,
"step": 1180
},
{
"grad_norm": 0.7676458358764648,
"learning_rate": 9.997558701884249e-05,
"loss": 0.0506,
"step": 1190
},
{
"grad_norm": 0.6501711010932922,
"learning_rate": 9.997293553034433e-05,
"loss": 0.061,
"step": 1200
},
{
"grad_norm": 0.677116870880127,
"learning_rate": 9.997014741774866e-05,
"loss": 0.0462,
"step": 1210
},
{
"grad_norm": 0.8147766590118408,
"learning_rate": 9.996722268867803e-05,
"loss": 0.0486,
"step": 1220
},
{
"grad_norm": 0.706069827079773,
"learning_rate": 9.996416135112858e-05,
"loss": 0.0511,
"step": 1230
},
{
"grad_norm": 0.6159539818763733,
"learning_rate": 9.996096341346988e-05,
"loss": 0.0492,
"step": 1240
},
{
"grad_norm": 0.6369336843490601,
"learning_rate": 9.995762888444495e-05,
"loss": 0.0479,
"step": 1250
},
{
"grad_norm": 0.7543830275535583,
"learning_rate": 9.995415777317027e-05,
"loss": 0.0493,
"step": 1260
},
{
"grad_norm": 0.7505154609680176,
"learning_rate": 9.995055008913574e-05,
"loss": 0.053,
"step": 1270
},
{
"grad_norm": 0.5397493243217468,
"learning_rate": 9.994680584220463e-05,
"loss": 0.0432,
"step": 1280
},
{
"grad_norm": 0.6707198619842529,
"learning_rate": 9.994292504261355e-05,
"loss": 0.0472,
"step": 1290
},
{
"grad_norm": 0.8792182803153992,
"learning_rate": 9.993890770097247e-05,
"loss": 0.0453,
"step": 1300
},
{
"grad_norm": 0.7324561476707458,
"learning_rate": 9.993475382826467e-05,
"loss": 0.0479,
"step": 1310
},
{
"grad_norm": 0.8385289907455444,
"learning_rate": 9.993046343584664e-05,
"loss": 0.0549,
"step": 1320
},
{
"grad_norm": 0.5908923745155334,
"learning_rate": 9.992603653544816e-05,
"loss": 0.0483,
"step": 1330
},
{
"grad_norm": 0.63700932264328,
"learning_rate": 9.992147313917222e-05,
"loss": 0.0485,
"step": 1340
},
{
"grad_norm": 0.7525864839553833,
"learning_rate": 9.991677325949497e-05,
"loss": 0.0469,
"step": 1350
},
{
"grad_norm": 0.5628486275672913,
"learning_rate": 9.991193690926568e-05,
"loss": 0.0459,
"step": 1360
},
{
"grad_norm": 0.795554518699646,
"learning_rate": 9.990696410170678e-05,
"loss": 0.0467,
"step": 1370
},
{
"grad_norm": 0.7957155704498291,
"learning_rate": 9.990185485041371e-05,
"loss": 0.0481,
"step": 1380
},
{
"grad_norm": 0.5773254632949829,
"learning_rate": 9.989660916935498e-05,
"loss": 0.0471,
"step": 1390
},
{
"grad_norm": 0.6150880455970764,
"learning_rate": 9.989122707287208e-05,
"loss": 0.0426,
"step": 1400
},
{
"grad_norm": 0.7106145620346069,
"learning_rate": 9.988570857567945e-05,
"loss": 0.0537,
"step": 1410
},
{
"grad_norm": 0.9491516947746277,
"learning_rate": 9.988005369286446e-05,
"loss": 0.0525,
"step": 1420
},
{
"grad_norm": 0.6860232353210449,
"learning_rate": 9.987426243988734e-05,
"loss": 0.0429,
"step": 1430
},
{
"grad_norm": 0.7841853499412537,
"learning_rate": 9.986833483258114e-05,
"loss": 0.0524,
"step": 1440
},
{
"grad_norm": 0.6175568103790283,
"learning_rate": 9.986227088715173e-05,
"loss": 0.0385,
"step": 1450
},
{
"grad_norm": 0.5932314991950989,
"learning_rate": 9.98560706201777e-05,
"loss": 0.0408,
"step": 1460
},
{
"grad_norm": 0.7410153150558472,
"learning_rate": 9.984973404861036e-05,
"loss": 0.043,
"step": 1470
},
{
"grad_norm": 0.8330276608467102,
"learning_rate": 9.984326118977361e-05,
"loss": 0.051,
"step": 1480
},
{
"grad_norm": 0.7202706933021545,
"learning_rate": 9.983665206136406e-05,
"loss": 0.0493,
"step": 1490
},
{
"grad_norm": 0.574433445930481,
"learning_rate": 9.982990668145075e-05,
"loss": 0.0466,
"step": 1500
},
{
"grad_norm": 0.7351802587509155,
"learning_rate": 9.982302506847534e-05,
"loss": 0.057,
"step": 1510
},
{
"grad_norm": 0.819564163684845,
"learning_rate": 9.981600724125189e-05,
"loss": 0.0555,
"step": 1520
},
{
"grad_norm": 0.6065496206283569,
"learning_rate": 9.980885321896685e-05,
"loss": 0.0509,
"step": 1530
},
{
"grad_norm": 0.6572223901748657,
"learning_rate": 9.980156302117905e-05,
"loss": 0.044,
"step": 1540
},
{
"grad_norm": 0.6978927254676819,
"learning_rate": 9.979413666781963e-05,
"loss": 0.0465,
"step": 1550
},
{
"grad_norm": 0.5508580803871155,
"learning_rate": 9.978657417919193e-05,
"loss": 0.0452,
"step": 1560
},
{
"grad_norm": 0.5769541263580322,
"learning_rate": 9.977887557597153e-05,
"loss": 0.0475,
"step": 1570
},
{
"grad_norm": 0.5610742568969727,
"learning_rate": 9.97710408792061e-05,
"loss": 0.0469,
"step": 1580
},
{
"grad_norm": 0.5692776441574097,
"learning_rate": 9.976307011031542e-05,
"loss": 0.0449,
"step": 1590
},
{
"grad_norm": 0.5226185321807861,
"learning_rate": 9.975496329109126e-05,
"loss": 0.0476,
"step": 1600
},
{
"grad_norm": 0.7111744284629822,
"learning_rate": 9.974672044369732e-05,
"loss": 0.047,
"step": 1610
},
{
"grad_norm": 0.514858067035675,
"learning_rate": 9.97383415906693e-05,
"loss": 0.043,
"step": 1620
},
{
"grad_norm": 0.5856963396072388,
"learning_rate": 9.97298267549146e-05,
"loss": 0.0471,
"step": 1630
},
{
"grad_norm": 0.6191436052322388,
"learning_rate": 9.972117595971249e-05,
"loss": 0.0422,
"step": 1640
},
{
"grad_norm": 0.5670982599258423,
"learning_rate": 9.971238922871391e-05,
"loss": 0.0419,
"step": 1650
},
{
"grad_norm": 0.7190003991127014,
"learning_rate": 9.970346658594142e-05,
"loss": 0.0453,
"step": 1660
},
{
"grad_norm": 0.6552428007125854,
"learning_rate": 9.969440805578923e-05,
"loss": 0.046,
"step": 1670
},
{
"grad_norm": 0.578118622303009,
"learning_rate": 9.968521366302298e-05,
"loss": 0.0392,
"step": 1680
},
{
"grad_norm": 0.7054030895233154,
"learning_rate": 9.967588343277981e-05,
"loss": 0.0455,
"step": 1690
},
{
"grad_norm": 0.6531293392181396,
"learning_rate": 9.966641739056818e-05,
"loss": 0.0421,
"step": 1700
},
{
"grad_norm": 0.6111751198768616,
"learning_rate": 9.965681556226793e-05,
"loss": 0.0517,
"step": 1710
},
{
"grad_norm": 0.4928556978702545,
"learning_rate": 9.964707797413006e-05,
"loss": 0.044,
"step": 1720
},
{
"grad_norm": 0.6597058773040771,
"learning_rate": 9.963720465277679e-05,
"loss": 0.047,
"step": 1730
},
{
"grad_norm": 0.6202155351638794,
"learning_rate": 9.96271956252014e-05,
"loss": 0.0384,
"step": 1740
},
{
"grad_norm": 0.5262959599494934,
"learning_rate": 9.961705091876816e-05,
"loss": 0.0425,
"step": 1750
},
{
"grad_norm": 0.6935763955116272,
"learning_rate": 9.960677056121235e-05,
"loss": 0.0409,
"step": 1760
},
{
"grad_norm": 0.6149827837944031,
"learning_rate": 9.959635458064005e-05,
"loss": 0.0383,
"step": 1770
},
{
"grad_norm": 0.5901826024055481,
"learning_rate": 9.958580300552815e-05,
"loss": 0.0426,
"step": 1780
},
{
"grad_norm": 0.5597098469734192,
"learning_rate": 9.957511586472426e-05,
"loss": 0.0352,
"step": 1790
},
{
"grad_norm": 0.5581690073013306,
"learning_rate": 9.956429318744662e-05,
"loss": 0.0366,
"step": 1800
},
{
"grad_norm": 0.5969916582107544,
"learning_rate": 9.955333500328404e-05,
"loss": 0.0355,
"step": 1810
},
{
"grad_norm": 0.5474916696548462,
"learning_rate": 9.95422413421957e-05,
"loss": 0.0376,
"step": 1820
},
{
"grad_norm": 0.5651562809944153,
"learning_rate": 9.953101223451133e-05,
"loss": 0.0359,
"step": 1830
},
{
"grad_norm": 0.6243921518325806,
"learning_rate": 9.951964771093085e-05,
"loss": 0.0373,
"step": 1840
},
{
"grad_norm": 0.4624647796154022,
"learning_rate": 9.950814780252442e-05,
"loss": 0.0347,
"step": 1850
},
{
"grad_norm": 0.5893751382827759,
"learning_rate": 9.949651254073236e-05,
"loss": 0.0408,
"step": 1860
},
{
"grad_norm": 0.526287317276001,
"learning_rate": 9.948474195736504e-05,
"loss": 0.0388,
"step": 1870
},
{
"grad_norm": 0.6111840605735779,
"learning_rate": 9.947283608460277e-05,
"loss": 0.0346,
"step": 1880
},
{
"grad_norm": 0.46461328864097595,
"learning_rate": 9.946079495499577e-05,
"loss": 0.0411,
"step": 1890
},
{
"grad_norm": 0.610548734664917,
"learning_rate": 9.944861860146401e-05,
"loss": 0.0407,
"step": 1900
},
{
"grad_norm": 0.5339504480361938,
"learning_rate": 9.943630705729719e-05,
"loss": 0.0398,
"step": 1910
},
{
"grad_norm": 0.46559029817581177,
"learning_rate": 9.942386035615459e-05,
"loss": 0.039,
"step": 1920
},
{
"grad_norm": 0.7745798826217651,
"learning_rate": 9.941127853206503e-05,
"loss": 0.04,
"step": 1930
},
{
"grad_norm": 0.5811882019042969,
"learning_rate": 9.939856161942673e-05,
"loss": 0.0425,
"step": 1940
},
{
"grad_norm": 0.4856541156768799,
"learning_rate": 9.938570965300724e-05,
"loss": 0.0363,
"step": 1950
},
{
"grad_norm": 0.5952467918395996,
"learning_rate": 9.937272266794335e-05,
"loss": 0.0439,
"step": 1960
},
{
"grad_norm": 0.5669976472854614,
"learning_rate": 9.935960069974096e-05,
"loss": 0.05,
"step": 1970
},
{
"grad_norm": 0.5959198474884033,
"learning_rate": 9.934634378427506e-05,
"loss": 0.0382,
"step": 1980
},
{
"grad_norm": 0.520875096321106,
"learning_rate": 9.933295195778954e-05,
"loss": 0.0386,
"step": 1990
},
{
"grad_norm": 0.4351758360862732,
"learning_rate": 9.931942525689715e-05,
"loss": 0.0488,
"step": 2000
},
{
"grad_norm": 0.6345981359481812,
"learning_rate": 9.930576371857936e-05,
"loss": 0.0391,
"step": 2010
},
{
"grad_norm": 0.6230748295783997,
"learning_rate": 9.929196738018629e-05,
"loss": 0.0388,
"step": 2020
},
{
"grad_norm": 0.5425089001655579,
"learning_rate": 9.927803627943662e-05,
"loss": 0.0395,
"step": 2030
},
{
"grad_norm": 0.49332770705223083,
"learning_rate": 9.926397045441744e-05,
"loss": 0.039,
"step": 2040
},
{
"grad_norm": 0.6731558442115784,
"learning_rate": 9.924976994358417e-05,
"loss": 0.0427,
"step": 2050
},
{
"grad_norm": 0.5310463309288025,
"learning_rate": 9.923543478576048e-05,
"loss": 0.0474,
"step": 2060
},
{
"grad_norm": 0.548930823802948,
"learning_rate": 9.922096502013813e-05,
"loss": 0.0423,
"step": 2070
},
{
"grad_norm": 0.5744786262512207,
"learning_rate": 9.92063606862769e-05,
"loss": 0.0372,
"step": 2080
},
{
"grad_norm": 0.6390929222106934,
"learning_rate": 9.919162182410453e-05,
"loss": 0.0368,
"step": 2090
},
{
"grad_norm": 0.5252511501312256,
"learning_rate": 9.917674847391645e-05,
"loss": 0.038,
"step": 2100
},
{
"grad_norm": 0.5656434297561646,
"learning_rate": 9.916174067637584e-05,
"loss": 0.0333,
"step": 2110
},
{
"grad_norm": 0.5288258790969849,
"learning_rate": 9.914659847251348e-05,
"loss": 0.0406,
"step": 2120
},
{
"grad_norm": 0.5040147304534912,
"learning_rate": 9.913132190372753e-05,
"loss": 0.0369,
"step": 2130
},
{
"grad_norm": 0.5128138661384583,
"learning_rate": 9.911591101178359e-05,
"loss": 0.0368,
"step": 2140
},
{
"grad_norm": 0.4942684769630432,
"learning_rate": 9.910036583881443e-05,
"loss": 0.0334,
"step": 2150
},
{
"grad_norm": 0.5318565368652344,
"learning_rate": 9.908468642731995e-05,
"loss": 0.0325,
"step": 2160
},
{
"grad_norm": 0.5772367715835571,
"learning_rate": 9.906887282016707e-05,
"loss": 0.0344,
"step": 2170
},
{
"grad_norm": 0.5957911014556885,
"learning_rate": 9.90529250605896e-05,
"loss": 0.0368,
"step": 2180
},
{
"grad_norm": 0.6259480714797974,
"learning_rate": 9.903684319218809e-05,
"loss": 0.0375,
"step": 2190
},
{
"grad_norm": 0.691277801990509,
"learning_rate": 9.902062725892976e-05,
"loss": 0.0402,
"step": 2200
},
{
"grad_norm": 0.624859094619751,
"learning_rate": 9.900427730514834e-05,
"loss": 0.0316,
"step": 2210
},
{
"grad_norm": 0.46915674209594727,
"learning_rate": 9.8987793375544e-05,
"loss": 0.0352,
"step": 2220
},
{
"grad_norm": 0.5559591054916382,
"learning_rate": 9.897117551518318e-05,
"loss": 0.0353,
"step": 2230
},
{
"grad_norm": 0.47577548027038574,
"learning_rate": 9.895442376949844e-05,
"loss": 0.0395,
"step": 2240
},
{
"grad_norm": 0.7231595516204834,
"learning_rate": 9.893753818428845e-05,
"loss": 0.0442,
"step": 2250
},
{
"grad_norm": 0.4607575535774231,
"learning_rate": 9.892051880571773e-05,
"loss": 0.037,
"step": 2260
},
{
"grad_norm": 0.4901242256164551,
"learning_rate": 9.890336568031663e-05,
"loss": 0.0342,
"step": 2270
},
{
"grad_norm": 0.46413323283195496,
"learning_rate": 9.888607885498113e-05,
"loss": 0.0386,
"step": 2280
},
{
"grad_norm": 0.5028432607650757,
"learning_rate": 9.886865837697275e-05,
"loss": 0.0384,
"step": 2290
},
{
"grad_norm": 0.6079827547073364,
"learning_rate": 9.88511042939184e-05,
"loss": 0.0416,
"step": 2300
},
{
"grad_norm": 0.6189248561859131,
"learning_rate": 9.883341665381028e-05,
"loss": 0.0372,
"step": 2310
},
{
"grad_norm": 0.569456160068512,
"learning_rate": 9.881559550500575e-05,
"loss": 0.0317,
"step": 2320
},
{
"grad_norm": 0.5782006978988647,
"learning_rate": 9.879764089622712e-05,
"loss": 0.0363,
"step": 2330
},
{
"grad_norm": 0.6612024307250977,
"learning_rate": 9.87795528765616e-05,
"loss": 0.0386,
"step": 2340
},
{
"grad_norm": 0.45619797706604004,
"learning_rate": 9.876133149546118e-05,
"loss": 0.0385,
"step": 2350
},
{
"grad_norm": 0.4743977189064026,
"learning_rate": 9.874297680274238e-05,
"loss": 0.0384,
"step": 2360
},
{
"grad_norm": 0.5303918719291687,
"learning_rate": 9.872448884858624e-05,
"loss": 0.0364,
"step": 2370
},
{
"grad_norm": 0.5923212766647339,
"learning_rate": 9.870586768353815e-05,
"loss": 0.0366,
"step": 2380
},
{
"grad_norm": 0.5156052112579346,
"learning_rate": 9.868711335850764e-05,
"loss": 0.0412,
"step": 2390
},
{
"grad_norm": 0.4702778458595276,
"learning_rate": 9.866822592476833e-05,
"loss": 0.0353,
"step": 2400
},
{
"grad_norm": 0.4955006241798401,
"learning_rate": 9.86492054339577e-05,
"loss": 0.0356,
"step": 2410
},
{
"grad_norm": 0.4722374677658081,
"learning_rate": 9.863005193807711e-05,
"loss": 0.0328,
"step": 2420
},
{
"grad_norm": 0.5261074900627136,
"learning_rate": 9.861076548949143e-05,
"loss": 0.0314,
"step": 2430
},
{
"grad_norm": 0.43109720945358276,
"learning_rate": 9.859134614092912e-05,
"loss": 0.0306,
"step": 2440
},
{
"grad_norm": 0.5150691270828247,
"learning_rate": 9.857179394548191e-05,
"loss": 0.0331,
"step": 2450
},
{
"grad_norm": 0.413881778717041,
"learning_rate": 9.855210895660477e-05,
"loss": 0.0313,
"step": 2460
},
{
"grad_norm": 0.5778813362121582,
"learning_rate": 9.853229122811568e-05,
"loss": 0.0327,
"step": 2470
},
{
"grad_norm": 0.5499809980392456,
"learning_rate": 9.851234081419559e-05,
"loss": 0.0371,
"step": 2480
},
{
"grad_norm": 0.533755898475647,
"learning_rate": 9.849225776938814e-05,
"loss": 0.0347,
"step": 2490
},
{
"grad_norm": 0.5036794543266296,
"learning_rate": 9.847204214859964e-05,
"loss": 0.0365,
"step": 2500
},
{
"grad_norm": 0.4547636806964874,
"learning_rate": 9.845169400709879e-05,
"loss": 0.0284,
"step": 2510
},
{
"grad_norm": 0.4148177206516266,
"learning_rate": 9.843121340051664e-05,
"loss": 0.0338,
"step": 2520
},
{
"grad_norm": 0.4307814836502075,
"learning_rate": 9.841060038484641e-05,
"loss": 0.0401,
"step": 2530
},
{
"grad_norm": 0.5055217146873474,
"learning_rate": 9.838985501644328e-05,
"loss": 0.0413,
"step": 2540
},
{
"grad_norm": 0.5252987742424011,
"learning_rate": 9.83689773520243e-05,
"loss": 0.0334,
"step": 2550
},
{
"grad_norm": 0.5325053334236145,
"learning_rate": 9.834796744866819e-05,
"loss": 0.0339,
"step": 2560
},
{
"grad_norm": 0.5485632419586182,
"learning_rate": 9.832682536381525e-05,
"loss": 0.0354,
"step": 2570
},
{
"grad_norm": 0.5406777262687683,
"learning_rate": 9.830555115526711e-05,
"loss": 0.0368,
"step": 2580
},
{
"grad_norm": 0.37698280811309814,
"learning_rate": 9.828414488118667e-05,
"loss": 0.0336,
"step": 2590
},
{
"grad_norm": 0.5253736972808838,
"learning_rate": 9.826260660009785e-05,
"loss": 0.0337,
"step": 2600
},
{
"grad_norm": 0.482319176197052,
"learning_rate": 9.824093637088547e-05,
"loss": 0.0299,
"step": 2610
},
{
"grad_norm": 0.43845584988594055,
"learning_rate": 9.821913425279514e-05,
"loss": 0.032,
"step": 2620
},
{
"grad_norm": 0.4526597559452057,
"learning_rate": 9.8197200305433e-05,
"loss": 0.034,
"step": 2630
},
{
"grad_norm": 0.45589521527290344,
"learning_rate": 9.817513458876564e-05,
"loss": 0.0464,
"step": 2640
},
{
"grad_norm": 0.5381149649620056,
"learning_rate": 9.815293716311987e-05,
"loss": 0.0334,
"step": 2650
},
{
"grad_norm": 0.5279123187065125,
"learning_rate": 9.813060808918262e-05,
"loss": 0.0318,
"step": 2660
},
{
"grad_norm": 0.3532435894012451,
"learning_rate": 9.810814742800069e-05,
"loss": 0.0285,
"step": 2670
},
{
"grad_norm": 0.3765302896499634,
"learning_rate": 9.808555524098074e-05,
"loss": 0.0289,
"step": 2680
},
{
"grad_norm": 0.46037837862968445,
"learning_rate": 9.806283158988887e-05,
"loss": 0.0291,
"step": 2690
},
{
"grad_norm": 0.483735591173172,
"learning_rate": 9.803997653685072e-05,
"loss": 0.0392,
"step": 2700
},
{
"grad_norm": 0.45865148305892944,
"learning_rate": 9.801699014435112e-05,
"loss": 0.0393,
"step": 2710
},
{
"grad_norm": 0.4620376229286194,
"learning_rate": 9.799387247523398e-05,
"loss": 0.0352,
"step": 2720
},
{
"grad_norm": 0.41832435131073,
"learning_rate": 9.797062359270215e-05,
"loss": 0.0319,
"step": 2730
},
{
"grad_norm": 0.4439375400543213,
"learning_rate": 9.794724356031715e-05,
"loss": 0.0307,
"step": 2740
},
{
"grad_norm": 0.5037664771080017,
"learning_rate": 9.792373244199913e-05,
"loss": 0.0306,
"step": 2750
},
{
"grad_norm": 0.378164678812027,
"learning_rate": 9.790009030202658e-05,
"loss": 0.0313,
"step": 2760
},
{
"grad_norm": 0.5053073763847351,
"learning_rate": 9.78763172050362e-05,
"loss": 0.0295,
"step": 2770
},
{
"grad_norm": 0.4680381119251251,
"learning_rate": 9.785241321602274e-05,
"loss": 0.0277,
"step": 2780
},
{
"grad_norm": 0.4624013304710388,
"learning_rate": 9.782837840033879e-05,
"loss": 0.0288,
"step": 2790
},
{
"grad_norm": 0.5074241757392883,
"learning_rate": 9.780421282369461e-05,
"loss": 0.0292,
"step": 2800
},
{
"grad_norm": 0.4835506081581116,
"learning_rate": 9.777991655215797e-05,
"loss": 0.0294,
"step": 2810
},
{
"grad_norm": 0.5738292336463928,
"learning_rate": 9.775548965215394e-05,
"loss": 0.0295,
"step": 2820
},
{
"grad_norm": 0.5334445238113403,
"learning_rate": 9.773093219046474e-05,
"loss": 0.0293,
"step": 2830
},
{
"grad_norm": 0.4011390507221222,
"learning_rate": 9.770624423422954e-05,
"loss": 0.0291,
"step": 2840
},
{
"grad_norm": 0.41171419620513916,
"learning_rate": 9.768142585094426e-05,
"loss": 0.0302,
"step": 2850
},
{
"grad_norm": 0.46391263604164124,
"learning_rate": 9.765647710846142e-05,
"loss": 0.0405,
"step": 2860
},
{
"grad_norm": 0.5071845650672913,
"learning_rate": 9.763139807498991e-05,
"loss": 0.0285,
"step": 2870
},
{
"grad_norm": 0.4814237058162689,
"learning_rate": 9.760618881909487e-05,
"loss": 0.0317,
"step": 2880
},
{
"grad_norm": 0.5396919846534729,
"learning_rate": 9.758084940969744e-05,
"loss": 0.0316,
"step": 2890
},
{
"grad_norm": 0.5363779664039612,
"learning_rate": 9.755537991607459e-05,
"loss": 0.027,
"step": 2900
},
{
"grad_norm": 0.505138099193573,
"learning_rate": 9.752978040785895e-05,
"loss": 0.0354,
"step": 2910
},
{
"grad_norm": 0.5476271510124207,
"learning_rate": 9.750405095503859e-05,
"loss": 0.0299,
"step": 2920
},
{
"grad_norm": 0.5189036130905151,
"learning_rate": 9.747819162795686e-05,
"loss": 0.0331,
"step": 2930
},
{
"grad_norm": 0.45717042684555054,
"learning_rate": 9.745220249731217e-05,
"loss": 0.026,
"step": 2940
},
{
"grad_norm": 0.4337165355682373,
"learning_rate": 9.742608363415781e-05,
"loss": 0.0272,
"step": 2950
},
{
"grad_norm": 0.4811023771762848,
"learning_rate": 9.739983510990176e-05,
"loss": 0.0288,
"step": 2960
},
{
"grad_norm": 0.3455168902873993,
"learning_rate": 9.737345699630647e-05,
"loss": 0.0298,
"step": 2970
},
{
"grad_norm": 0.5057815313339233,
"learning_rate": 9.734694936548869e-05,
"loss": 0.0332,
"step": 2980
},
{
"grad_norm": 0.38619765639305115,
"learning_rate": 9.732031228991932e-05,
"loss": 0.0256,
"step": 2990
},
{
"grad_norm": 0.3297816514968872,
"learning_rate": 9.729354584242302e-05,
"loss": 0.0355,
"step": 3000
},
{
"grad_norm": 0.5174765586853027,
"learning_rate": 9.726665009617832e-05,
"loss": 0.0309,
"step": 3010
},
{
"grad_norm": 0.43245866894721985,
"learning_rate": 9.723962512471714e-05,
"loss": 0.033,
"step": 3020
},
{
"grad_norm": 0.516598105430603,
"learning_rate": 9.72124710019247e-05,
"loss": 0.03,
"step": 3030
},
{
"grad_norm": 0.48712822794914246,
"learning_rate": 9.718518780203934e-05,
"loss": 0.0322,
"step": 3040
},
{
"grad_norm": 0.3674415946006775,
"learning_rate": 9.715777559965228e-05,
"loss": 0.0319,
"step": 3050
},
{
"grad_norm": 0.4218079149723053,
"learning_rate": 9.713023446970746e-05,
"loss": 0.0255,
"step": 3060
},
{
"grad_norm": 0.4967867136001587,
"learning_rate": 9.710256448750126e-05,
"loss": 0.0311,
"step": 3070
},
{
"grad_norm": 0.497653067111969,
"learning_rate": 9.707476572868235e-05,
"loss": 0.0341,
"step": 3080
},
{
"grad_norm": 0.4222137928009033,
"learning_rate": 9.704683826925149e-05,
"loss": 0.0273,
"step": 3090
},
{
"grad_norm": 0.37705838680267334,
"learning_rate": 9.701878218556129e-05,
"loss": 0.036,
"step": 3100
},
{
"grad_norm": 0.5626199841499329,
"learning_rate": 9.699059755431598e-05,
"loss": 0.0331,
"step": 3110
},
{
"grad_norm": 0.46293774247169495,
"learning_rate": 9.696228445257132e-05,
"loss": 0.0277,
"step": 3120
},
{
"grad_norm": 0.42764750123023987,
"learning_rate": 9.693384295773419e-05,
"loss": 0.0327,
"step": 3130
},
{
"grad_norm": 0.4717363715171814,
"learning_rate": 9.690527314756259e-05,
"loss": 0.0339,
"step": 3140
},
{
"grad_norm": 0.458967387676239,
"learning_rate": 9.687657510016527e-05,
"loss": 0.0261,
"step": 3150
},
{
"grad_norm": 0.45871081948280334,
"learning_rate": 9.684774889400161e-05,
"loss": 0.0309,
"step": 3160
},
{
"grad_norm": 0.5132860541343689,
"learning_rate": 9.681879460788135e-05,
"loss": 0.0264,
"step": 3170
},
{
"grad_norm": 0.4729975461959839,
"learning_rate": 9.67897123209644e-05,
"loss": 0.0315,
"step": 3180
},
{
"grad_norm": 0.4921012818813324,
"learning_rate": 9.676050211276062e-05,
"loss": 0.035,
"step": 3190
},
{
"grad_norm": 0.4574073255062103,
"learning_rate": 9.673116406312962e-05,
"loss": 0.0284,
"step": 3200
},
{
"grad_norm": 0.48541590571403503,
"learning_rate": 9.67016982522805e-05,
"loss": 0.028,
"step": 3210
},
{
"grad_norm": 0.4924331307411194,
"learning_rate": 9.667210476077164e-05,
"loss": 0.028,
"step": 3220
},
{
"grad_norm": 0.5730510950088501,
"learning_rate": 9.664238366951055e-05,
"loss": 0.0288,
"step": 3230
},
{
"grad_norm": 0.5551027059555054,
"learning_rate": 9.661253505975355e-05,
"loss": 0.0269,
"step": 3240
},
{
"grad_norm": 0.4366356134414673,
"learning_rate": 9.658255901310557e-05,
"loss": 0.0301,
"step": 3250
},
{
"grad_norm": 0.5327138304710388,
"learning_rate": 9.655245561152e-05,
"loss": 0.0278,
"step": 3260
},
{
"grad_norm": 0.4516207277774811,
"learning_rate": 9.65222249372984e-05,
"loss": 0.0266,
"step": 3270
},
{
"grad_norm": 0.4709407687187195,
"learning_rate": 9.649186707309026e-05,
"loss": 0.0325,
"step": 3280
},
{
"grad_norm": 0.36673372983932495,
"learning_rate": 9.646138210189283e-05,
"loss": 0.0285,
"step": 3290
},
{
"grad_norm": 0.5308244824409485,
"learning_rate": 9.643077010705087e-05,
"loss": 0.0281,
"step": 3300
},
{
"grad_norm": 0.45568153262138367,
"learning_rate": 9.640003117225637e-05,
"loss": 0.0286,
"step": 3310
},
{
"grad_norm": 0.4082559049129486,
"learning_rate": 9.636916538154846e-05,
"loss": 0.0241,
"step": 3320
},
{
"grad_norm": 0.48012563586235046,
"learning_rate": 9.633817281931296e-05,
"loss": 0.0297,
"step": 3330
},
{
"grad_norm": 0.4177444875240326,
"learning_rate": 9.630705357028242e-05,
"loss": 0.032,
"step": 3340
},
{
"grad_norm": 0.48793429136276245,
"learning_rate": 9.627580771953563e-05,
"loss": 0.0285,
"step": 3350
},
{
"grad_norm": 0.4371464252471924,
"learning_rate": 9.624443535249759e-05,
"loss": 0.0275,
"step": 3360
},
{
"grad_norm": 0.4983312487602234,
"learning_rate": 9.621293655493913e-05,
"loss": 0.0254,
"step": 3370
},
{
"grad_norm": 0.5624396204948425,
"learning_rate": 9.618131141297675e-05,
"loss": 0.027,
"step": 3380
},
{
"grad_norm": 0.43570947647094727,
"learning_rate": 9.614956001307242e-05,
"loss": 0.0301,
"step": 3390
},
{
"grad_norm": 0.4448493719100952,
"learning_rate": 9.611768244203321e-05,
"loss": 0.0351,
"step": 3400
},
{
"grad_norm": 0.4213621914386749,
"learning_rate": 9.60856787870112e-05,
"loss": 0.0292,
"step": 3410
},
{
"grad_norm": 0.4154338836669922,
"learning_rate": 9.605354913550318e-05,
"loss": 0.0262,
"step": 3420
},
{
"grad_norm": 0.45102718472480774,
"learning_rate": 9.602129357535037e-05,
"loss": 0.0313,
"step": 3430
},
{
"grad_norm": 0.38145503401756287,
"learning_rate": 9.598891219473825e-05,
"loss": 0.027,
"step": 3440
},
{
"grad_norm": 0.41790488362312317,
"learning_rate": 9.595640508219625e-05,
"loss": 0.0291,
"step": 3450
},
{
"grad_norm": 0.4644753336906433,
"learning_rate": 9.592377232659761e-05,
"loss": 0.0249,
"step": 3460
},
{
"grad_norm": 0.4731713533401489,
"learning_rate": 9.589101401715904e-05,
"loss": 0.0263,
"step": 3470
},
{
"grad_norm": 0.42398542165756226,
"learning_rate": 9.585813024344045e-05,
"loss": 0.026,
"step": 3480
},
{
"grad_norm": 0.5419644117355347,
"learning_rate": 9.58251210953449e-05,
"loss": 0.0296,
"step": 3490
},
{
"grad_norm": 0.463670939207077,
"learning_rate": 9.579198666311809e-05,
"loss": 0.0238,
"step": 3500
},
{
"grad_norm": 0.39643239974975586,
"learning_rate": 9.575872703734832e-05,
"loss": 0.0292,
"step": 3510
},
{
"grad_norm": 0.3542700409889221,
"learning_rate": 9.572534230896611e-05,
"loss": 0.0231,
"step": 3520
},
{
"grad_norm": 0.43060752749443054,
"learning_rate": 9.569183256924403e-05,
"loss": 0.025,
"step": 3530
},
{
"grad_norm": 0.40233463048934937,
"learning_rate": 9.565819790979646e-05,
"loss": 0.0422,
"step": 3540
},
{
"grad_norm": 0.4497774839401245,
"learning_rate": 9.562443842257925e-05,
"loss": 0.029,
"step": 3550
},
{
"grad_norm": 0.5018470287322998,
"learning_rate": 9.559055419988956e-05,
"loss": 0.0283,
"step": 3560
},
{
"grad_norm": 0.47868454456329346,
"learning_rate": 9.555654533436557e-05,
"loss": 0.0349,
"step": 3570
},
{
"grad_norm": 0.4413691759109497,
"learning_rate": 9.552241191898621e-05,
"loss": 0.0238,
"step": 3580
},
{
"grad_norm": 0.40998080372810364,
"learning_rate": 9.548815404707092e-05,
"loss": 0.03,
"step": 3590
},
{
"grad_norm": 0.43824273347854614,
"learning_rate": 9.545377181227942e-05,
"loss": 0.0284,
"step": 3600
},
{
"grad_norm": 0.4570449888706207,
"learning_rate": 9.541926530861145e-05,
"loss": 0.0266,
"step": 3610
},
{
"grad_norm": 0.44766074419021606,
"learning_rate": 9.538463463040645e-05,
"loss": 0.0278,
"step": 3620
},
{
"grad_norm": 0.481611967086792,
"learning_rate": 9.534987987234337e-05,
"loss": 0.0277,
"step": 3630
},
{
"grad_norm": 0.4858357608318329,
"learning_rate": 9.53150011294404e-05,
"loss": 0.0265,
"step": 3640
},
{
"grad_norm": 0.40574368834495544,
"learning_rate": 9.527999849705471e-05,
"loss": 0.0297,
"step": 3650
},
{
"grad_norm": 0.4581122100353241,
"learning_rate": 9.524487207088213e-05,
"loss": 0.0224,
"step": 3660
},
{
"grad_norm": 0.4100882411003113,
"learning_rate": 9.520962194695698e-05,
"loss": 0.0239,
"step": 3670
},
{
"grad_norm": 0.40333643555641174,
"learning_rate": 9.517424822165175e-05,
"loss": 0.0238,
"step": 3680
},
{
"grad_norm": 0.5596145987510681,
"learning_rate": 9.513875099167685e-05,
"loss": 0.0245,
"step": 3690
},
{
"grad_norm": 0.5230712890625,
"learning_rate": 9.510313035408035e-05,
"loss": 0.0262,
"step": 3700
},
{
"grad_norm": 0.39155617356300354,
"learning_rate": 9.506738640624775e-05,
"loss": 0.0264,
"step": 3710
},
{
"grad_norm": 0.4129464328289032,
"learning_rate": 9.50315192459016e-05,
"loss": 0.0208,
"step": 3720
},
{
"grad_norm": 0.5159543752670288,
"learning_rate": 9.499552897110136e-05,
"loss": 0.0239,
"step": 3730
},
{
"grad_norm": 0.5178094506263733,
"learning_rate": 9.495941568024304e-05,
"loss": 0.0253,
"step": 3740
},
{
"grad_norm": 0.43580612540245056,
"learning_rate": 9.492317947205904e-05,
"loss": 0.0268,
"step": 3750
},
{
"grad_norm": 0.4596274495124817,
"learning_rate": 9.488682044561775e-05,
"loss": 0.0256,
"step": 3760
},
{
"grad_norm": 0.41573286056518555,
"learning_rate": 9.485033870032335e-05,
"loss": 0.0243,
"step": 3770
},
{
"grad_norm": 0.47876912355422974,
"learning_rate": 9.481373433591556e-05,
"loss": 0.0215,
"step": 3780
},
{
"grad_norm": 0.4741547703742981,
"learning_rate": 9.47770074524693e-05,
"loss": 0.027,
"step": 3790
},
{
"grad_norm": 0.4306631088256836,
"learning_rate": 9.474015815039446e-05,
"loss": 0.0277,
"step": 3800
},
{
"grad_norm": 0.46127429604530334,
"learning_rate": 9.470318653043565e-05,
"loss": 0.0273,
"step": 3810
},
{
"grad_norm": 0.5021414160728455,
"learning_rate": 9.466609269367185e-05,
"loss": 0.0263,
"step": 3820
},
{
"grad_norm": 0.5333779454231262,
"learning_rate": 9.46288767415162e-05,
"loss": 0.0234,
"step": 3830
},
{
"grad_norm": 0.4366990625858307,
"learning_rate": 9.459153877571567e-05,
"loss": 0.0225,
"step": 3840
},
{
"grad_norm": 0.4819251298904419,
"learning_rate": 9.455407889835087e-05,
"loss": 0.0238,
"step": 3850
},
{
"grad_norm": 0.3999616503715515,
"learning_rate": 9.451649721183564e-05,
"loss": 0.0234,
"step": 3860
},
{
"grad_norm": 0.37807697057724,
"learning_rate": 9.447879381891692e-05,
"loss": 0.0258,
"step": 3870
},
{
"grad_norm": 0.5266739130020142,
"learning_rate": 9.444096882267428e-05,
"loss": 0.0329,
"step": 3880
},
{
"grad_norm": 0.3961910903453827,
"learning_rate": 9.440302232651988e-05,
"loss": 0.0226,
"step": 3890
},
{
"grad_norm": 0.3786242604255676,
"learning_rate": 9.436495443419795e-05,
"loss": 0.024,
"step": 3900
},
{
"grad_norm": 0.4175941050052643,
"learning_rate": 9.432676524978466e-05,
"loss": 0.0219,
"step": 3910
},
{
"grad_norm": 0.44096827507019043,
"learning_rate": 9.42884548776878e-05,
"loss": 0.0253,
"step": 3920
},
{
"grad_norm": 0.41201087832450867,
"learning_rate": 9.425002342264646e-05,
"loss": 0.0223,
"step": 3930
},
{
"grad_norm": 0.5009353160858154,
"learning_rate": 9.421147098973077e-05,
"loss": 0.0266,
"step": 3940
},
{
"grad_norm": 0.5505723357200623,
"learning_rate": 9.41727976843416e-05,
"loss": 0.0258,
"step": 3950
},
{
"grad_norm": 0.45981982350349426,
"learning_rate": 9.413400361221029e-05,
"loss": 0.0279,
"step": 3960
},
{
"grad_norm": 0.4804719388484955,
"learning_rate": 9.409508887939835e-05,
"loss": 0.022,
"step": 3970
},
{
"grad_norm": 0.4238436222076416,
"learning_rate": 9.40560535922972e-05,
"loss": 0.0212,
"step": 3980
},
{
"grad_norm": 0.403974324464798,
"learning_rate": 9.40168978576278e-05,
"loss": 0.0189,
"step": 3990
},
{
"grad_norm": 0.48837044835090637,
"learning_rate": 9.397762178244043e-05,
"loss": 0.0244,
"step": 4000
},
{
"grad_norm": 0.48128196597099304,
"learning_rate": 9.393822547411439e-05,
"loss": 0.0217,
"step": 4010
},
{
"grad_norm": 0.3272818624973297,
"learning_rate": 9.389870904035769e-05,
"loss": 0.0242,
"step": 4020
},
{
"grad_norm": 0.36953118443489075,
"learning_rate": 9.385907258920672e-05,
"loss": 0.0246,
"step": 4030
},
{
"grad_norm": 0.41161492466926575,
"learning_rate": 9.381931622902607e-05,
"loss": 0.021,
"step": 4040
},
{
"grad_norm": 0.4544064998626709,
"learning_rate": 9.377944006850807e-05,
"loss": 0.0193,
"step": 4050
},
{
"grad_norm": 0.47396498918533325,
"learning_rate": 9.373944421667265e-05,
"loss": 0.0213,
"step": 4060
},
{
"grad_norm": 0.4621795117855072,
"learning_rate": 9.369932878286691e-05,
"loss": 0.0266,
"step": 4070
},
{
"grad_norm": 0.5184421539306641,
"learning_rate": 9.365909387676494e-05,
"loss": 0.0196,
"step": 4080
},
{
"grad_norm": 0.4004800319671631,
"learning_rate": 9.361873960836744e-05,
"loss": 0.0263,
"step": 4090
},
{
"grad_norm": 0.3737598657608032,
"learning_rate": 9.357826608800142e-05,
"loss": 0.0196,
"step": 4100
},
{
"grad_norm": 0.4000731110572815,
"learning_rate": 9.353767342631994e-05,
"loss": 0.0203,
"step": 4110
},
{
"grad_norm": 0.3826330006122589,
"learning_rate": 9.34969617343018e-05,
"loss": 0.0219,
"step": 4120
},
{
"grad_norm": 0.5988262891769409,
"learning_rate": 9.345613112325122e-05,
"loss": 0.0204,
"step": 4130
},
{
"grad_norm": 0.4280189275741577,
"learning_rate": 9.34151817047975e-05,
"loss": 0.0224,
"step": 4140
},
{
"grad_norm": 0.3716961145401001,
"learning_rate": 9.33741135908948e-05,
"loss": 0.0262,
"step": 4150
},
{
"grad_norm": 0.4295980930328369,
"learning_rate": 9.33329268938218e-05,
"loss": 0.0207,
"step": 4160
},
{
"grad_norm": 0.425942063331604,
"learning_rate": 9.329162172618132e-05,
"loss": 0.0238,
"step": 4170
},
{
"grad_norm": 0.416522741317749,
"learning_rate": 9.325019820090013e-05,
"loss": 0.0226,
"step": 4180
},
{
"grad_norm": 0.5610533952713013,
"learning_rate": 9.320865643122855e-05,
"loss": 0.0208,
"step": 4190
},
{
"grad_norm": 0.379802942276001,
"learning_rate": 9.316699653074023e-05,
"loss": 0.022,
"step": 4200
},
{
"grad_norm": 0.4576219618320465,
"learning_rate": 9.312521861333172e-05,
"loss": 0.0166,
"step": 4210
},
{
"grad_norm": 0.45310190320014954,
"learning_rate": 9.308332279322224e-05,
"loss": 0.0242,
"step": 4220
},
{
"grad_norm": 0.4080248177051544,
"learning_rate": 9.304130918495338e-05,
"loss": 0.0224,
"step": 4230
},
{
"grad_norm": 0.33399489521980286,
"learning_rate": 9.299917790338874e-05,
"loss": 0.0187,
"step": 4240
},
{
"grad_norm": 0.356057733297348,
"learning_rate": 9.295692906371363e-05,
"loss": 0.0173,
"step": 4250
},
{
"grad_norm": 0.42619287967681885,
"learning_rate": 9.291456278143476e-05,
"loss": 0.0264,
"step": 4260
},
{
"grad_norm": 0.3479536175727844,
"learning_rate": 9.287207917237994e-05,
"loss": 0.0213,
"step": 4270
},
{
"grad_norm": 0.3362795114517212,
"learning_rate": 9.282947835269773e-05,
"loss": 0.0206,
"step": 4280
},
{
"grad_norm": 0.43236204981803894,
"learning_rate": 9.278676043885715e-05,
"loss": 0.0191,
"step": 4290
},
{
"grad_norm": 0.32585880160331726,
"learning_rate": 9.274392554764733e-05,
"loss": 0.0194,
"step": 4300
},
{
"grad_norm": 0.4723697900772095,
"learning_rate": 9.270097379617723e-05,
"loss": 0.016,
"step": 4310
},
{
"grad_norm": 0.42713454365730286,
"learning_rate": 9.26579053018753e-05,
"loss": 0.0154,
"step": 4320
},
{
"grad_norm": 0.33830246329307556,
"learning_rate": 9.261472018248918e-05,
"loss": 0.0146,
"step": 4330
},
{
"grad_norm": 0.4066753387451172,
"learning_rate": 9.25714185560853e-05,
"loss": 0.0259,
"step": 4340
},
{
"grad_norm": 0.448772668838501,
"learning_rate": 9.252800054104868e-05,
"loss": 0.0187,
"step": 4350
},
{
"grad_norm": 0.4219300448894501,
"learning_rate": 9.248446625608252e-05,
"loss": 0.0208,
"step": 4360
},
{
"grad_norm": 0.39920371770858765,
"learning_rate": 9.244081582020789e-05,
"loss": 0.0175,
"step": 4370
},
{
"grad_norm": 0.42131638526916504,
"learning_rate": 9.239704935276339e-05,
"loss": 0.0182,
"step": 4380
},
{
"grad_norm": 0.45648935437202454,
"learning_rate": 9.235316697340489e-05,
"loss": 0.0158,
"step": 4390
},
{
"grad_norm": 0.42188429832458496,
"learning_rate": 9.230916880210512e-05,
"loss": 0.0183,
"step": 4400
},
{
"grad_norm": 0.36581969261169434,
"learning_rate": 9.226505495915342e-05,
"loss": 0.0147,
"step": 4410
},
{
"grad_norm": 0.42502549290657043,
"learning_rate": 9.222082556515536e-05,
"loss": 0.0198,
"step": 4420
},
{
"grad_norm": 0.35229989886283875,
"learning_rate": 9.217648074103242e-05,
"loss": 0.0153,
"step": 4430
},
{
"grad_norm": 0.4085313379764557,
"learning_rate": 9.213202060802161e-05,
"loss": 0.0192,
"step": 4440
},
{
"grad_norm": 0.4650028645992279,
"learning_rate": 9.208744528767528e-05,
"loss": 0.0173,
"step": 4450
},
{
"grad_norm": 0.4048616886138916,
"learning_rate": 9.204275490186064e-05,
"loss": 0.0204,
"step": 4460
},
{
"grad_norm": 0.4178619980812073,
"learning_rate": 9.199794957275949e-05,
"loss": 0.0204,
"step": 4470
},
{
"grad_norm": 0.46256691217422485,
"learning_rate": 9.19530294228679e-05,
"loss": 0.0177,
"step": 4480
},
{
"grad_norm": 0.35352519154548645,
"learning_rate": 9.190799457499583e-05,
"loss": 0.028,
"step": 4490
},
{
"grad_norm": 0.4470050632953644,
"learning_rate": 9.186284515226686e-05,
"loss": 0.0194,
"step": 4500
},
{
"grad_norm": 0.3508913815021515,
"learning_rate": 9.181758127811777e-05,
"loss": 0.0241,
"step": 4510
},
{
"grad_norm": 0.411702424287796,
"learning_rate": 9.177220307629825e-05,
"loss": 0.0204,
"step": 4520
},
{
"grad_norm": 0.4468960762023926,
"learning_rate": 9.172671067087059e-05,
"loss": 0.0194,
"step": 4530
},
{
"grad_norm": 0.4807928204536438,
"learning_rate": 9.16811041862093e-05,
"loss": 0.0256,
"step": 4540
},
{
"grad_norm": 0.39205247163772583,
"learning_rate": 9.163538374700076e-05,
"loss": 0.0185,
"step": 4550
},
{
"grad_norm": 0.44329723715782166,
"learning_rate": 9.158954947824287e-05,
"loss": 0.0178,
"step": 4560
},
{
"grad_norm": 0.47283023595809937,
"learning_rate": 9.154360150524482e-05,
"loss": 0.0174,
"step": 4570
},
{
"grad_norm": 0.38849857449531555,
"learning_rate": 9.14975399536266e-05,
"loss": 0.0143,
"step": 4580
},
{
"grad_norm": 0.3656264543533325,
"learning_rate": 9.14513649493187e-05,
"loss": 0.0212,
"step": 4590
},
{
"grad_norm": 0.4674840271472931,
"learning_rate": 9.140507661856187e-05,
"loss": 0.0153,
"step": 4600
},
{
"grad_norm": 0.4313472509384155,
"learning_rate": 9.135867508790661e-05,
"loss": 0.0214,
"step": 4610
},
{
"grad_norm": 0.3471619486808777,
"learning_rate": 9.131216048421291e-05,
"loss": 0.0165,
"step": 4620
},
{
"grad_norm": 0.4542539715766907,
"learning_rate": 9.126553293464998e-05,
"loss": 0.0189,
"step": 4630
},
{
"grad_norm": 0.47608688473701477,
"learning_rate": 9.121879256669572e-05,
"loss": 0.017,
"step": 4640
},
{
"grad_norm": 0.3959465026855469,
"learning_rate": 9.117193950813652e-05,
"loss": 0.0164,
"step": 4650
},
{
"grad_norm": 0.408431738615036,
"learning_rate": 9.112497388706685e-05,
"loss": 0.0255,
"step": 4660
},
{
"grad_norm": 0.4116475582122803,
"learning_rate": 9.10778958318889e-05,
"loss": 0.0174,
"step": 4670
},
{
"grad_norm": 0.3917919993400574,
"learning_rate": 9.103070547131232e-05,
"loss": 0.0199,
"step": 4680
},
{
"grad_norm": 0.3482106029987335,
"learning_rate": 9.098340293435375e-05,
"loss": 0.0179,
"step": 4690
},
{
"grad_norm": 0.34646838903427124,
"learning_rate": 9.093598835033649e-05,
"loss": 0.0174,
"step": 4700
},
{
"grad_norm": 0.39419376850128174,
"learning_rate": 9.088846184889021e-05,
"loss": 0.0191,
"step": 4710
},
{
"grad_norm": 0.4543268084526062,
"learning_rate": 9.084082355995057e-05,
"loss": 0.0213,
"step": 4720
},
{
"grad_norm": 0.4212946891784668,
"learning_rate": 9.079307361375882e-05,
"loss": 0.0181,
"step": 4730
},
{
"grad_norm": 0.3014923334121704,
"learning_rate": 9.074521214086149e-05,
"loss": 0.019,
"step": 4740
},
{
"grad_norm": 0.36527299880981445,
"learning_rate": 9.069723927211001e-05,
"loss": 0.0179,
"step": 4750
},
{
"grad_norm": 0.3752840757369995,
"learning_rate": 9.064915513866037e-05,
"loss": 0.0183,
"step": 4760
},
{
"grad_norm": 0.42201003432273865,
"learning_rate": 9.060095987197279e-05,
"loss": 0.0162,
"step": 4770
},
{
"grad_norm": 0.3307137191295624,
"learning_rate": 9.055265360381126e-05,
"loss": 0.0206,
"step": 4780
},
{
"grad_norm": 0.33322593569755554,
"learning_rate": 9.050423646624326e-05,
"loss": 0.016,
"step": 4790
},
{
"grad_norm": 0.35324618220329285,
"learning_rate": 9.045570859163943e-05,
"loss": 0.0194,
"step": 4800
},
{
"grad_norm": 0.427572637796402,
"learning_rate": 9.04070701126731e-05,
"loss": 0.015,
"step": 4810
},
{
"grad_norm": 0.3561609983444214,
"learning_rate": 9.035832116232001e-05,
"loss": 0.0145,
"step": 4820
},
{
"grad_norm": 0.37716561555862427,
"learning_rate": 9.030946187385796e-05,
"loss": 0.016,
"step": 4830
},
{
"grad_norm": 0.39859738945961,
"learning_rate": 9.026049238086635e-05,
"loss": 0.0178,
"step": 4840
},
{
"grad_norm": 0.4500395655632019,
"learning_rate": 9.021141281722591e-05,
"loss": 0.0202,
"step": 4850
},
{
"grad_norm": 0.34830138087272644,
"learning_rate": 9.01622233171183e-05,
"loss": 0.0169,
"step": 4860
},
{
"grad_norm": 0.3729107677936554,
"learning_rate": 9.011292401502574e-05,
"loss": 0.0212,
"step": 4870
},
{
"grad_norm": 0.3912448585033417,
"learning_rate": 9.006351504573063e-05,
"loss": 0.0146,
"step": 4880
},
{
"grad_norm": 0.4137353003025055,
"learning_rate": 9.001399654431519e-05,
"loss": 0.0171,
"step": 4890
},
{
"grad_norm": 0.4444160759449005,
"learning_rate": 8.996436864616116e-05,
"loss": 0.0162,
"step": 4900
},
{
"grad_norm": 0.3148241639137268,
"learning_rate": 8.991463148694925e-05,
"loss": 0.0191,
"step": 4910
},
{
"grad_norm": 0.4391416907310486,
"learning_rate": 8.986478520265902e-05,
"loss": 0.0187,
"step": 4920
},
{
"grad_norm": 0.4296688139438629,
"learning_rate": 8.981482992956827e-05,
"loss": 0.0143,
"step": 4930
},
{
"grad_norm": 0.29728299379348755,
"learning_rate": 8.976476580425282e-05,
"loss": 0.0148,
"step": 4940
},
{
"grad_norm": 0.4356195032596588,
"learning_rate": 8.971459296358606e-05,
"loss": 0.0287,
"step": 4950
},
{
"grad_norm": 0.4179481565952301,
"learning_rate": 8.966431154473864e-05,
"loss": 0.0157,
"step": 4960
},
{
"grad_norm": 0.3610477149486542,
"learning_rate": 8.961392168517803e-05,
"loss": 0.0159,
"step": 4970
},
{
"grad_norm": 0.34345686435699463,
"learning_rate": 8.956342352266821e-05,
"loss": 0.016,
"step": 4980
},
{
"grad_norm": 0.3698787987232208,
"learning_rate": 8.95128171952692e-05,
"loss": 0.0214,
"step": 4990
},
{
"grad_norm": 0.327648788690567,
"learning_rate": 8.946210284133676e-05,
"loss": 0.0173,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}