rubenfb23's picture
Upload folder using huggingface_hub
e064b6e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 23218,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004307003187182358,
"grad_norm": 0.6166621446609497,
"learning_rate": 4.220499569336779e-07,
"loss": 0.6745,
"step": 50
},
{
"epoch": 0.008614006374364717,
"grad_norm": 1.4086202383041382,
"learning_rate": 8.527131782945737e-07,
"loss": 0.665,
"step": 100
},
{
"epoch": 0.012921009561547075,
"grad_norm": 0.8809443712234497,
"learning_rate": 1.2833763996554696e-06,
"loss": 0.638,
"step": 150
},
{
"epoch": 0.017228012748729434,
"grad_norm": 2.4636318683624268,
"learning_rate": 1.7140396210163654e-06,
"loss": 0.5199,
"step": 200
},
{
"epoch": 0.021535015935911794,
"grad_norm": 4.417299270629883,
"learning_rate": 2.144702842377261e-06,
"loss": 0.295,
"step": 250
},
{
"epoch": 0.02584201912309415,
"grad_norm": 0.9570991396903992,
"learning_rate": 2.575366063738157e-06,
"loss": 0.2265,
"step": 300
},
{
"epoch": 0.03014902231027651,
"grad_norm": 5.928269863128662,
"learning_rate": 3.006029285099053e-06,
"loss": 0.1726,
"step": 350
},
{
"epoch": 0.03445602549745887,
"grad_norm": 3.2107982635498047,
"learning_rate": 3.436692506459949e-06,
"loss": 0.1693,
"step": 400
},
{
"epoch": 0.038763028684641224,
"grad_norm": 20.335567474365234,
"learning_rate": 3.867355727820845e-06,
"loss": 0.1624,
"step": 450
},
{
"epoch": 0.04307003187182359,
"grad_norm": 5.493223190307617,
"learning_rate": 4.2980189491817404e-06,
"loss": 0.1553,
"step": 500
},
{
"epoch": 0.047377035059005944,
"grad_norm": 2.554831027984619,
"learning_rate": 4.728682170542636e-06,
"loss": 0.1287,
"step": 550
},
{
"epoch": 0.0516840382461883,
"grad_norm": 7.436851978302002,
"learning_rate": 5.159345391903532e-06,
"loss": 0.1536,
"step": 600
},
{
"epoch": 0.055991041433370664,
"grad_norm": 3.6525373458862305,
"learning_rate": 5.590008613264428e-06,
"loss": 0.1418,
"step": 650
},
{
"epoch": 0.06029804462055302,
"grad_norm": 2.3898279666900635,
"learning_rate": 6.020671834625324e-06,
"loss": 0.1236,
"step": 700
},
{
"epoch": 0.06460504780773538,
"grad_norm": 5.284865856170654,
"learning_rate": 6.45133505598622e-06,
"loss": 0.1248,
"step": 750
},
{
"epoch": 0.06891205099491773,
"grad_norm": 8.725616455078125,
"learning_rate": 6.881998277347115e-06,
"loss": 0.1328,
"step": 800
},
{
"epoch": 0.07321905418210009,
"grad_norm": 5.838630199432373,
"learning_rate": 7.312661498708011e-06,
"loss": 0.1302,
"step": 850
},
{
"epoch": 0.07752605736928245,
"grad_norm": 2.8252811431884766,
"learning_rate": 7.743324720068907e-06,
"loss": 0.1318,
"step": 900
},
{
"epoch": 0.08183306055646482,
"grad_norm": 3.0458829402923584,
"learning_rate": 8.173987941429803e-06,
"loss": 0.1144,
"step": 950
},
{
"epoch": 0.08614006374364717,
"grad_norm": 12.496984481811523,
"learning_rate": 8.604651162790698e-06,
"loss": 0.1128,
"step": 1000
},
{
"epoch": 0.09044706693082953,
"grad_norm": 1.416028380393982,
"learning_rate": 9.035314384151595e-06,
"loss": 0.1091,
"step": 1050
},
{
"epoch": 0.09475407011801189,
"grad_norm": 1.2219202518463135,
"learning_rate": 9.46597760551249e-06,
"loss": 0.1188,
"step": 1100
},
{
"epoch": 0.09906107330519424,
"grad_norm": 5.654454708099365,
"learning_rate": 9.896640826873386e-06,
"loss": 0.1271,
"step": 1150
},
{
"epoch": 0.1033680764923766,
"grad_norm": 2.2038986682891846,
"learning_rate": 1.032730404823428e-05,
"loss": 0.1082,
"step": 1200
},
{
"epoch": 0.10767507967955896,
"grad_norm": 2.594078779220581,
"learning_rate": 1.0757967269595177e-05,
"loss": 0.1135,
"step": 1250
},
{
"epoch": 0.11198208286674133,
"grad_norm": 2.2295544147491455,
"learning_rate": 1.1188630490956073e-05,
"loss": 0.1231,
"step": 1300
},
{
"epoch": 0.11628908605392368,
"grad_norm": 6.695245742797852,
"learning_rate": 1.1619293712316968e-05,
"loss": 0.1113,
"step": 1350
},
{
"epoch": 0.12059608924110604,
"grad_norm": 2.6315550804138184,
"learning_rate": 1.2049956933677865e-05,
"loss": 0.1088,
"step": 1400
},
{
"epoch": 0.1249030924282884,
"grad_norm": 8.70291805267334,
"learning_rate": 1.2480620155038761e-05,
"loss": 0.0987,
"step": 1450
},
{
"epoch": 0.12921009561547075,
"grad_norm": 3.3458549976348877,
"learning_rate": 1.2911283376399657e-05,
"loss": 0.12,
"step": 1500
},
{
"epoch": 0.1335170988026531,
"grad_norm": 1.7857741117477417,
"learning_rate": 1.3341946597760554e-05,
"loss": 0.1224,
"step": 1550
},
{
"epoch": 0.13782410198983547,
"grad_norm": 2.5017483234405518,
"learning_rate": 1.3772609819121447e-05,
"loss": 0.1039,
"step": 1600
},
{
"epoch": 0.14213110517701782,
"grad_norm": 1.9605143070220947,
"learning_rate": 1.4203273040482343e-05,
"loss": 0.1018,
"step": 1650
},
{
"epoch": 0.14643810836420018,
"grad_norm": 1.5602117776870728,
"learning_rate": 1.463393626184324e-05,
"loss": 0.1084,
"step": 1700
},
{
"epoch": 0.15074511155138254,
"grad_norm": 1.4006476402282715,
"learning_rate": 1.5064599483204136e-05,
"loss": 0.1123,
"step": 1750
},
{
"epoch": 0.1550521147385649,
"grad_norm": 2.3718955516815186,
"learning_rate": 1.549526270456503e-05,
"loss": 0.1023,
"step": 1800
},
{
"epoch": 0.15935911792574725,
"grad_norm": 4.667544364929199,
"learning_rate": 1.5925925925925926e-05,
"loss": 0.1047,
"step": 1850
},
{
"epoch": 0.16366612111292964,
"grad_norm": 2.8184595108032227,
"learning_rate": 1.6356589147286824e-05,
"loss": 0.1057,
"step": 1900
},
{
"epoch": 0.167973124300112,
"grad_norm": 1.4751886129379272,
"learning_rate": 1.678725236864772e-05,
"loss": 0.0972,
"step": 1950
},
{
"epoch": 0.17228012748729435,
"grad_norm": 3.6895549297332764,
"learning_rate": 1.7217915590008613e-05,
"loss": 0.0996,
"step": 2000
},
{
"epoch": 0.1765871306744767,
"grad_norm": 1.049835443496704,
"learning_rate": 1.764857881136951e-05,
"loss": 0.0963,
"step": 2050
},
{
"epoch": 0.18089413386165906,
"grad_norm": 2.069823980331421,
"learning_rate": 1.8079242032730406e-05,
"loss": 0.0964,
"step": 2100
},
{
"epoch": 0.18520113704884142,
"grad_norm": 1.360253095626831,
"learning_rate": 1.85099052540913e-05,
"loss": 0.0903,
"step": 2150
},
{
"epoch": 0.18950814023602378,
"grad_norm": 3.293531656265259,
"learning_rate": 1.89405684754522e-05,
"loss": 0.1102,
"step": 2200
},
{
"epoch": 0.19381514342320613,
"grad_norm": 1.5345733165740967,
"learning_rate": 1.9371231696813094e-05,
"loss": 0.1011,
"step": 2250
},
{
"epoch": 0.1981221466103885,
"grad_norm": 1.7733877897262573,
"learning_rate": 1.980189491817399e-05,
"loss": 0.0786,
"step": 2300
},
{
"epoch": 0.20242914979757085,
"grad_norm": 1.409213662147522,
"learning_rate": 1.997415773353752e-05,
"loss": 0.1018,
"step": 2350
},
{
"epoch": 0.2067361529847532,
"grad_norm": 1.9454050064086914,
"learning_rate": 1.9926301684532928e-05,
"loss": 0.1121,
"step": 2400
},
{
"epoch": 0.21104315617193556,
"grad_norm": 0.5753230452537537,
"learning_rate": 1.987844563552833e-05,
"loss": 0.0796,
"step": 2450
},
{
"epoch": 0.21535015935911792,
"grad_norm": 6.277276039123535,
"learning_rate": 1.983058958652374e-05,
"loss": 0.1025,
"step": 2500
},
{
"epoch": 0.21965716254630027,
"grad_norm": 1.9022142887115479,
"learning_rate": 1.9782733537519143e-05,
"loss": 0.0961,
"step": 2550
},
{
"epoch": 0.22396416573348266,
"grad_norm": 1.920341968536377,
"learning_rate": 1.973487748851455e-05,
"loss": 0.0965,
"step": 2600
},
{
"epoch": 0.228271168920665,
"grad_norm": 3.8428711891174316,
"learning_rate": 1.9687021439509954e-05,
"loss": 0.0786,
"step": 2650
},
{
"epoch": 0.23257817210784737,
"grad_norm": 2.7966816425323486,
"learning_rate": 1.963916539050536e-05,
"loss": 0.1064,
"step": 2700
},
{
"epoch": 0.23688517529502973,
"grad_norm": 0.9752281904220581,
"learning_rate": 1.9591309341500768e-05,
"loss": 0.0938,
"step": 2750
},
{
"epoch": 0.24119217848221208,
"grad_norm": 0.9420919418334961,
"learning_rate": 1.9543453292496172e-05,
"loss": 0.0991,
"step": 2800
},
{
"epoch": 0.24549918166939444,
"grad_norm": 1.6354459524154663,
"learning_rate": 1.949559724349158e-05,
"loss": 0.096,
"step": 2850
},
{
"epoch": 0.2498061848565768,
"grad_norm": 0.6382321715354919,
"learning_rate": 1.9447741194486983e-05,
"loss": 0.086,
"step": 2900
},
{
"epoch": 0.25411318804375915,
"grad_norm": 3.3475544452667236,
"learning_rate": 1.939988514548239e-05,
"loss": 0.0938,
"step": 2950
},
{
"epoch": 0.2584201912309415,
"grad_norm": 1.1161267757415771,
"learning_rate": 1.9352029096477794e-05,
"loss": 0.088,
"step": 3000
},
{
"epoch": 0.26272719441812387,
"grad_norm": 2.1411211490631104,
"learning_rate": 1.93041730474732e-05,
"loss": 0.1006,
"step": 3050
},
{
"epoch": 0.2670341976053062,
"grad_norm": 5.084458827972412,
"learning_rate": 1.9256316998468606e-05,
"loss": 0.09,
"step": 3100
},
{
"epoch": 0.2713412007924886,
"grad_norm": 1.219672441482544,
"learning_rate": 1.9208460949464013e-05,
"loss": 0.0826,
"step": 3150
},
{
"epoch": 0.27564820397967094,
"grad_norm": 2.7958974838256836,
"learning_rate": 1.916060490045942e-05,
"loss": 0.0879,
"step": 3200
},
{
"epoch": 0.2799552071668533,
"grad_norm": 0.9086557626724243,
"learning_rate": 1.9112748851454824e-05,
"loss": 0.0926,
"step": 3250
},
{
"epoch": 0.28426221035403565,
"grad_norm": 3.29379940032959,
"learning_rate": 1.906489280245023e-05,
"loss": 0.076,
"step": 3300
},
{
"epoch": 0.288569213541218,
"grad_norm": 2.127718925476074,
"learning_rate": 1.901703675344564e-05,
"loss": 0.0822,
"step": 3350
},
{
"epoch": 0.29287621672840036,
"grad_norm": 1.128344178199768,
"learning_rate": 1.8969180704441042e-05,
"loss": 0.0878,
"step": 3400
},
{
"epoch": 0.2971832199155827,
"grad_norm": 0.8325080275535583,
"learning_rate": 1.892132465543645e-05,
"loss": 0.1006,
"step": 3450
},
{
"epoch": 0.3014902231027651,
"grad_norm": 1.3302809000015259,
"learning_rate": 1.8873468606431853e-05,
"loss": 0.0838,
"step": 3500
},
{
"epoch": 0.30579722628994743,
"grad_norm": 0.9956411719322205,
"learning_rate": 1.882561255742726e-05,
"loss": 0.0796,
"step": 3550
},
{
"epoch": 0.3101042294771298,
"grad_norm": 2.7951183319091797,
"learning_rate": 1.8777756508422668e-05,
"loss": 0.0746,
"step": 3600
},
{
"epoch": 0.31441123266431215,
"grad_norm": 0.9167538285255432,
"learning_rate": 1.8729900459418072e-05,
"loss": 0.0756,
"step": 3650
},
{
"epoch": 0.3187182358514945,
"grad_norm": 3.4193942546844482,
"learning_rate": 1.868204441041348e-05,
"loss": 0.0772,
"step": 3700
},
{
"epoch": 0.3230252390386769,
"grad_norm": 1.3220958709716797,
"learning_rate": 1.8634188361408883e-05,
"loss": 0.0843,
"step": 3750
},
{
"epoch": 0.32733224222585927,
"grad_norm": 0.9294602870941162,
"learning_rate": 1.858633231240429e-05,
"loss": 0.0813,
"step": 3800
},
{
"epoch": 0.33163924541304163,
"grad_norm": 1.0229344367980957,
"learning_rate": 1.8538476263399698e-05,
"loss": 0.0855,
"step": 3850
},
{
"epoch": 0.335946248600224,
"grad_norm": 2.287496566772461,
"learning_rate": 1.84906202143951e-05,
"loss": 0.0901,
"step": 3900
},
{
"epoch": 0.34025325178740634,
"grad_norm": 1.0064690113067627,
"learning_rate": 1.844276416539051e-05,
"loss": 0.091,
"step": 3950
},
{
"epoch": 0.3445602549745887,
"grad_norm": 1.040418028831482,
"learning_rate": 1.8394908116385913e-05,
"loss": 0.1087,
"step": 4000
},
{
"epoch": 0.34886725816177105,
"grad_norm": 1.4364063739776611,
"learning_rate": 1.834705206738132e-05,
"loss": 0.0868,
"step": 4050
},
{
"epoch": 0.3531742613489534,
"grad_norm": 1.8169975280761719,
"learning_rate": 1.8299196018376724e-05,
"loss": 0.0855,
"step": 4100
},
{
"epoch": 0.35748126453613577,
"grad_norm": 5.961976528167725,
"learning_rate": 1.825133996937213e-05,
"loss": 0.0704,
"step": 4150
},
{
"epoch": 0.3617882677233181,
"grad_norm": 1.6580802202224731,
"learning_rate": 1.8203483920367535e-05,
"loss": 0.09,
"step": 4200
},
{
"epoch": 0.3660952709105005,
"grad_norm": 2.251880168914795,
"learning_rate": 1.8155627871362942e-05,
"loss": 0.0891,
"step": 4250
},
{
"epoch": 0.37040227409768284,
"grad_norm": 2.918473720550537,
"learning_rate": 1.8107771822358346e-05,
"loss": 0.0877,
"step": 4300
},
{
"epoch": 0.3747092772848652,
"grad_norm": 0.5215052366256714,
"learning_rate": 1.8059915773353753e-05,
"loss": 0.0828,
"step": 4350
},
{
"epoch": 0.37901628047204755,
"grad_norm": 1.037458896636963,
"learning_rate": 1.801205972434916e-05,
"loss": 0.0692,
"step": 4400
},
{
"epoch": 0.3833232836592299,
"grad_norm": 0.9075079560279846,
"learning_rate": 1.7964203675344564e-05,
"loss": 0.0745,
"step": 4450
},
{
"epoch": 0.38763028684641226,
"grad_norm": 1.210403561592102,
"learning_rate": 1.791634762633997e-05,
"loss": 0.0814,
"step": 4500
},
{
"epoch": 0.3919372900335946,
"grad_norm": 1.3449194431304932,
"learning_rate": 1.7868491577335375e-05,
"loss": 0.0887,
"step": 4550
},
{
"epoch": 0.396244293220777,
"grad_norm": 0.8855172395706177,
"learning_rate": 1.7820635528330783e-05,
"loss": 0.0848,
"step": 4600
},
{
"epoch": 0.40055129640795933,
"grad_norm": 0.9978507161140442,
"learning_rate": 1.7772779479326187e-05,
"loss": 0.0743,
"step": 4650
},
{
"epoch": 0.4048582995951417,
"grad_norm": 1.354919195175171,
"learning_rate": 1.7724923430321594e-05,
"loss": 0.0942,
"step": 4700
},
{
"epoch": 0.40916530278232405,
"grad_norm": 2.4924111366271973,
"learning_rate": 1.7677067381316998e-05,
"loss": 0.0806,
"step": 4750
},
{
"epoch": 0.4134723059695064,
"grad_norm": 1.5234886407852173,
"learning_rate": 1.7629211332312405e-05,
"loss": 0.0753,
"step": 4800
},
{
"epoch": 0.41777930915668876,
"grad_norm": 1.6104071140289307,
"learning_rate": 1.758135528330781e-05,
"loss": 0.0708,
"step": 4850
},
{
"epoch": 0.4220863123438711,
"grad_norm": 0.8267254829406738,
"learning_rate": 1.7533499234303216e-05,
"loss": 0.0793,
"step": 4900
},
{
"epoch": 0.4263933155310535,
"grad_norm": 1.4067633152008057,
"learning_rate": 1.7485643185298623e-05,
"loss": 0.0951,
"step": 4950
},
{
"epoch": 0.43070031871823583,
"grad_norm": 0.9799253344535828,
"learning_rate": 1.7437787136294027e-05,
"loss": 0.0861,
"step": 5000
},
{
"epoch": 0.4350073219054182,
"grad_norm": 1.111114740371704,
"learning_rate": 1.7389931087289434e-05,
"loss": 0.0809,
"step": 5050
},
{
"epoch": 0.43931432509260054,
"grad_norm": 0.965411901473999,
"learning_rate": 1.7342075038284842e-05,
"loss": 0.0755,
"step": 5100
},
{
"epoch": 0.4436213282797829,
"grad_norm": 3.719944953918457,
"learning_rate": 1.7294218989280246e-05,
"loss": 0.0823,
"step": 5150
},
{
"epoch": 0.4479283314669653,
"grad_norm": 0.42969638109207153,
"learning_rate": 1.7246362940275653e-05,
"loss": 0.07,
"step": 5200
},
{
"epoch": 0.45223533465414767,
"grad_norm": 2.1813602447509766,
"learning_rate": 1.7198506891271057e-05,
"loss": 0.0706,
"step": 5250
},
{
"epoch": 0.45654233784133,
"grad_norm": 0.8767011761665344,
"learning_rate": 1.7150650842266464e-05,
"loss": 0.0798,
"step": 5300
},
{
"epoch": 0.4608493410285124,
"grad_norm": 0.997157096862793,
"learning_rate": 1.710279479326187e-05,
"loss": 0.0801,
"step": 5350
},
{
"epoch": 0.46515634421569474,
"grad_norm": 0.7616205215454102,
"learning_rate": 1.7054938744257275e-05,
"loss": 0.083,
"step": 5400
},
{
"epoch": 0.4694633474028771,
"grad_norm": 2.203051805496216,
"learning_rate": 1.7007082695252682e-05,
"loss": 0.0755,
"step": 5450
},
{
"epoch": 0.47377035059005945,
"grad_norm": 0.6811165809631348,
"learning_rate": 1.695922664624809e-05,
"loss": 0.0694,
"step": 5500
},
{
"epoch": 0.4780773537772418,
"grad_norm": 1.0467352867126465,
"learning_rate": 1.6911370597243494e-05,
"loss": 0.0773,
"step": 5550
},
{
"epoch": 0.48238435696442417,
"grad_norm": 1.0311206579208374,
"learning_rate": 1.68635145482389e-05,
"loss": 0.0826,
"step": 5600
},
{
"epoch": 0.4866913601516065,
"grad_norm": 4.649372577667236,
"learning_rate": 1.6815658499234305e-05,
"loss": 0.0909,
"step": 5650
},
{
"epoch": 0.4909983633387889,
"grad_norm": 3.3261115550994873,
"learning_rate": 1.6767802450229712e-05,
"loss": 0.0904,
"step": 5700
},
{
"epoch": 0.49530536652597124,
"grad_norm": 0.9026235342025757,
"learning_rate": 1.6719946401225116e-05,
"loss": 0.0781,
"step": 5750
},
{
"epoch": 0.4996123697131536,
"grad_norm": 1.0204756259918213,
"learning_rate": 1.6672090352220523e-05,
"loss": 0.0762,
"step": 5800
},
{
"epoch": 0.503919372900336,
"grad_norm": 1.9559473991394043,
"learning_rate": 1.6624234303215927e-05,
"loss": 0.0768,
"step": 5850
},
{
"epoch": 0.5082263760875183,
"grad_norm": 2.3736560344696045,
"learning_rate": 1.6576378254211334e-05,
"loss": 0.0613,
"step": 5900
},
{
"epoch": 0.5125333792747007,
"grad_norm": 0.923481285572052,
"learning_rate": 1.6528522205206738e-05,
"loss": 0.0796,
"step": 5950
},
{
"epoch": 0.516840382461883,
"grad_norm": 0.42188379168510437,
"learning_rate": 1.6480666156202145e-05,
"loss": 0.0733,
"step": 6000
},
{
"epoch": 0.5211473856490654,
"grad_norm": 1.1153289079666138,
"learning_rate": 1.6432810107197553e-05,
"loss": 0.0828,
"step": 6050
},
{
"epoch": 0.5254543888362477,
"grad_norm": 0.7091718912124634,
"learning_rate": 1.6384954058192956e-05,
"loss": 0.0612,
"step": 6100
},
{
"epoch": 0.5297613920234301,
"grad_norm": 0.7706901431083679,
"learning_rate": 1.6337098009188364e-05,
"loss": 0.0772,
"step": 6150
},
{
"epoch": 0.5340683952106124,
"grad_norm": 0.29516345262527466,
"learning_rate": 1.6289241960183768e-05,
"loss": 0.0665,
"step": 6200
},
{
"epoch": 0.5383753983977948,
"grad_norm": 1.5661741495132446,
"learning_rate": 1.6241385911179175e-05,
"loss": 0.0734,
"step": 6250
},
{
"epoch": 0.5426824015849772,
"grad_norm": 1.2785195112228394,
"learning_rate": 1.619352986217458e-05,
"loss": 0.0768,
"step": 6300
},
{
"epoch": 0.5469894047721595,
"grad_norm": 1.2388705015182495,
"learning_rate": 1.6145673813169986e-05,
"loss": 0.0782,
"step": 6350
},
{
"epoch": 0.5512964079593419,
"grad_norm": 0.7553074359893799,
"learning_rate": 1.609781776416539e-05,
"loss": 0.0764,
"step": 6400
},
{
"epoch": 0.5556034111465242,
"grad_norm": 1.6529933214187622,
"learning_rate": 1.6049961715160797e-05,
"loss": 0.0769,
"step": 6450
},
{
"epoch": 0.5599104143337066,
"grad_norm": 2.259467124938965,
"learning_rate": 1.60021056661562e-05,
"loss": 0.0805,
"step": 6500
},
{
"epoch": 0.5642174175208889,
"grad_norm": 0.834506630897522,
"learning_rate": 1.5954249617151608e-05,
"loss": 0.0701,
"step": 6550
},
{
"epoch": 0.5685244207080713,
"grad_norm": 0.7409648299217224,
"learning_rate": 1.5906393568147016e-05,
"loss": 0.0691,
"step": 6600
},
{
"epoch": 0.5728314238952537,
"grad_norm": 2.443349838256836,
"learning_rate": 1.585853751914242e-05,
"loss": 0.0789,
"step": 6650
},
{
"epoch": 0.577138427082436,
"grad_norm": 1.4207009077072144,
"learning_rate": 1.5810681470137827e-05,
"loss": 0.0701,
"step": 6700
},
{
"epoch": 0.5814454302696184,
"grad_norm": 1.454414963722229,
"learning_rate": 1.576282542113323e-05,
"loss": 0.0745,
"step": 6750
},
{
"epoch": 0.5857524334568007,
"grad_norm": 0.995476484298706,
"learning_rate": 1.5714969372128638e-05,
"loss": 0.0766,
"step": 6800
},
{
"epoch": 0.5900594366439831,
"grad_norm": 0.39929381012916565,
"learning_rate": 1.5667113323124045e-05,
"loss": 0.0731,
"step": 6850
},
{
"epoch": 0.5943664398311654,
"grad_norm": 4.782962799072266,
"learning_rate": 1.561925727411945e-05,
"loss": 0.078,
"step": 6900
},
{
"epoch": 0.5986734430183478,
"grad_norm": 0.7349683046340942,
"learning_rate": 1.5571401225114856e-05,
"loss": 0.0822,
"step": 6950
},
{
"epoch": 0.6029804462055302,
"grad_norm": 2.0340960025787354,
"learning_rate": 1.5523545176110263e-05,
"loss": 0.0634,
"step": 7000
},
{
"epoch": 0.6072874493927125,
"grad_norm": 0.9049922823905945,
"learning_rate": 1.5475689127105667e-05,
"loss": 0.0867,
"step": 7050
},
{
"epoch": 0.6115944525798949,
"grad_norm": 0.9008879065513611,
"learning_rate": 1.5427833078101075e-05,
"loss": 0.0693,
"step": 7100
},
{
"epoch": 0.6159014557670772,
"grad_norm": 0.8665018081665039,
"learning_rate": 1.537997702909648e-05,
"loss": 0.0639,
"step": 7150
},
{
"epoch": 0.6202084589542596,
"grad_norm": 0.8610183000564575,
"learning_rate": 1.5332120980091886e-05,
"loss": 0.0622,
"step": 7200
},
{
"epoch": 0.6245154621414419,
"grad_norm": 1.0662976503372192,
"learning_rate": 1.5284264931087293e-05,
"loss": 0.0761,
"step": 7250
},
{
"epoch": 0.6288224653286243,
"grad_norm": 1.417581558227539,
"learning_rate": 1.5236408882082697e-05,
"loss": 0.0666,
"step": 7300
},
{
"epoch": 0.6331294685158066,
"grad_norm": 1.198586344718933,
"learning_rate": 1.5188552833078102e-05,
"loss": 0.0722,
"step": 7350
},
{
"epoch": 0.637436471702989,
"grad_norm": 0.3623594045639038,
"learning_rate": 1.5140696784073508e-05,
"loss": 0.0701,
"step": 7400
},
{
"epoch": 0.6417434748901715,
"grad_norm": 1.3689919710159302,
"learning_rate": 1.5092840735068914e-05,
"loss": 0.0867,
"step": 7450
},
{
"epoch": 0.6460504780773538,
"grad_norm": 1.0699403285980225,
"learning_rate": 1.5044984686064319e-05,
"loss": 0.0646,
"step": 7500
},
{
"epoch": 0.6503574812645362,
"grad_norm": 0.4037761092185974,
"learning_rate": 1.4997128637059726e-05,
"loss": 0.059,
"step": 7550
},
{
"epoch": 0.6546644844517185,
"grad_norm": 1.1219407320022583,
"learning_rate": 1.494927258805513e-05,
"loss": 0.0776,
"step": 7600
},
{
"epoch": 0.6589714876389009,
"grad_norm": 0.5265269875526428,
"learning_rate": 1.4901416539050538e-05,
"loss": 0.0691,
"step": 7650
},
{
"epoch": 0.6632784908260833,
"grad_norm": 0.6277808547019958,
"learning_rate": 1.4853560490045945e-05,
"loss": 0.067,
"step": 7700
},
{
"epoch": 0.6675854940132656,
"grad_norm": 1.1883065700531006,
"learning_rate": 1.4805704441041349e-05,
"loss": 0.0735,
"step": 7750
},
{
"epoch": 0.671892497200448,
"grad_norm": 1.3201600313186646,
"learning_rate": 1.4757848392036756e-05,
"loss": 0.071,
"step": 7800
},
{
"epoch": 0.6761995003876303,
"grad_norm": 0.35750851035118103,
"learning_rate": 1.470999234303216e-05,
"loss": 0.0705,
"step": 7850
},
{
"epoch": 0.6805065035748127,
"grad_norm": 1.2730865478515625,
"learning_rate": 1.4662136294027567e-05,
"loss": 0.0753,
"step": 7900
},
{
"epoch": 0.684813506761995,
"grad_norm": 2.2789065837860107,
"learning_rate": 1.4614280245022971e-05,
"loss": 0.0703,
"step": 7950
},
{
"epoch": 0.6891205099491774,
"grad_norm": 1.8654379844665527,
"learning_rate": 1.4566424196018378e-05,
"loss": 0.0752,
"step": 8000
},
{
"epoch": 0.6934275131363598,
"grad_norm": 0.888477623462677,
"learning_rate": 1.4518568147013784e-05,
"loss": 0.0727,
"step": 8050
},
{
"epoch": 0.6977345163235421,
"grad_norm": 1.23393714427948,
"learning_rate": 1.447071209800919e-05,
"loss": 0.0633,
"step": 8100
},
{
"epoch": 0.7020415195107245,
"grad_norm": 0.9582380652427673,
"learning_rate": 1.4422856049004595e-05,
"loss": 0.075,
"step": 8150
},
{
"epoch": 0.7063485226979068,
"grad_norm": 0.9180455207824707,
"learning_rate": 1.4375e-05,
"loss": 0.067,
"step": 8200
},
{
"epoch": 0.7106555258850892,
"grad_norm": 1.2393083572387695,
"learning_rate": 1.4327143950995408e-05,
"loss": 0.0634,
"step": 8250
},
{
"epoch": 0.7149625290722715,
"grad_norm": 1.2908138036727905,
"learning_rate": 1.4279287901990813e-05,
"loss": 0.0787,
"step": 8300
},
{
"epoch": 0.7192695322594539,
"grad_norm": 2.0125656127929688,
"learning_rate": 1.4231431852986219e-05,
"loss": 0.0806,
"step": 8350
},
{
"epoch": 0.7235765354466362,
"grad_norm": 1.1681885719299316,
"learning_rate": 1.4183575803981624e-05,
"loss": 0.0657,
"step": 8400
},
{
"epoch": 0.7278835386338186,
"grad_norm": 1.7651474475860596,
"learning_rate": 1.4135719754977032e-05,
"loss": 0.0678,
"step": 8450
},
{
"epoch": 0.732190541821001,
"grad_norm": 1.4047213792800903,
"learning_rate": 1.4087863705972436e-05,
"loss": 0.0561,
"step": 8500
},
{
"epoch": 0.7364975450081833,
"grad_norm": 1.0144027471542358,
"learning_rate": 1.4040007656967843e-05,
"loss": 0.0704,
"step": 8550
},
{
"epoch": 0.7408045481953657,
"grad_norm": 1.1934571266174316,
"learning_rate": 1.3992151607963247e-05,
"loss": 0.0612,
"step": 8600
},
{
"epoch": 0.745111551382548,
"grad_norm": 1.2061142921447754,
"learning_rate": 1.3944295558958654e-05,
"loss": 0.0681,
"step": 8650
},
{
"epoch": 0.7494185545697304,
"grad_norm": 1.0598750114440918,
"learning_rate": 1.3896439509954058e-05,
"loss": 0.0719,
"step": 8700
},
{
"epoch": 0.7537255577569127,
"grad_norm": 0.9914436340332031,
"learning_rate": 1.3848583460949465e-05,
"loss": 0.0731,
"step": 8750
},
{
"epoch": 0.7580325609440951,
"grad_norm": 2.260218620300293,
"learning_rate": 1.3800727411944872e-05,
"loss": 0.0648,
"step": 8800
},
{
"epoch": 0.7623395641312775,
"grad_norm": 0.8335168957710266,
"learning_rate": 1.3752871362940276e-05,
"loss": 0.0627,
"step": 8850
},
{
"epoch": 0.7666465673184598,
"grad_norm": 1.749588131904602,
"learning_rate": 1.3705015313935683e-05,
"loss": 0.0698,
"step": 8900
},
{
"epoch": 0.7709535705056422,
"grad_norm": 1.2013710737228394,
"learning_rate": 1.3657159264931087e-05,
"loss": 0.0734,
"step": 8950
},
{
"epoch": 0.7752605736928245,
"grad_norm": 1.2394059896469116,
"learning_rate": 1.3609303215926495e-05,
"loss": 0.0644,
"step": 9000
},
{
"epoch": 0.7795675768800069,
"grad_norm": 2.5147886276245117,
"learning_rate": 1.35614471669219e-05,
"loss": 0.0752,
"step": 9050
},
{
"epoch": 0.7838745800671892,
"grad_norm": 0.8637904524803162,
"learning_rate": 1.3513591117917306e-05,
"loss": 0.0627,
"step": 9100
},
{
"epoch": 0.7881815832543716,
"grad_norm": 1.1269769668579102,
"learning_rate": 1.3465735068912711e-05,
"loss": 0.0685,
"step": 9150
},
{
"epoch": 0.792488586441554,
"grad_norm": 0.3503759801387787,
"learning_rate": 1.3417879019908119e-05,
"loss": 0.0674,
"step": 9200
},
{
"epoch": 0.7967955896287363,
"grad_norm": 1.6583272218704224,
"learning_rate": 1.3370022970903522e-05,
"loss": 0.0617,
"step": 9250
},
{
"epoch": 0.8011025928159187,
"grad_norm": 0.49781036376953125,
"learning_rate": 1.332216692189893e-05,
"loss": 0.0483,
"step": 9300
},
{
"epoch": 0.805409596003101,
"grad_norm": 0.423948734998703,
"learning_rate": 1.3274310872894335e-05,
"loss": 0.0651,
"step": 9350
},
{
"epoch": 0.8097165991902834,
"grad_norm": 3.2630441188812256,
"learning_rate": 1.322645482388974e-05,
"loss": 0.0751,
"step": 9400
},
{
"epoch": 0.8140236023774657,
"grad_norm": 1.1352860927581787,
"learning_rate": 1.3178598774885148e-05,
"loss": 0.0587,
"step": 9450
},
{
"epoch": 0.8183306055646481,
"grad_norm": 0.735758364200592,
"learning_rate": 1.3130742725880552e-05,
"loss": 0.0594,
"step": 9500
},
{
"epoch": 0.8226376087518305,
"grad_norm": 1.7869194746017456,
"learning_rate": 1.308288667687596e-05,
"loss": 0.0738,
"step": 9550
},
{
"epoch": 0.8269446119390128,
"grad_norm": 4.783473491668701,
"learning_rate": 1.3035030627871363e-05,
"loss": 0.0689,
"step": 9600
},
{
"epoch": 0.8312516151261952,
"grad_norm": 0.6986812949180603,
"learning_rate": 1.298717457886677e-05,
"loss": 0.0683,
"step": 9650
},
{
"epoch": 0.8355586183133775,
"grad_norm": 0.807700514793396,
"learning_rate": 1.2939318529862174e-05,
"loss": 0.0724,
"step": 9700
},
{
"epoch": 0.8398656215005599,
"grad_norm": 1.0882675647735596,
"learning_rate": 1.2891462480857581e-05,
"loss": 0.0706,
"step": 9750
},
{
"epoch": 0.8441726246877422,
"grad_norm": 1.652010440826416,
"learning_rate": 1.2843606431852987e-05,
"loss": 0.0592,
"step": 9800
},
{
"epoch": 0.8484796278749246,
"grad_norm": 1.6742961406707764,
"learning_rate": 1.2795750382848393e-05,
"loss": 0.0754,
"step": 9850
},
{
"epoch": 0.852786631062107,
"grad_norm": 0.7621288895606995,
"learning_rate": 1.27478943338438e-05,
"loss": 0.0575,
"step": 9900
},
{
"epoch": 0.8570936342492893,
"grad_norm": 1.4449315071105957,
"learning_rate": 1.2700038284839204e-05,
"loss": 0.0696,
"step": 9950
},
{
"epoch": 0.8614006374364717,
"grad_norm": 0.8159062266349792,
"learning_rate": 1.2652182235834611e-05,
"loss": 0.0662,
"step": 10000
},
{
"epoch": 0.865707640623654,
"grad_norm": 1.413529634475708,
"learning_rate": 1.2604326186830017e-05,
"loss": 0.0701,
"step": 10050
},
{
"epoch": 0.8700146438108364,
"grad_norm": 1.043516755104065,
"learning_rate": 1.2556470137825422e-05,
"loss": 0.0666,
"step": 10100
},
{
"epoch": 0.8743216469980187,
"grad_norm": 1.4546676874160767,
"learning_rate": 1.2508614088820828e-05,
"loss": 0.0565,
"step": 10150
},
{
"epoch": 0.8786286501852011,
"grad_norm": 1.4287497997283936,
"learning_rate": 1.2460758039816235e-05,
"loss": 0.0775,
"step": 10200
},
{
"epoch": 0.8829356533723834,
"grad_norm": 0.24882686138153076,
"learning_rate": 1.2412901990811639e-05,
"loss": 0.0545,
"step": 10250
},
{
"epoch": 0.8872426565595658,
"grad_norm": 0.7413391470909119,
"learning_rate": 1.2365045941807046e-05,
"loss": 0.067,
"step": 10300
},
{
"epoch": 0.8915496597467482,
"grad_norm": 0.6108767986297607,
"learning_rate": 1.231718989280245e-05,
"loss": 0.059,
"step": 10350
},
{
"epoch": 0.8958566629339306,
"grad_norm": 1.301120400428772,
"learning_rate": 1.2269333843797857e-05,
"loss": 0.061,
"step": 10400
},
{
"epoch": 0.900163666121113,
"grad_norm": 0.744163990020752,
"learning_rate": 1.2221477794793264e-05,
"loss": 0.0572,
"step": 10450
},
{
"epoch": 0.9044706693082953,
"grad_norm": 0.7539933323860168,
"learning_rate": 1.2173621745788668e-05,
"loss": 0.0856,
"step": 10500
},
{
"epoch": 0.9087776724954777,
"grad_norm": 0.5975971817970276,
"learning_rate": 1.2125765696784076e-05,
"loss": 0.0623,
"step": 10550
},
{
"epoch": 0.91308467568266,
"grad_norm": 1.0527843236923218,
"learning_rate": 1.207790964777948e-05,
"loss": 0.0564,
"step": 10600
},
{
"epoch": 0.9173916788698424,
"grad_norm": 1.154876947402954,
"learning_rate": 1.2030053598774887e-05,
"loss": 0.0741,
"step": 10650
},
{
"epoch": 0.9216986820570248,
"grad_norm": 0.895031213760376,
"learning_rate": 1.198219754977029e-05,
"loss": 0.0734,
"step": 10700
},
{
"epoch": 0.9260056852442071,
"grad_norm": 1.088548183441162,
"learning_rate": 1.1934341500765698e-05,
"loss": 0.0563,
"step": 10750
},
{
"epoch": 0.9303126884313895,
"grad_norm": 0.8421652913093567,
"learning_rate": 1.1886485451761103e-05,
"loss": 0.0691,
"step": 10800
},
{
"epoch": 0.9346196916185718,
"grad_norm": 2.2791976928710938,
"learning_rate": 1.1838629402756509e-05,
"loss": 0.0814,
"step": 10850
},
{
"epoch": 0.9389266948057542,
"grad_norm": 0.9922728538513184,
"learning_rate": 1.1790773353751915e-05,
"loss": 0.0672,
"step": 10900
},
{
"epoch": 0.9432336979929365,
"grad_norm": 0.9205858111381531,
"learning_rate": 1.1742917304747322e-05,
"loss": 0.0666,
"step": 10950
},
{
"epoch": 0.9475407011801189,
"grad_norm": 1.29545259475708,
"learning_rate": 1.1695061255742727e-05,
"loss": 0.0593,
"step": 11000
},
{
"epoch": 0.9518477043673013,
"grad_norm": 2.6823294162750244,
"learning_rate": 1.1647205206738133e-05,
"loss": 0.0583,
"step": 11050
},
{
"epoch": 0.9561547075544836,
"grad_norm": 1.1763588190078735,
"learning_rate": 1.1599349157733539e-05,
"loss": 0.0658,
"step": 11100
},
{
"epoch": 0.960461710741666,
"grad_norm": 0.281956285238266,
"learning_rate": 1.1551493108728944e-05,
"loss": 0.0564,
"step": 11150
},
{
"epoch": 0.9647687139288483,
"grad_norm": 1.5565913915634155,
"learning_rate": 1.1503637059724351e-05,
"loss": 0.0742,
"step": 11200
},
{
"epoch": 0.9690757171160307,
"grad_norm": 0.7583732604980469,
"learning_rate": 1.1455781010719755e-05,
"loss": 0.0641,
"step": 11250
},
{
"epoch": 0.973382720303213,
"grad_norm": 1.4207026958465576,
"learning_rate": 1.1407924961715163e-05,
"loss": 0.0622,
"step": 11300
},
{
"epoch": 0.9776897234903954,
"grad_norm": 2.625925302505493,
"learning_rate": 1.1360068912710566e-05,
"loss": 0.0672,
"step": 11350
},
{
"epoch": 0.9819967266775778,
"grad_norm": 0.8742978572845459,
"learning_rate": 1.1312212863705974e-05,
"loss": 0.0709,
"step": 11400
},
{
"epoch": 0.9863037298647601,
"grad_norm": 0.6801431179046631,
"learning_rate": 1.1264356814701378e-05,
"loss": 0.0624,
"step": 11450
},
{
"epoch": 0.9906107330519425,
"grad_norm": 0.4742066562175751,
"learning_rate": 1.1216500765696785e-05,
"loss": 0.0764,
"step": 11500
},
{
"epoch": 0.9949177362391248,
"grad_norm": 1.6330316066741943,
"learning_rate": 1.1168644716692192e-05,
"loss": 0.0731,
"step": 11550
},
{
"epoch": 0.9992247394263072,
"grad_norm": 0.3583783209323883,
"learning_rate": 1.1120788667687596e-05,
"loss": 0.055,
"step": 11600
},
{
"epoch": 1.0035317426134895,
"grad_norm": 0.871157169342041,
"learning_rate": 1.1072932618683003e-05,
"loss": 0.0559,
"step": 11650
},
{
"epoch": 1.007838745800672,
"grad_norm": 1.6235480308532715,
"learning_rate": 1.1025076569678409e-05,
"loss": 0.0504,
"step": 11700
},
{
"epoch": 1.0121457489878543,
"grad_norm": 1.0415027141571045,
"learning_rate": 1.0977220520673814e-05,
"loss": 0.0621,
"step": 11750
},
{
"epoch": 1.0164527521750366,
"grad_norm": 1.9939746856689453,
"learning_rate": 1.092936447166922e-05,
"loss": 0.0543,
"step": 11800
},
{
"epoch": 1.020759755362219,
"grad_norm": 1.6792505979537964,
"learning_rate": 1.0881508422664625e-05,
"loss": 0.05,
"step": 11850
},
{
"epoch": 1.0250667585494013,
"grad_norm": 1.1362611055374146,
"learning_rate": 1.0833652373660031e-05,
"loss": 0.0582,
"step": 11900
},
{
"epoch": 1.0293737617365837,
"grad_norm": 0.8512780070304871,
"learning_rate": 1.0785796324655438e-05,
"loss": 0.0506,
"step": 11950
},
{
"epoch": 1.033680764923766,
"grad_norm": 1.2007173299789429,
"learning_rate": 1.0737940275650842e-05,
"loss": 0.057,
"step": 12000
},
{
"epoch": 1.0379877681109484,
"grad_norm": 0.45064929127693176,
"learning_rate": 1.069008422664625e-05,
"loss": 0.0516,
"step": 12050
},
{
"epoch": 1.0422947712981308,
"grad_norm": 1.5519843101501465,
"learning_rate": 1.0642228177641657e-05,
"loss": 0.0498,
"step": 12100
},
{
"epoch": 1.046601774485313,
"grad_norm": 2.6113767623901367,
"learning_rate": 1.059437212863706e-05,
"loss": 0.0475,
"step": 12150
},
{
"epoch": 1.0509087776724955,
"grad_norm": 1.725409984588623,
"learning_rate": 1.0546516079632468e-05,
"loss": 0.0549,
"step": 12200
},
{
"epoch": 1.0552157808596778,
"grad_norm": 1.6692222356796265,
"learning_rate": 1.0498660030627872e-05,
"loss": 0.0541,
"step": 12250
},
{
"epoch": 1.0595227840468602,
"grad_norm": 0.3569232225418091,
"learning_rate": 1.0450803981623279e-05,
"loss": 0.0446,
"step": 12300
},
{
"epoch": 1.0638297872340425,
"grad_norm": 1.5374314785003662,
"learning_rate": 1.0402947932618683e-05,
"loss": 0.0591,
"step": 12350
},
{
"epoch": 1.068136790421225,
"grad_norm": 0.7895877957344055,
"learning_rate": 1.035509188361409e-05,
"loss": 0.0502,
"step": 12400
},
{
"epoch": 1.0724437936084072,
"grad_norm": 2.4290881156921387,
"learning_rate": 1.0307235834609494e-05,
"loss": 0.0449,
"step": 12450
},
{
"epoch": 1.0767507967955896,
"grad_norm": 0.4035605192184448,
"learning_rate": 1.0259379785604901e-05,
"loss": 0.0561,
"step": 12500
},
{
"epoch": 1.081057799982772,
"grad_norm": 3.102343797683716,
"learning_rate": 1.0211523736600307e-05,
"loss": 0.055,
"step": 12550
},
{
"epoch": 1.0853648031699543,
"grad_norm": 2.905097246170044,
"learning_rate": 1.0163667687595712e-05,
"loss": 0.0486,
"step": 12600
},
{
"epoch": 1.0896718063571367,
"grad_norm": 1.0258862972259521,
"learning_rate": 1.011581163859112e-05,
"loss": 0.0562,
"step": 12650
},
{
"epoch": 1.093978809544319,
"grad_norm": 1.214162826538086,
"learning_rate": 1.0067955589586525e-05,
"loss": 0.0556,
"step": 12700
},
{
"epoch": 1.0982858127315014,
"grad_norm": 2.0131213665008545,
"learning_rate": 1.002009954058193e-05,
"loss": 0.0616,
"step": 12750
},
{
"epoch": 1.1025928159186837,
"grad_norm": 1.3988288640975952,
"learning_rate": 9.972243491577336e-06,
"loss": 0.0558,
"step": 12800
},
{
"epoch": 1.106899819105866,
"grad_norm": 1.7865221500396729,
"learning_rate": 9.924387442572742e-06,
"loss": 0.056,
"step": 12850
},
{
"epoch": 1.1112068222930485,
"grad_norm": 1.5038090944290161,
"learning_rate": 9.876531393568147e-06,
"loss": 0.0517,
"step": 12900
},
{
"epoch": 1.1155138254802308,
"grad_norm": 0.6820291876792908,
"learning_rate": 9.828675344563555e-06,
"loss": 0.0637,
"step": 12950
},
{
"epoch": 1.1198208286674132,
"grad_norm": 3.0986287593841553,
"learning_rate": 9.78081929555896e-06,
"loss": 0.0521,
"step": 13000
},
{
"epoch": 1.1241278318545955,
"grad_norm": 2.395266056060791,
"learning_rate": 9.732963246554366e-06,
"loss": 0.0463,
"step": 13050
},
{
"epoch": 1.1284348350417779,
"grad_norm": 2.018718719482422,
"learning_rate": 9.685107197549771e-06,
"loss": 0.065,
"step": 13100
},
{
"epoch": 1.1327418382289602,
"grad_norm": 2.0893940925598145,
"learning_rate": 9.637251148545177e-06,
"loss": 0.0471,
"step": 13150
},
{
"epoch": 1.1370488414161426,
"grad_norm": 0.0586327388882637,
"learning_rate": 9.589395099540583e-06,
"loss": 0.051,
"step": 13200
},
{
"epoch": 1.141355844603325,
"grad_norm": 2.719393014907837,
"learning_rate": 9.541539050535988e-06,
"loss": 0.054,
"step": 13250
},
{
"epoch": 1.1456628477905073,
"grad_norm": 3.360701084136963,
"learning_rate": 9.493683001531394e-06,
"loss": 0.0475,
"step": 13300
},
{
"epoch": 1.1499698509776897,
"grad_norm": 2.0324788093566895,
"learning_rate": 9.4458269525268e-06,
"loss": 0.0572,
"step": 13350
},
{
"epoch": 1.154276854164872,
"grad_norm": 3.841675281524658,
"learning_rate": 9.397970903522205e-06,
"loss": 0.0455,
"step": 13400
},
{
"epoch": 1.1585838573520544,
"grad_norm": 1.2644069194793701,
"learning_rate": 9.350114854517612e-06,
"loss": 0.0551,
"step": 13450
},
{
"epoch": 1.1628908605392367,
"grad_norm": 1.7926990985870361,
"learning_rate": 9.302258805513018e-06,
"loss": 0.0561,
"step": 13500
},
{
"epoch": 1.167197863726419,
"grad_norm": 1.203897476196289,
"learning_rate": 9.254402756508423e-06,
"loss": 0.0595,
"step": 13550
},
{
"epoch": 1.1715048669136015,
"grad_norm": 2.352569341659546,
"learning_rate": 9.206546707503829e-06,
"loss": 0.0526,
"step": 13600
},
{
"epoch": 1.1758118701007838,
"grad_norm": 1.1437283754348755,
"learning_rate": 9.158690658499236e-06,
"loss": 0.0564,
"step": 13650
},
{
"epoch": 1.1801188732879662,
"grad_norm": 0.766175389289856,
"learning_rate": 9.110834609494642e-06,
"loss": 0.0471,
"step": 13700
},
{
"epoch": 1.1844258764751485,
"grad_norm": 1.0126721858978271,
"learning_rate": 9.062978560490047e-06,
"loss": 0.0597,
"step": 13750
},
{
"epoch": 1.1887328796623309,
"grad_norm": 0.4978892505168915,
"learning_rate": 9.015122511485453e-06,
"loss": 0.0528,
"step": 13800
},
{
"epoch": 1.1930398828495132,
"grad_norm": 2.2827069759368896,
"learning_rate": 8.967266462480858e-06,
"loss": 0.0529,
"step": 13850
},
{
"epoch": 1.1973468860366956,
"grad_norm": 1.0703896284103394,
"learning_rate": 8.919410413476264e-06,
"loss": 0.0499,
"step": 13900
},
{
"epoch": 1.201653889223878,
"grad_norm": 1.4585344791412354,
"learning_rate": 8.87155436447167e-06,
"loss": 0.0577,
"step": 13950
},
{
"epoch": 1.2059608924110603,
"grad_norm": 1.5076733827590942,
"learning_rate": 8.823698315467075e-06,
"loss": 0.0523,
"step": 14000
},
{
"epoch": 1.2102678955982427,
"grad_norm": 0.6510587334632874,
"learning_rate": 8.775842266462482e-06,
"loss": 0.0485,
"step": 14050
},
{
"epoch": 1.214574898785425,
"grad_norm": 1.1064165830612183,
"learning_rate": 8.727986217457888e-06,
"loss": 0.0476,
"step": 14100
},
{
"epoch": 1.2188819019726074,
"grad_norm": 0.22263744473457336,
"learning_rate": 8.680130168453293e-06,
"loss": 0.0518,
"step": 14150
},
{
"epoch": 1.2231889051597897,
"grad_norm": 0.7431623935699463,
"learning_rate": 8.632274119448699e-06,
"loss": 0.0531,
"step": 14200
},
{
"epoch": 1.227495908346972,
"grad_norm": 3.2457995414733887,
"learning_rate": 8.584418070444104e-06,
"loss": 0.0417,
"step": 14250
},
{
"epoch": 1.2318029115341544,
"grad_norm": 2.163710117340088,
"learning_rate": 8.53656202143951e-06,
"loss": 0.0519,
"step": 14300
},
{
"epoch": 1.2361099147213368,
"grad_norm": 1.5874712467193604,
"learning_rate": 8.488705972434916e-06,
"loss": 0.0603,
"step": 14350
},
{
"epoch": 1.2404169179085192,
"grad_norm": 1.1613589525222778,
"learning_rate": 8.440849923430323e-06,
"loss": 0.0507,
"step": 14400
},
{
"epoch": 1.2447239210957015,
"grad_norm": 1.43361234664917,
"learning_rate": 8.392993874425728e-06,
"loss": 0.0519,
"step": 14450
},
{
"epoch": 1.2490309242828839,
"grad_norm": 1.9301427602767944,
"learning_rate": 8.345137825421134e-06,
"loss": 0.0648,
"step": 14500
},
{
"epoch": 1.2533379274700662,
"grad_norm": 1.192642331123352,
"learning_rate": 8.29728177641654e-06,
"loss": 0.0605,
"step": 14550
},
{
"epoch": 1.2576449306572486,
"grad_norm": 1.3436222076416016,
"learning_rate": 8.249425727411947e-06,
"loss": 0.0516,
"step": 14600
},
{
"epoch": 1.261951933844431,
"grad_norm": 0.6770356297492981,
"learning_rate": 8.201569678407352e-06,
"loss": 0.0519,
"step": 14650
},
{
"epoch": 1.2662589370316133,
"grad_norm": 0.7577407360076904,
"learning_rate": 8.153713629402758e-06,
"loss": 0.0491,
"step": 14700
},
{
"epoch": 1.2705659402187957,
"grad_norm": 2.2546918392181396,
"learning_rate": 8.105857580398164e-06,
"loss": 0.0449,
"step": 14750
},
{
"epoch": 1.274872943405978,
"grad_norm": 1.6185483932495117,
"learning_rate": 8.058001531393569e-06,
"loss": 0.0578,
"step": 14800
},
{
"epoch": 1.2791799465931604,
"grad_norm": 0.1822790950536728,
"learning_rate": 8.010145482388975e-06,
"loss": 0.0364,
"step": 14850
},
{
"epoch": 1.2834869497803427,
"grad_norm": 1.012245774269104,
"learning_rate": 7.96228943338438e-06,
"loss": 0.0497,
"step": 14900
},
{
"epoch": 1.287793952967525,
"grad_norm": 1.4130667448043823,
"learning_rate": 7.914433384379786e-06,
"loss": 0.0471,
"step": 14950
},
{
"epoch": 1.2921009561547074,
"grad_norm": 0.2369040846824646,
"learning_rate": 7.866577335375191e-06,
"loss": 0.0491,
"step": 15000
},
{
"epoch": 1.2964079593418898,
"grad_norm": 1.4658520221710205,
"learning_rate": 7.818721286370597e-06,
"loss": 0.0459,
"step": 15050
},
{
"epoch": 1.3007149625290721,
"grad_norm": 1.0052220821380615,
"learning_rate": 7.770865237366003e-06,
"loss": 0.0533,
"step": 15100
},
{
"epoch": 1.3050219657162545,
"grad_norm": 1.026128888130188,
"learning_rate": 7.72300918836141e-06,
"loss": 0.0501,
"step": 15150
},
{
"epoch": 1.3093289689034369,
"grad_norm": 1.5102099180221558,
"learning_rate": 7.675153139356815e-06,
"loss": 0.0569,
"step": 15200
},
{
"epoch": 1.3136359720906192,
"grad_norm": 0.44090044498443604,
"learning_rate": 7.627297090352222e-06,
"loss": 0.0589,
"step": 15250
},
{
"epoch": 1.3179429752778016,
"grad_norm": 2.6066102981567383,
"learning_rate": 7.579441041347627e-06,
"loss": 0.0506,
"step": 15300
},
{
"epoch": 1.322249978464984,
"grad_norm": 1.068942666053772,
"learning_rate": 7.531584992343033e-06,
"loss": 0.0498,
"step": 15350
},
{
"epoch": 1.3265569816521663,
"grad_norm": 1.4411200284957886,
"learning_rate": 7.4837289433384385e-06,
"loss": 0.0507,
"step": 15400
},
{
"epoch": 1.3308639848393486,
"grad_norm": 2.66965913772583,
"learning_rate": 7.435872894333844e-06,
"loss": 0.0554,
"step": 15450
},
{
"epoch": 1.335170988026531,
"grad_norm": 0.7807905673980713,
"learning_rate": 7.38801684532925e-06,
"loss": 0.0582,
"step": 15500
},
{
"epoch": 1.3394779912137136,
"grad_norm": 1.0940531492233276,
"learning_rate": 7.340160796324656e-06,
"loss": 0.0481,
"step": 15550
},
{
"epoch": 1.343784994400896,
"grad_norm": 1.802668809890747,
"learning_rate": 7.2923047473200616e-06,
"loss": 0.0468,
"step": 15600
},
{
"epoch": 1.3480919975880783,
"grad_norm": 0.45175236463546753,
"learning_rate": 7.244448698315467e-06,
"loss": 0.0459,
"step": 15650
},
{
"epoch": 1.3523990007752607,
"grad_norm": 0.5869175791740417,
"learning_rate": 7.196592649310874e-06,
"loss": 0.0494,
"step": 15700
},
{
"epoch": 1.356706003962443,
"grad_norm": 0.7470104694366455,
"learning_rate": 7.14873660030628e-06,
"loss": 0.0513,
"step": 15750
},
{
"epoch": 1.3610130071496254,
"grad_norm": 2.0422937870025635,
"learning_rate": 7.1008805513016855e-06,
"loss": 0.0643,
"step": 15800
},
{
"epoch": 1.3653200103368077,
"grad_norm": 1.8812757730484009,
"learning_rate": 7.053024502297091e-06,
"loss": 0.0577,
"step": 15850
},
{
"epoch": 1.36962701352399,
"grad_norm": 0.21352899074554443,
"learning_rate": 7.005168453292497e-06,
"loss": 0.0459,
"step": 15900
},
{
"epoch": 1.3739340167111724,
"grad_norm": 0.8854315876960754,
"learning_rate": 6.957312404287902e-06,
"loss": 0.0472,
"step": 15950
},
{
"epoch": 1.3782410198983548,
"grad_norm": 1.2343345880508423,
"learning_rate": 6.909456355283309e-06,
"loss": 0.0447,
"step": 16000
},
{
"epoch": 1.3825480230855371,
"grad_norm": 1.955546259880066,
"learning_rate": 6.861600306278714e-06,
"loss": 0.0465,
"step": 16050
},
{
"epoch": 1.3868550262727195,
"grad_norm": 0.9426198601722717,
"learning_rate": 6.81374425727412e-06,
"loss": 0.0623,
"step": 16100
},
{
"epoch": 1.3911620294599019,
"grad_norm": 0.7671981453895569,
"learning_rate": 6.765888208269525e-06,
"loss": 0.0539,
"step": 16150
},
{
"epoch": 1.3954690326470842,
"grad_norm": 1.8504656553268433,
"learning_rate": 6.718032159264931e-06,
"loss": 0.0455,
"step": 16200
},
{
"epoch": 1.3997760358342666,
"grad_norm": 2.851039409637451,
"learning_rate": 6.670176110260338e-06,
"loss": 0.0434,
"step": 16250
},
{
"epoch": 1.404083039021449,
"grad_norm": 2.0784494876861572,
"learning_rate": 6.622320061255744e-06,
"loss": 0.0469,
"step": 16300
},
{
"epoch": 1.4083900422086313,
"grad_norm": 1.1852946281433105,
"learning_rate": 6.574464012251149e-06,
"loss": 0.0508,
"step": 16350
},
{
"epoch": 1.4126970453958136,
"grad_norm": 0.8519759774208069,
"learning_rate": 6.526607963246555e-06,
"loss": 0.0622,
"step": 16400
},
{
"epoch": 1.417004048582996,
"grad_norm": 1.7714693546295166,
"learning_rate": 6.4787519142419604e-06,
"loss": 0.045,
"step": 16450
},
{
"epoch": 1.4213110517701784,
"grad_norm": 1.9129570722579956,
"learning_rate": 6.430895865237367e-06,
"loss": 0.0527,
"step": 16500
},
{
"epoch": 1.4256180549573607,
"grad_norm": 0.8473785519599915,
"learning_rate": 6.383039816232772e-06,
"loss": 0.0438,
"step": 16550
},
{
"epoch": 1.429925058144543,
"grad_norm": 1.0529272556304932,
"learning_rate": 6.335183767228178e-06,
"loss": 0.0462,
"step": 16600
},
{
"epoch": 1.4342320613317254,
"grad_norm": 1.336285948753357,
"learning_rate": 6.2873277182235836e-06,
"loss": 0.0458,
"step": 16650
},
{
"epoch": 1.4385390645189078,
"grad_norm": 0.36205947399139404,
"learning_rate": 6.239471669218989e-06,
"loss": 0.0373,
"step": 16700
},
{
"epoch": 1.4428460677060901,
"grad_norm": 0.3680015504360199,
"learning_rate": 6.191615620214395e-06,
"loss": 0.0507,
"step": 16750
},
{
"epoch": 1.4471530708932725,
"grad_norm": 0.24794526398181915,
"learning_rate": 6.143759571209802e-06,
"loss": 0.0465,
"step": 16800
},
{
"epoch": 1.4514600740804549,
"grad_norm": 1.3948922157287598,
"learning_rate": 6.0959035222052075e-06,
"loss": 0.0427,
"step": 16850
},
{
"epoch": 1.4557670772676372,
"grad_norm": 0.9279671311378479,
"learning_rate": 6.048047473200613e-06,
"loss": 0.0509,
"step": 16900
},
{
"epoch": 1.4600740804548196,
"grad_norm": 1.4752745628356934,
"learning_rate": 6.0001914241960195e-06,
"loss": 0.0545,
"step": 16950
},
{
"epoch": 1.464381083642002,
"grad_norm": 1.0804786682128906,
"learning_rate": 5.952335375191425e-06,
"loss": 0.0488,
"step": 17000
},
{
"epoch": 1.4686880868291843,
"grad_norm": 2.13808536529541,
"learning_rate": 5.904479326186831e-06,
"loss": 0.0504,
"step": 17050
},
{
"epoch": 1.4729950900163666,
"grad_norm": 0.6244465708732605,
"learning_rate": 5.856623277182236e-06,
"loss": 0.0513,
"step": 17100
},
{
"epoch": 1.477302093203549,
"grad_norm": 1.500331163406372,
"learning_rate": 5.808767228177642e-06,
"loss": 0.049,
"step": 17150
},
{
"epoch": 1.4816090963907314,
"grad_norm": 1.1396780014038086,
"learning_rate": 5.760911179173047e-06,
"loss": 0.0537,
"step": 17200
},
{
"epoch": 1.4859160995779137,
"grad_norm": 1.0686885118484497,
"learning_rate": 5.713055130168454e-06,
"loss": 0.0524,
"step": 17250
},
{
"epoch": 1.490223102765096,
"grad_norm": 0.20028911530971527,
"learning_rate": 5.665199081163859e-06,
"loss": 0.0471,
"step": 17300
},
{
"epoch": 1.4945301059522784,
"grad_norm": 1.0594055652618408,
"learning_rate": 5.617343032159266e-06,
"loss": 0.0459,
"step": 17350
},
{
"epoch": 1.4988371091394608,
"grad_norm": 3.5541036128997803,
"learning_rate": 5.569486983154671e-06,
"loss": 0.056,
"step": 17400
},
{
"epoch": 1.5031441123266431,
"grad_norm": 1.243882417678833,
"learning_rate": 5.521630934150078e-06,
"loss": 0.056,
"step": 17450
},
{
"epoch": 1.5074511155138255,
"grad_norm": 0.7564629316329956,
"learning_rate": 5.473774885145483e-06,
"loss": 0.0469,
"step": 17500
},
{
"epoch": 1.5117581187010078,
"grad_norm": 1.5014382600784302,
"learning_rate": 5.425918836140889e-06,
"loss": 0.0467,
"step": 17550
},
{
"epoch": 1.5160651218881902,
"grad_norm": 1.7758488655090332,
"learning_rate": 5.378062787136294e-06,
"loss": 0.0503,
"step": 17600
},
{
"epoch": 1.5203721250753726,
"grad_norm": 0.6620356440544128,
"learning_rate": 5.3302067381317e-06,
"loss": 0.051,
"step": 17650
},
{
"epoch": 1.524679128262555,
"grad_norm": 1.3163670301437378,
"learning_rate": 5.2823506891271055e-06,
"loss": 0.0402,
"step": 17700
},
{
"epoch": 1.5289861314497373,
"grad_norm": 1.738950490951538,
"learning_rate": 5.234494640122512e-06,
"loss": 0.0419,
"step": 17750
},
{
"epoch": 1.5332931346369196,
"grad_norm": 1.325990080833435,
"learning_rate": 5.1866385911179175e-06,
"loss": 0.0524,
"step": 17800
},
{
"epoch": 1.537600137824102,
"grad_norm": 1.0886728763580322,
"learning_rate": 5.138782542113323e-06,
"loss": 0.0523,
"step": 17850
},
{
"epoch": 1.5419071410112843,
"grad_norm": 1.1603420972824097,
"learning_rate": 5.0909264931087295e-06,
"loss": 0.0444,
"step": 17900
},
{
"epoch": 1.5462141441984667,
"grad_norm": 0.38339659571647644,
"learning_rate": 5.043070444104136e-06,
"loss": 0.0481,
"step": 17950
},
{
"epoch": 1.550521147385649,
"grad_norm": 1.1160061359405518,
"learning_rate": 4.995214395099541e-06,
"loss": 0.0523,
"step": 18000
},
{
"epoch": 1.5548281505728314,
"grad_norm": 1.545622706413269,
"learning_rate": 4.947358346094947e-06,
"loss": 0.0551,
"step": 18050
},
{
"epoch": 1.5591351537600138,
"grad_norm": 0.8636265397071838,
"learning_rate": 4.899502297090353e-06,
"loss": 0.058,
"step": 18100
},
{
"epoch": 1.5634421569471961,
"grad_norm": 1.2045224905014038,
"learning_rate": 4.851646248085758e-06,
"loss": 0.0492,
"step": 18150
},
{
"epoch": 1.5677491601343785,
"grad_norm": 1.5447427034378052,
"learning_rate": 4.803790199081165e-06,
"loss": 0.0532,
"step": 18200
},
{
"epoch": 1.5720561633215608,
"grad_norm": 0.7917791604995728,
"learning_rate": 4.75593415007657e-06,
"loss": 0.0481,
"step": 18250
},
{
"epoch": 1.5763631665087432,
"grad_norm": 1.0425752401351929,
"learning_rate": 4.7080781010719766e-06,
"loss": 0.0606,
"step": 18300
},
{
"epoch": 1.5806701696959256,
"grad_norm": 1.5828264951705933,
"learning_rate": 4.660222052067382e-06,
"loss": 0.0514,
"step": 18350
},
{
"epoch": 1.584977172883108,
"grad_norm": 0.5627719759941101,
"learning_rate": 4.612366003062788e-06,
"loss": 0.0489,
"step": 18400
},
{
"epoch": 1.5892841760702903,
"grad_norm": 1.3349500894546509,
"learning_rate": 4.564509954058193e-06,
"loss": 0.0412,
"step": 18450
},
{
"epoch": 1.5935911792574726,
"grad_norm": 1.2263731956481934,
"learning_rate": 4.516653905053599e-06,
"loss": 0.0389,
"step": 18500
},
{
"epoch": 1.597898182444655,
"grad_norm": 0.26084914803504944,
"learning_rate": 4.468797856049004e-06,
"loss": 0.0496,
"step": 18550
},
{
"epoch": 1.6022051856318373,
"grad_norm": 2.95097017288208,
"learning_rate": 4.420941807044411e-06,
"loss": 0.0486,
"step": 18600
},
{
"epoch": 1.6065121888190197,
"grad_norm": 1.1969259977340698,
"learning_rate": 4.373085758039816e-06,
"loss": 0.0453,
"step": 18650
},
{
"epoch": 1.610819192006202,
"grad_norm": 1.319840908050537,
"learning_rate": 4.325229709035223e-06,
"loss": 0.0657,
"step": 18700
},
{
"epoch": 1.6151261951933844,
"grad_norm": 0.9771299958229065,
"learning_rate": 4.277373660030628e-06,
"loss": 0.0445,
"step": 18750
},
{
"epoch": 1.6194331983805668,
"grad_norm": 0.4943866431713104,
"learning_rate": 4.229517611026034e-06,
"loss": 0.0568,
"step": 18800
},
{
"epoch": 1.6237402015677491,
"grad_norm": 3.353970527648926,
"learning_rate": 4.1816615620214395e-06,
"loss": 0.0494,
"step": 18850
},
{
"epoch": 1.6280472047549315,
"grad_norm": 0.42139098048210144,
"learning_rate": 4.133805513016846e-06,
"loss": 0.0476,
"step": 18900
},
{
"epoch": 1.6323542079421138,
"grad_norm": 0.5033485889434814,
"learning_rate": 4.0859494640122515e-06,
"loss": 0.0414,
"step": 18950
},
{
"epoch": 1.6366612111292962,
"grad_norm": 5.167110443115234,
"learning_rate": 4.038093415007657e-06,
"loss": 0.045,
"step": 19000
},
{
"epoch": 1.6409682143164785,
"grad_norm": 1.553246259689331,
"learning_rate": 3.990237366003063e-06,
"loss": 0.0433,
"step": 19050
},
{
"epoch": 1.645275217503661,
"grad_norm": 2.0117363929748535,
"learning_rate": 3.942381316998469e-06,
"loss": 0.0575,
"step": 19100
},
{
"epoch": 1.6495822206908433,
"grad_norm": 0.43269413709640503,
"learning_rate": 3.894525267993875e-06,
"loss": 0.0564,
"step": 19150
},
{
"epoch": 1.6538892238780256,
"grad_norm": 1.9576497077941895,
"learning_rate": 3.846669218989281e-06,
"loss": 0.0449,
"step": 19200
},
{
"epoch": 1.6581962270652082,
"grad_norm": 0.1240052804350853,
"learning_rate": 3.7988131699846866e-06,
"loss": 0.0506,
"step": 19250
},
{
"epoch": 1.6625032302523906,
"grad_norm": 0.18176259100437164,
"learning_rate": 3.750957120980092e-06,
"loss": 0.0526,
"step": 19300
},
{
"epoch": 1.666810233439573,
"grad_norm": 0.851411759853363,
"learning_rate": 3.7031010719754977e-06,
"loss": 0.048,
"step": 19350
},
{
"epoch": 1.6711172366267553,
"grad_norm": 1.481475591659546,
"learning_rate": 3.6552450229709037e-06,
"loss": 0.0367,
"step": 19400
},
{
"epoch": 1.6754242398139376,
"grad_norm": 1.092772126197815,
"learning_rate": 3.6073889739663097e-06,
"loss": 0.0524,
"step": 19450
},
{
"epoch": 1.67973124300112,
"grad_norm": 1.5269631147384644,
"learning_rate": 3.5595329249617157e-06,
"loss": 0.051,
"step": 19500
},
{
"epoch": 1.6840382461883023,
"grad_norm": 1.2700409889221191,
"learning_rate": 3.5116768759571212e-06,
"loss": 0.0405,
"step": 19550
},
{
"epoch": 1.6883452493754847,
"grad_norm": 1.794728398323059,
"learning_rate": 3.463820826952527e-06,
"loss": 0.0514,
"step": 19600
},
{
"epoch": 1.692652252562667,
"grad_norm": 1.415767788887024,
"learning_rate": 3.415964777947933e-06,
"loss": 0.0524,
"step": 19650
},
{
"epoch": 1.6969592557498494,
"grad_norm": 2.1614432334899902,
"learning_rate": 3.3681087289433388e-06,
"loss": 0.0606,
"step": 19700
},
{
"epoch": 1.7012662589370318,
"grad_norm": 0.6388081908226013,
"learning_rate": 3.3202526799387448e-06,
"loss": 0.0566,
"step": 19750
},
{
"epoch": 1.7055732621242141,
"grad_norm": 0.44299671053886414,
"learning_rate": 3.2723966309341503e-06,
"loss": 0.0457,
"step": 19800
},
{
"epoch": 1.7098802653113965,
"grad_norm": 2.9245407581329346,
"learning_rate": 3.224540581929556e-06,
"loss": 0.0426,
"step": 19850
},
{
"epoch": 1.7141872684985788,
"grad_norm": 1.0790481567382812,
"learning_rate": 3.176684532924962e-06,
"loss": 0.0431,
"step": 19900
},
{
"epoch": 1.7184942716857612,
"grad_norm": 0.9610195159912109,
"learning_rate": 3.1288284839203675e-06,
"loss": 0.0465,
"step": 19950
},
{
"epoch": 1.7228012748729435,
"grad_norm": 1.2030363082885742,
"learning_rate": 3.080972434915774e-06,
"loss": 0.046,
"step": 20000
},
{
"epoch": 1.727108278060126,
"grad_norm": 1.5671857595443726,
"learning_rate": 3.0331163859111794e-06,
"loss": 0.0462,
"step": 20050
},
{
"epoch": 1.7314152812473083,
"grad_norm": 0.1325124055147171,
"learning_rate": 2.9852603369065854e-06,
"loss": 0.043,
"step": 20100
},
{
"epoch": 1.7357222844344906,
"grad_norm": 0.3264756500720978,
"learning_rate": 2.937404287901991e-06,
"loss": 0.0504,
"step": 20150
},
{
"epoch": 1.740029287621673,
"grad_norm": 1.5625221729278564,
"learning_rate": 2.8895482388973966e-06,
"loss": 0.0552,
"step": 20200
},
{
"epoch": 1.7443362908088553,
"grad_norm": 1.4193893671035767,
"learning_rate": 2.841692189892803e-06,
"loss": 0.0464,
"step": 20250
},
{
"epoch": 1.7486432939960377,
"grad_norm": 1.1241008043289185,
"learning_rate": 2.7938361408882086e-06,
"loss": 0.0451,
"step": 20300
},
{
"epoch": 1.75295029718322,
"grad_norm": 1.5155802965164185,
"learning_rate": 2.7459800918836145e-06,
"loss": 0.0469,
"step": 20350
},
{
"epoch": 1.7572573003704024,
"grad_norm": 1.757646918296814,
"learning_rate": 2.69812404287902e-06,
"loss": 0.0498,
"step": 20400
},
{
"epoch": 1.7615643035575848,
"grad_norm": 0.23753665387630463,
"learning_rate": 2.6502679938744257e-06,
"loss": 0.0468,
"step": 20450
},
{
"epoch": 1.765871306744767,
"grad_norm": 1.951041579246521,
"learning_rate": 2.6024119448698317e-06,
"loss": 0.0493,
"step": 20500
},
{
"epoch": 1.7701783099319495,
"grad_norm": 0.08196475356817245,
"learning_rate": 2.5545558958652377e-06,
"loss": 0.0397,
"step": 20550
},
{
"epoch": 1.7744853131191318,
"grad_norm": 1.0395424365997314,
"learning_rate": 2.5066998468606436e-06,
"loss": 0.0472,
"step": 20600
},
{
"epoch": 1.7787923163063142,
"grad_norm": 2.305999517440796,
"learning_rate": 2.4588437978560492e-06,
"loss": 0.0393,
"step": 20650
},
{
"epoch": 1.7830993194934965,
"grad_norm": 3.8407812118530273,
"learning_rate": 2.4109877488514548e-06,
"loss": 0.0446,
"step": 20700
},
{
"epoch": 1.787406322680679,
"grad_norm": 1.6644717454910278,
"learning_rate": 2.3631316998468608e-06,
"loss": 0.066,
"step": 20750
},
{
"epoch": 1.7917133258678613,
"grad_norm": 1.8919422626495361,
"learning_rate": 2.3152756508422668e-06,
"loss": 0.0495,
"step": 20800
},
{
"epoch": 1.7960203290550436,
"grad_norm": 0.3518455922603607,
"learning_rate": 2.2674196018376723e-06,
"loss": 0.039,
"step": 20850
},
{
"epoch": 1.800327332242226,
"grad_norm": 0.12189528346061707,
"learning_rate": 2.2195635528330783e-06,
"loss": 0.0561,
"step": 20900
},
{
"epoch": 1.8046343354294083,
"grad_norm": 0.3493580222129822,
"learning_rate": 2.171707503828484e-06,
"loss": 0.0456,
"step": 20950
},
{
"epoch": 1.8089413386165907,
"grad_norm": 2.5483009815216064,
"learning_rate": 2.12385145482389e-06,
"loss": 0.0441,
"step": 21000
},
{
"epoch": 1.813248341803773,
"grad_norm": 1.2866871356964111,
"learning_rate": 2.075995405819296e-06,
"loss": 0.0438,
"step": 21050
},
{
"epoch": 1.8175553449909554,
"grad_norm": 0.9496790170669556,
"learning_rate": 2.0281393568147014e-06,
"loss": 0.0546,
"step": 21100
},
{
"epoch": 1.8218623481781377,
"grad_norm": 1.427046537399292,
"learning_rate": 1.9802833078101074e-06,
"loss": 0.0596,
"step": 21150
},
{
"epoch": 1.82616935136532,
"grad_norm": 0.9176798462867737,
"learning_rate": 1.932427258805513e-06,
"loss": 0.0421,
"step": 21200
},
{
"epoch": 1.8304763545525025,
"grad_norm": 0.26487183570861816,
"learning_rate": 1.884571209800919e-06,
"loss": 0.0675,
"step": 21250
},
{
"epoch": 1.8347833577396848,
"grad_norm": 0.9805583953857422,
"learning_rate": 1.836715160796325e-06,
"loss": 0.0462,
"step": 21300
},
{
"epoch": 1.8390903609268672,
"grad_norm": 2.21886944770813,
"learning_rate": 1.7888591117917305e-06,
"loss": 0.0453,
"step": 21350
},
{
"epoch": 1.8433973641140495,
"grad_norm": 0.9027713537216187,
"learning_rate": 1.7410030627871363e-06,
"loss": 0.0523,
"step": 21400
},
{
"epoch": 1.8477043673012319,
"grad_norm": 1.9989973306655884,
"learning_rate": 1.6931470137825423e-06,
"loss": 0.0398,
"step": 21450
},
{
"epoch": 1.8520113704884142,
"grad_norm": 1.208064079284668,
"learning_rate": 1.645290964777948e-06,
"loss": 0.0586,
"step": 21500
},
{
"epoch": 1.8563183736755966,
"grad_norm": 0.7394013404846191,
"learning_rate": 1.5974349157733536e-06,
"loss": 0.0525,
"step": 21550
},
{
"epoch": 1.860625376862779,
"grad_norm": 0.4758661985397339,
"learning_rate": 1.5495788667687596e-06,
"loss": 0.0524,
"step": 21600
},
{
"epoch": 1.8649323800499613,
"grad_norm": 1.2074617147445679,
"learning_rate": 1.5017228177641654e-06,
"loss": 0.0533,
"step": 21650
},
{
"epoch": 1.8692393832371437,
"grad_norm": 0.33246493339538574,
"learning_rate": 1.4538667687595714e-06,
"loss": 0.0524,
"step": 21700
},
{
"epoch": 1.873546386424326,
"grad_norm": 1.7372647523880005,
"learning_rate": 1.4060107197549772e-06,
"loss": 0.0487,
"step": 21750
},
{
"epoch": 1.8778533896115084,
"grad_norm": 0.39233338832855225,
"learning_rate": 1.358154670750383e-06,
"loss": 0.0421,
"step": 21800
},
{
"epoch": 1.8821603927986907,
"grad_norm": 0.8911519646644592,
"learning_rate": 1.310298621745789e-06,
"loss": 0.0453,
"step": 21850
},
{
"epoch": 1.886467395985873,
"grad_norm": 1.263708233833313,
"learning_rate": 1.2624425727411945e-06,
"loss": 0.046,
"step": 21900
},
{
"epoch": 1.8907743991730555,
"grad_norm": 0.7938653826713562,
"learning_rate": 1.2145865237366005e-06,
"loss": 0.041,
"step": 21950
},
{
"epoch": 1.8950814023602378,
"grad_norm": 3.179079294204712,
"learning_rate": 1.166730474732006e-06,
"loss": 0.0501,
"step": 22000
},
{
"epoch": 1.8993884055474202,
"grad_norm": 2.3698971271514893,
"learning_rate": 1.118874425727412e-06,
"loss": 0.0479,
"step": 22050
},
{
"epoch": 1.9036954087346025,
"grad_norm": 0.8952911496162415,
"learning_rate": 1.0710183767228178e-06,
"loss": 0.0412,
"step": 22100
},
{
"epoch": 1.9080024119217849,
"grad_norm": 1.1281505823135376,
"learning_rate": 1.0231623277182236e-06,
"loss": 0.0427,
"step": 22150
},
{
"epoch": 1.9123094151089672,
"grad_norm": 2.677870273590088,
"learning_rate": 9.753062787136294e-07,
"loss": 0.0469,
"step": 22200
},
{
"epoch": 1.9166164182961496,
"grad_norm": 0.686228334903717,
"learning_rate": 9.274502297090353e-07,
"loss": 0.0415,
"step": 22250
},
{
"epoch": 1.920923421483332,
"grad_norm": 1.754647135734558,
"learning_rate": 8.795941807044412e-07,
"loss": 0.044,
"step": 22300
},
{
"epoch": 1.9252304246705143,
"grad_norm": 0.9961816668510437,
"learning_rate": 8.31738131699847e-07,
"loss": 0.0541,
"step": 22350
},
{
"epoch": 1.9295374278576967,
"grad_norm": 2.1705892086029053,
"learning_rate": 7.838820826952527e-07,
"loss": 0.0435,
"step": 22400
},
{
"epoch": 1.933844431044879,
"grad_norm": 1.982060194015503,
"learning_rate": 7.360260336906586e-07,
"loss": 0.0434,
"step": 22450
},
{
"epoch": 1.9381514342320614,
"grad_norm": 1.5377906560897827,
"learning_rate": 6.881699846860644e-07,
"loss": 0.0416,
"step": 22500
},
{
"epoch": 1.9424584374192437,
"grad_norm": 1.7990597486495972,
"learning_rate": 6.403139356814702e-07,
"loss": 0.0429,
"step": 22550
},
{
"epoch": 1.946765440606426,
"grad_norm": 1.0789055824279785,
"learning_rate": 5.924578866768759e-07,
"loss": 0.0498,
"step": 22600
},
{
"epoch": 1.9510724437936084,
"grad_norm": 1.073246717453003,
"learning_rate": 5.446018376722818e-07,
"loss": 0.0526,
"step": 22650
},
{
"epoch": 1.9553794469807908,
"grad_norm": 0.3440791964530945,
"learning_rate": 4.967457886676876e-07,
"loss": 0.0447,
"step": 22700
},
{
"epoch": 1.9596864501679732,
"grad_norm": 1.1172747611999512,
"learning_rate": 4.488897396630935e-07,
"loss": 0.051,
"step": 22750
},
{
"epoch": 1.9639934533551555,
"grad_norm": 0.6673486232757568,
"learning_rate": 4.0103369065849927e-07,
"loss": 0.0435,
"step": 22800
},
{
"epoch": 1.9683004565423379,
"grad_norm": 0.4096444249153137,
"learning_rate": 3.531776416539051e-07,
"loss": 0.0358,
"step": 22850
},
{
"epoch": 1.9726074597295202,
"grad_norm": 0.7605034708976746,
"learning_rate": 3.053215926493109e-07,
"loss": 0.0484,
"step": 22900
},
{
"epoch": 1.9769144629167026,
"grad_norm": 0.8487497568130493,
"learning_rate": 2.574655436447167e-07,
"loss": 0.0498,
"step": 22950
},
{
"epoch": 1.981221466103885,
"grad_norm": 0.12532584369182587,
"learning_rate": 2.0960949464012254e-07,
"loss": 0.0501,
"step": 23000
},
{
"epoch": 1.9855284692910673,
"grad_norm": 0.635976254940033,
"learning_rate": 1.6175344563552835e-07,
"loss": 0.044,
"step": 23050
},
{
"epoch": 1.9898354724782497,
"grad_norm": 1.429021954536438,
"learning_rate": 1.1389739663093415e-07,
"loss": 0.0446,
"step": 23100
},
{
"epoch": 1.994142475665432,
"grad_norm": 1.3071976900100708,
"learning_rate": 6.604134762633997e-08,
"loss": 0.045,
"step": 23150
},
{
"epoch": 1.9984494788526144,
"grad_norm": 1.3366061449050903,
"learning_rate": 1.818529862174579e-08,
"loss": 0.0455,
"step": 23200
}
],
"logging_steps": 50,
"max_steps": 23218,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.954863372257956e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}