ChineseErrorCorrector-32B-LORA / trainer_state.json
twnlp's picture
Upload trainer_state.json
03b9c07 verified
{
"best_metric": 0.03919154778122902,
"best_model_checkpoint": null,
"epoch": 8.378016085790884,
"eval_steps": 10000,
"global_step": 50000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008378016085790885,
"grad_norm": 0.1641591489315033,
"learning_rate": 2e-05,
"loss": 0.2693,
"step": 50
},
{
"epoch": 0.01675603217158177,
"grad_norm": 0.12203536182641983,
"learning_rate": 1.99832299178266e-05,
"loss": 0.1092,
"step": 100
},
{
"epoch": 0.025134048257372654,
"grad_norm": 0.1460294872522354,
"learning_rate": 1.9966459835653196e-05,
"loss": 0.0971,
"step": 150
},
{
"epoch": 0.03351206434316354,
"grad_norm": 0.18384236097335815,
"learning_rate": 1.9949689753479794e-05,
"loss": 0.0891,
"step": 200
},
{
"epoch": 0.041890080428954424,
"grad_norm": 0.14532588422298431,
"learning_rate": 1.993291967130639e-05,
"loss": 0.0825,
"step": 250
},
{
"epoch": 0.05026809651474531,
"grad_norm": 0.1657487004995346,
"learning_rate": 1.991614958913299e-05,
"loss": 0.085,
"step": 300
},
{
"epoch": 0.05864611260053619,
"grad_norm": 0.16286590695381165,
"learning_rate": 1.9899379506959587e-05,
"loss": 0.074,
"step": 350
},
{
"epoch": 0.06702412868632708,
"grad_norm": 0.17318418622016907,
"learning_rate": 1.9882609424786182e-05,
"loss": 0.0723,
"step": 400
},
{
"epoch": 0.07540214477211796,
"grad_norm": 0.20146086812019348,
"learning_rate": 1.986583934261278e-05,
"loss": 0.0726,
"step": 450
},
{
"epoch": 0.08378016085790885,
"grad_norm": 0.25428423285484314,
"learning_rate": 1.9849069260439376e-05,
"loss": 0.0745,
"step": 500
},
{
"epoch": 0.09215817694369974,
"grad_norm": 0.25297069549560547,
"learning_rate": 1.9832299178265975e-05,
"loss": 0.0686,
"step": 550
},
{
"epoch": 0.10053619302949061,
"grad_norm": 0.24197077751159668,
"learning_rate": 1.981552909609257e-05,
"loss": 0.0689,
"step": 600
},
{
"epoch": 0.1089142091152815,
"grad_norm": 0.2025458812713623,
"learning_rate": 1.979875901391917e-05,
"loss": 0.064,
"step": 650
},
{
"epoch": 0.11729222520107238,
"grad_norm": 0.2370821088552475,
"learning_rate": 1.9781988931745768e-05,
"loss": 0.0661,
"step": 700
},
{
"epoch": 0.12567024128686327,
"grad_norm": 0.23466931283473969,
"learning_rate": 1.9765218849572367e-05,
"loss": 0.0628,
"step": 750
},
{
"epoch": 0.13404825737265416,
"grad_norm": 0.27670082449913025,
"learning_rate": 1.9748448767398962e-05,
"loss": 0.0608,
"step": 800
},
{
"epoch": 0.14242627345844505,
"grad_norm": 0.25532266497612,
"learning_rate": 1.973167868522556e-05,
"loss": 0.0578,
"step": 850
},
{
"epoch": 0.15080428954423591,
"grad_norm": 0.2733491063117981,
"learning_rate": 1.9714908603052156e-05,
"loss": 0.0586,
"step": 900
},
{
"epoch": 0.1591823056300268,
"grad_norm": 0.31975990533828735,
"learning_rate": 1.9698138520878755e-05,
"loss": 0.063,
"step": 950
},
{
"epoch": 0.1675603217158177,
"grad_norm": 0.2980721592903137,
"learning_rate": 1.968136843870535e-05,
"loss": 0.0615,
"step": 1000
},
{
"epoch": 0.17593833780160859,
"grad_norm": 0.2662040591239929,
"learning_rate": 1.966459835653195e-05,
"loss": 0.0573,
"step": 1050
},
{
"epoch": 0.18431635388739948,
"grad_norm": 0.24934068322181702,
"learning_rate": 1.9647828274358547e-05,
"loss": 0.0578,
"step": 1100
},
{
"epoch": 0.19269436997319034,
"grad_norm": 0.35513001680374146,
"learning_rate": 1.9631058192185143e-05,
"loss": 0.0567,
"step": 1150
},
{
"epoch": 0.20107238605898123,
"grad_norm": 0.2941363453865051,
"learning_rate": 1.961428811001174e-05,
"loss": 0.0613,
"step": 1200
},
{
"epoch": 0.20945040214477212,
"grad_norm": 0.2334873378276825,
"learning_rate": 1.9597518027838337e-05,
"loss": 0.0564,
"step": 1250
},
{
"epoch": 0.217828418230563,
"grad_norm": 0.3162507116794586,
"learning_rate": 1.9580747945664935e-05,
"loss": 0.0546,
"step": 1300
},
{
"epoch": 0.2262064343163539,
"grad_norm": 0.3287353217601776,
"learning_rate": 1.956397786349153e-05,
"loss": 0.0561,
"step": 1350
},
{
"epoch": 0.23458445040214476,
"grad_norm": 0.34116727113723755,
"learning_rate": 1.954720778131813e-05,
"loss": 0.0541,
"step": 1400
},
{
"epoch": 0.24296246648793565,
"grad_norm": 0.2549584209918976,
"learning_rate": 1.9530437699144725e-05,
"loss": 0.0512,
"step": 1450
},
{
"epoch": 0.25134048257372654,
"grad_norm": 0.2564306855201721,
"learning_rate": 1.9513667616971327e-05,
"loss": 0.053,
"step": 1500
},
{
"epoch": 0.2597184986595174,
"grad_norm": 0.39897748827934265,
"learning_rate": 1.9496897534797922e-05,
"loss": 0.0499,
"step": 1550
},
{
"epoch": 0.2680965147453083,
"grad_norm": 0.3399379849433899,
"learning_rate": 1.948012745262452e-05,
"loss": 0.0527,
"step": 1600
},
{
"epoch": 0.2764745308310992,
"grad_norm": 0.3706755042076111,
"learning_rate": 1.9463357370451116e-05,
"loss": 0.0522,
"step": 1650
},
{
"epoch": 0.2848525469168901,
"grad_norm": 0.3208563029766083,
"learning_rate": 1.9446587288277715e-05,
"loss": 0.0502,
"step": 1700
},
{
"epoch": 0.29323056300268097,
"grad_norm": 0.30643364787101746,
"learning_rate": 1.942981720610431e-05,
"loss": 0.0517,
"step": 1750
},
{
"epoch": 0.30160857908847183,
"grad_norm": 0.28462880849838257,
"learning_rate": 1.941304712393091e-05,
"loss": 0.0483,
"step": 1800
},
{
"epoch": 0.30998659517426275,
"grad_norm": 0.46007809042930603,
"learning_rate": 1.9396277041757504e-05,
"loss": 0.0534,
"step": 1850
},
{
"epoch": 0.3183646112600536,
"grad_norm": 0.27532362937927246,
"learning_rate": 1.9379506959584103e-05,
"loss": 0.049,
"step": 1900
},
{
"epoch": 0.3267426273458445,
"grad_norm": 0.3934316337108612,
"learning_rate": 1.93627368774107e-05,
"loss": 0.0504,
"step": 1950
},
{
"epoch": 0.3351206434316354,
"grad_norm": 0.38043123483657837,
"learning_rate": 1.93459667952373e-05,
"loss": 0.0474,
"step": 2000
},
{
"epoch": 0.34349865951742625,
"grad_norm": 0.33170923590660095,
"learning_rate": 1.9329196713063896e-05,
"loss": 0.0474,
"step": 2050
},
{
"epoch": 0.35187667560321717,
"grad_norm": 0.34464696049690247,
"learning_rate": 1.9312426630890494e-05,
"loss": 0.0494,
"step": 2100
},
{
"epoch": 0.36025469168900803,
"grad_norm": 0.33678779006004333,
"learning_rate": 1.929565654871709e-05,
"loss": 0.0465,
"step": 2150
},
{
"epoch": 0.36863270777479895,
"grad_norm": 0.34634217619895935,
"learning_rate": 1.927888646654369e-05,
"loss": 0.046,
"step": 2200
},
{
"epoch": 0.3770107238605898,
"grad_norm": 0.48531678318977356,
"learning_rate": 1.9262116384370284e-05,
"loss": 0.0463,
"step": 2250
},
{
"epoch": 0.3853887399463807,
"grad_norm": 0.33334800601005554,
"learning_rate": 1.9245346302196882e-05,
"loss": 0.047,
"step": 2300
},
{
"epoch": 0.3937667560321716,
"grad_norm": 0.4677096903324127,
"learning_rate": 1.922857622002348e-05,
"loss": 0.0474,
"step": 2350
},
{
"epoch": 0.40214477211796246,
"grad_norm": 0.35066741704940796,
"learning_rate": 1.9211806137850076e-05,
"loss": 0.0449,
"step": 2400
},
{
"epoch": 0.4105227882037534,
"grad_norm": 0.3857254087924957,
"learning_rate": 1.9195036055676675e-05,
"loss": 0.0453,
"step": 2450
},
{
"epoch": 0.41890080428954424,
"grad_norm": 0.36052629351615906,
"learning_rate": 1.917826597350327e-05,
"loss": 0.0469,
"step": 2500
},
{
"epoch": 0.4272788203753351,
"grad_norm": 0.3650895059108734,
"learning_rate": 1.916149589132987e-05,
"loss": 0.0483,
"step": 2550
},
{
"epoch": 0.435656836461126,
"grad_norm": 0.34670376777648926,
"learning_rate": 1.9144725809156465e-05,
"loss": 0.0449,
"step": 2600
},
{
"epoch": 0.4440348525469169,
"grad_norm": 0.36593642830848694,
"learning_rate": 1.9127955726983063e-05,
"loss": 0.0449,
"step": 2650
},
{
"epoch": 0.4524128686327078,
"grad_norm": 0.31553247570991516,
"learning_rate": 1.9111185644809662e-05,
"loss": 0.0448,
"step": 2700
},
{
"epoch": 0.46079088471849866,
"grad_norm": 0.30997416377067566,
"learning_rate": 1.909441556263626e-05,
"loss": 0.0488,
"step": 2750
},
{
"epoch": 0.4691689008042895,
"grad_norm": 0.4204448461532593,
"learning_rate": 1.9077645480462856e-05,
"loss": 0.0443,
"step": 2800
},
{
"epoch": 0.47754691689008044,
"grad_norm": 0.36868560314178467,
"learning_rate": 1.9060875398289455e-05,
"loss": 0.0428,
"step": 2850
},
{
"epoch": 0.4859249329758713,
"grad_norm": 0.37285274267196655,
"learning_rate": 1.904410531611605e-05,
"loss": 0.0439,
"step": 2900
},
{
"epoch": 0.4943029490616622,
"grad_norm": 0.4258297085762024,
"learning_rate": 1.902733523394265e-05,
"loss": 0.0446,
"step": 2950
},
{
"epoch": 0.5026809651474531,
"grad_norm": 0.34184491634368896,
"learning_rate": 1.9010565151769244e-05,
"loss": 0.0399,
"step": 3000
},
{
"epoch": 0.511058981233244,
"grad_norm": 0.404744029045105,
"learning_rate": 1.8993795069595843e-05,
"loss": 0.0434,
"step": 3050
},
{
"epoch": 0.5194369973190348,
"grad_norm": 0.31526079773902893,
"learning_rate": 1.8977024987422438e-05,
"loss": 0.042,
"step": 3100
},
{
"epoch": 0.5278150134048257,
"grad_norm": 0.39627355337142944,
"learning_rate": 1.8960254905249037e-05,
"loss": 0.0418,
"step": 3150
},
{
"epoch": 0.5361930294906166,
"grad_norm": 0.39220544695854187,
"learning_rate": 1.8943484823075635e-05,
"loss": 0.0434,
"step": 3200
},
{
"epoch": 0.5445710455764075,
"grad_norm": 0.4202696979045868,
"learning_rate": 1.892671474090223e-05,
"loss": 0.0453,
"step": 3250
},
{
"epoch": 0.5529490616621984,
"grad_norm": 0.31564274430274963,
"learning_rate": 1.890994465872883e-05,
"loss": 0.0423,
"step": 3300
},
{
"epoch": 0.5613270777479893,
"grad_norm": 0.43861642479896545,
"learning_rate": 1.8893174576555425e-05,
"loss": 0.0432,
"step": 3350
},
{
"epoch": 0.5697050938337802,
"grad_norm": 0.41774672269821167,
"learning_rate": 1.8876404494382024e-05,
"loss": 0.0424,
"step": 3400
},
{
"epoch": 0.578083109919571,
"grad_norm": 0.44408470392227173,
"learning_rate": 1.8859634412208622e-05,
"loss": 0.0393,
"step": 3450
},
{
"epoch": 0.5864611260053619,
"grad_norm": 0.5111362338066101,
"learning_rate": 1.8842864330035218e-05,
"loss": 0.0402,
"step": 3500
},
{
"epoch": 0.5948391420911529,
"grad_norm": 0.48010021448135376,
"learning_rate": 1.8826094247861816e-05,
"loss": 0.0403,
"step": 3550
},
{
"epoch": 0.6032171581769437,
"grad_norm": 0.41536250710487366,
"learning_rate": 1.8809324165688415e-05,
"loss": 0.0441,
"step": 3600
},
{
"epoch": 0.6115951742627346,
"grad_norm": 0.40686219930648804,
"learning_rate": 1.879255408351501e-05,
"loss": 0.0408,
"step": 3650
},
{
"epoch": 0.6199731903485255,
"grad_norm": 0.4435434937477112,
"learning_rate": 1.877578400134161e-05,
"loss": 0.0434,
"step": 3700
},
{
"epoch": 0.6283512064343163,
"grad_norm": 0.4401046633720398,
"learning_rate": 1.8759013919168204e-05,
"loss": 0.042,
"step": 3750
},
{
"epoch": 0.6367292225201072,
"grad_norm": 0.40911954641342163,
"learning_rate": 1.8742243836994803e-05,
"loss": 0.0413,
"step": 3800
},
{
"epoch": 0.6451072386058981,
"grad_norm": 0.490383118391037,
"learning_rate": 1.87254737548214e-05,
"loss": 0.0423,
"step": 3850
},
{
"epoch": 0.653485254691689,
"grad_norm": 0.4375227093696594,
"learning_rate": 1.8708703672647997e-05,
"loss": 0.0386,
"step": 3900
},
{
"epoch": 0.6618632707774799,
"grad_norm": 0.336227685213089,
"learning_rate": 1.8691933590474596e-05,
"loss": 0.042,
"step": 3950
},
{
"epoch": 0.6702412868632708,
"grad_norm": 0.5190924406051636,
"learning_rate": 1.8675163508301194e-05,
"loss": 0.0405,
"step": 4000
},
{
"epoch": 0.6786193029490617,
"grad_norm": 0.3751809298992157,
"learning_rate": 1.865839342612779e-05,
"loss": 0.0383,
"step": 4050
},
{
"epoch": 0.6869973190348525,
"grad_norm": 0.34148427844047546,
"learning_rate": 1.864162334395439e-05,
"loss": 0.0388,
"step": 4100
},
{
"epoch": 0.6953753351206434,
"grad_norm": 0.5154247879981995,
"learning_rate": 1.8624853261780984e-05,
"loss": 0.0392,
"step": 4150
},
{
"epoch": 0.7037533512064343,
"grad_norm": 0.3212796151638031,
"learning_rate": 1.8608083179607583e-05,
"loss": 0.0397,
"step": 4200
},
{
"epoch": 0.7121313672922251,
"grad_norm": 0.3693840503692627,
"learning_rate": 1.8591313097434178e-05,
"loss": 0.0399,
"step": 4250
},
{
"epoch": 0.7205093833780161,
"grad_norm": 0.384682297706604,
"learning_rate": 1.8574543015260777e-05,
"loss": 0.0394,
"step": 4300
},
{
"epoch": 0.728887399463807,
"grad_norm": 0.5106825828552246,
"learning_rate": 1.8557772933087375e-05,
"loss": 0.0376,
"step": 4350
},
{
"epoch": 0.7372654155495979,
"grad_norm": 0.5798951983451843,
"learning_rate": 1.854100285091397e-05,
"loss": 0.0384,
"step": 4400
},
{
"epoch": 0.7456434316353887,
"grad_norm": 0.4215037226676941,
"learning_rate": 1.852423276874057e-05,
"loss": 0.0371,
"step": 4450
},
{
"epoch": 0.7540214477211796,
"grad_norm": 0.41392162442207336,
"learning_rate": 1.8507462686567165e-05,
"loss": 0.0395,
"step": 4500
},
{
"epoch": 0.7623994638069705,
"grad_norm": 0.38111090660095215,
"learning_rate": 1.8490692604393763e-05,
"loss": 0.0397,
"step": 4550
},
{
"epoch": 0.7707774798927614,
"grad_norm": 0.5323607325553894,
"learning_rate": 1.847392252222036e-05,
"loss": 0.0389,
"step": 4600
},
{
"epoch": 0.7791554959785523,
"grad_norm": 0.3730742335319519,
"learning_rate": 1.8457152440046957e-05,
"loss": 0.037,
"step": 4650
},
{
"epoch": 0.7875335120643432,
"grad_norm": 0.5167490243911743,
"learning_rate": 1.8440382357873553e-05,
"loss": 0.0398,
"step": 4700
},
{
"epoch": 0.795911528150134,
"grad_norm": 0.3720487356185913,
"learning_rate": 1.8423612275700155e-05,
"loss": 0.0353,
"step": 4750
},
{
"epoch": 0.8042895442359249,
"grad_norm": 0.49233752489089966,
"learning_rate": 1.840684219352675e-05,
"loss": 0.0375,
"step": 4800
},
{
"epoch": 0.8126675603217158,
"grad_norm": 0.35151785612106323,
"learning_rate": 1.839007211135335e-05,
"loss": 0.0367,
"step": 4850
},
{
"epoch": 0.8210455764075067,
"grad_norm": 0.4015248119831085,
"learning_rate": 1.8373302029179944e-05,
"loss": 0.0402,
"step": 4900
},
{
"epoch": 0.8294235924932976,
"grad_norm": 0.3868032395839691,
"learning_rate": 1.8356531947006543e-05,
"loss": 0.0364,
"step": 4950
},
{
"epoch": 0.8378016085790885,
"grad_norm": 0.3618241250514984,
"learning_rate": 1.8339761864833138e-05,
"loss": 0.0365,
"step": 5000
},
{
"epoch": 0.8461796246648794,
"grad_norm": 0.4246107041835785,
"learning_rate": 1.8322991782659737e-05,
"loss": 0.0383,
"step": 5050
},
{
"epoch": 0.8545576407506702,
"grad_norm": 0.4502660632133484,
"learning_rate": 1.8306221700486332e-05,
"loss": 0.0384,
"step": 5100
},
{
"epoch": 0.8629356568364611,
"grad_norm": 0.38021931052207947,
"learning_rate": 1.828945161831293e-05,
"loss": 0.036,
"step": 5150
},
{
"epoch": 0.871313672922252,
"grad_norm": 0.49084368348121643,
"learning_rate": 1.827268153613953e-05,
"loss": 0.0403,
"step": 5200
},
{
"epoch": 0.8796916890080428,
"grad_norm": 0.4013173282146454,
"learning_rate": 1.8255911453966125e-05,
"loss": 0.038,
"step": 5250
},
{
"epoch": 0.8880697050938338,
"grad_norm": 0.4591931700706482,
"learning_rate": 1.8239141371792724e-05,
"loss": 0.0345,
"step": 5300
},
{
"epoch": 0.8964477211796247,
"grad_norm": 0.3261602520942688,
"learning_rate": 1.822237128961932e-05,
"loss": 0.0371,
"step": 5350
},
{
"epoch": 0.9048257372654156,
"grad_norm": 0.5109397172927856,
"learning_rate": 1.8205601207445918e-05,
"loss": 0.0352,
"step": 5400
},
{
"epoch": 0.9132037533512064,
"grad_norm": 0.4951651990413666,
"learning_rate": 1.8188831125272516e-05,
"loss": 0.0384,
"step": 5450
},
{
"epoch": 0.9215817694369973,
"grad_norm": 0.2933291494846344,
"learning_rate": 1.817206104309911e-05,
"loss": 0.0367,
"step": 5500
},
{
"epoch": 0.9299597855227882,
"grad_norm": 0.518692135810852,
"learning_rate": 1.815529096092571e-05,
"loss": 0.0362,
"step": 5550
},
{
"epoch": 0.938337801608579,
"grad_norm": 0.4825911223888397,
"learning_rate": 1.813852087875231e-05,
"loss": 0.0363,
"step": 5600
},
{
"epoch": 0.94671581769437,
"grad_norm": 0.45922228693962097,
"learning_rate": 1.8121750796578904e-05,
"loss": 0.0381,
"step": 5650
},
{
"epoch": 0.9550938337801609,
"grad_norm": 0.3230240046977997,
"learning_rate": 1.8104980714405503e-05,
"loss": 0.0342,
"step": 5700
},
{
"epoch": 0.9634718498659517,
"grad_norm": 0.3606482744216919,
"learning_rate": 1.80882106322321e-05,
"loss": 0.0355,
"step": 5750
},
{
"epoch": 0.9718498659517426,
"grad_norm": 0.4341330826282501,
"learning_rate": 1.8071440550058697e-05,
"loss": 0.037,
"step": 5800
},
{
"epoch": 0.9802278820375335,
"grad_norm": 0.42356178164482117,
"learning_rate": 1.8054670467885292e-05,
"loss": 0.0353,
"step": 5850
},
{
"epoch": 0.9886058981233244,
"grad_norm": 0.39021754264831543,
"learning_rate": 1.803790038571189e-05,
"loss": 0.0352,
"step": 5900
},
{
"epoch": 0.9969839142091153,
"grad_norm": 0.29827752709388733,
"learning_rate": 1.802113030353849e-05,
"loss": 0.0355,
"step": 5950
},
{
"epoch": 1.0053619302949062,
"grad_norm": 0.38858547806739807,
"learning_rate": 1.800436022136509e-05,
"loss": 0.0278,
"step": 6000
},
{
"epoch": 1.013739946380697,
"grad_norm": 0.2972586452960968,
"learning_rate": 1.7987590139191684e-05,
"loss": 0.0236,
"step": 6050
},
{
"epoch": 1.022117962466488,
"grad_norm": 0.36482104659080505,
"learning_rate": 1.7970820057018283e-05,
"loss": 0.0251,
"step": 6100
},
{
"epoch": 1.030495978552279,
"grad_norm": 0.37719279527664185,
"learning_rate": 1.7954049974844878e-05,
"loss": 0.023,
"step": 6150
},
{
"epoch": 1.0388739946380696,
"grad_norm": 0.4385906457901001,
"learning_rate": 1.7937279892671477e-05,
"loss": 0.0248,
"step": 6200
},
{
"epoch": 1.0472520107238605,
"grad_norm": 0.508695662021637,
"learning_rate": 1.7920509810498072e-05,
"loss": 0.0254,
"step": 6250
},
{
"epoch": 1.0556300268096515,
"grad_norm": 0.36647507548332214,
"learning_rate": 1.790373972832467e-05,
"loss": 0.0254,
"step": 6300
},
{
"epoch": 1.0640080428954424,
"grad_norm": 0.4308232069015503,
"learning_rate": 1.7886969646151266e-05,
"loss": 0.0226,
"step": 6350
},
{
"epoch": 1.0723860589812333,
"grad_norm": 0.3477235734462738,
"learning_rate": 1.7870199563977865e-05,
"loss": 0.0235,
"step": 6400
},
{
"epoch": 1.0807640750670242,
"grad_norm": 0.45611080527305603,
"learning_rate": 1.7853429481804463e-05,
"loss": 0.0255,
"step": 6450
},
{
"epoch": 1.089142091152815,
"grad_norm": 0.41645970940589905,
"learning_rate": 1.783665939963106e-05,
"loss": 0.0236,
"step": 6500
},
{
"epoch": 1.0975201072386058,
"grad_norm": 0.5107206702232361,
"learning_rate": 1.7819889317457657e-05,
"loss": 0.0243,
"step": 6550
},
{
"epoch": 1.1058981233243967,
"grad_norm": 0.37085390090942383,
"learning_rate": 1.7803119235284253e-05,
"loss": 0.0231,
"step": 6600
},
{
"epoch": 1.1142761394101877,
"grad_norm": 0.3882488012313843,
"learning_rate": 1.778634915311085e-05,
"loss": 0.0226,
"step": 6650
},
{
"epoch": 1.1226541554959786,
"grad_norm": 0.3104082942008972,
"learning_rate": 1.7769579070937447e-05,
"loss": 0.0243,
"step": 6700
},
{
"epoch": 1.1310321715817695,
"grad_norm": 0.500109076499939,
"learning_rate": 1.7752808988764045e-05,
"loss": 0.0245,
"step": 6750
},
{
"epoch": 1.1394101876675604,
"grad_norm": 0.6070294380187988,
"learning_rate": 1.7736038906590644e-05,
"loss": 0.024,
"step": 6800
},
{
"epoch": 1.147788203753351,
"grad_norm": 0.4429844915866852,
"learning_rate": 1.7719268824417243e-05,
"loss": 0.024,
"step": 6850
},
{
"epoch": 1.156166219839142,
"grad_norm": 0.532455563545227,
"learning_rate": 1.7702498742243838e-05,
"loss": 0.0231,
"step": 6900
},
{
"epoch": 1.164544235924933,
"grad_norm": 0.4723723828792572,
"learning_rate": 1.7685728660070437e-05,
"loss": 0.0227,
"step": 6950
},
{
"epoch": 1.1729222520107239,
"grad_norm": 0.40511298179626465,
"learning_rate": 1.7668958577897032e-05,
"loss": 0.0228,
"step": 7000
},
{
"epoch": 1.1813002680965148,
"grad_norm": 0.4623141884803772,
"learning_rate": 1.765218849572363e-05,
"loss": 0.0213,
"step": 7050
},
{
"epoch": 1.1896782841823057,
"grad_norm": 0.5076983571052551,
"learning_rate": 1.7635418413550226e-05,
"loss": 0.0247,
"step": 7100
},
{
"epoch": 1.1980563002680964,
"grad_norm": 0.438363641500473,
"learning_rate": 1.7618648331376825e-05,
"loss": 0.0247,
"step": 7150
},
{
"epoch": 1.2064343163538873,
"grad_norm": 0.4427433907985687,
"learning_rate": 1.7601878249203424e-05,
"loss": 0.0266,
"step": 7200
},
{
"epoch": 1.2148123324396782,
"grad_norm": 0.4235341548919678,
"learning_rate": 1.758510816703002e-05,
"loss": 0.0249,
"step": 7250
},
{
"epoch": 1.2231903485254692,
"grad_norm": 0.3872547149658203,
"learning_rate": 1.7568338084856618e-05,
"loss": 0.0241,
"step": 7300
},
{
"epoch": 1.23156836461126,
"grad_norm": 0.4646087884902954,
"learning_rate": 1.7551568002683216e-05,
"loss": 0.0239,
"step": 7350
},
{
"epoch": 1.239946380697051,
"grad_norm": 0.3509279489517212,
"learning_rate": 1.753479792050981e-05,
"loss": 0.0247,
"step": 7400
},
{
"epoch": 1.248324396782842,
"grad_norm": 0.5066854357719421,
"learning_rate": 1.751802783833641e-05,
"loss": 0.0223,
"step": 7450
},
{
"epoch": 1.2567024128686328,
"grad_norm": 0.5363894104957581,
"learning_rate": 1.7501257756163006e-05,
"loss": 0.0231,
"step": 7500
},
{
"epoch": 1.2650804289544235,
"grad_norm": 0.6059328317642212,
"learning_rate": 1.7484487673989604e-05,
"loss": 0.0247,
"step": 7550
},
{
"epoch": 1.2734584450402144,
"grad_norm": 0.458574503660202,
"learning_rate": 1.7467717591816203e-05,
"loss": 0.0252,
"step": 7600
},
{
"epoch": 1.2818364611260054,
"grad_norm": 0.4931676983833313,
"learning_rate": 1.74509475096428e-05,
"loss": 0.0242,
"step": 7650
},
{
"epoch": 1.2902144772117963,
"grad_norm": 0.4417158365249634,
"learning_rate": 1.7434177427469397e-05,
"loss": 0.0245,
"step": 7700
},
{
"epoch": 1.2985924932975872,
"grad_norm": 0.3258965015411377,
"learning_rate": 1.7417407345295992e-05,
"loss": 0.0254,
"step": 7750
},
{
"epoch": 1.3069705093833779,
"grad_norm": 0.4492965638637543,
"learning_rate": 1.740063726312259e-05,
"loss": 0.0247,
"step": 7800
},
{
"epoch": 1.3153485254691688,
"grad_norm": 0.4415794312953949,
"learning_rate": 1.7383867180949186e-05,
"loss": 0.0243,
"step": 7850
},
{
"epoch": 1.3237265415549597,
"grad_norm": 0.5353983044624329,
"learning_rate": 1.7367097098775785e-05,
"loss": 0.0221,
"step": 7900
},
{
"epoch": 1.3321045576407506,
"grad_norm": 0.5296221375465393,
"learning_rate": 1.735032701660238e-05,
"loss": 0.0241,
"step": 7950
},
{
"epoch": 1.3404825737265416,
"grad_norm": 0.5558563470840454,
"learning_rate": 1.7333556934428983e-05,
"loss": 0.0278,
"step": 8000
},
{
"epoch": 1.3488605898123325,
"grad_norm": 0.5353667736053467,
"learning_rate": 1.7316786852255578e-05,
"loss": 0.0221,
"step": 8050
},
{
"epoch": 1.3572386058981234,
"grad_norm": 0.47603583335876465,
"learning_rate": 1.7300016770082177e-05,
"loss": 0.0241,
"step": 8100
},
{
"epoch": 1.3656166219839143,
"grad_norm": 0.5160461068153381,
"learning_rate": 1.7283246687908772e-05,
"loss": 0.0237,
"step": 8150
},
{
"epoch": 1.3739946380697052,
"grad_norm": 0.5242166519165039,
"learning_rate": 1.726647660573537e-05,
"loss": 0.0241,
"step": 8200
},
{
"epoch": 1.382372654155496,
"grad_norm": 0.4098646342754364,
"learning_rate": 1.7249706523561966e-05,
"loss": 0.0237,
"step": 8250
},
{
"epoch": 1.3907506702412868,
"grad_norm": 0.488899290561676,
"learning_rate": 1.7232936441388565e-05,
"loss": 0.024,
"step": 8300
},
{
"epoch": 1.3991286863270778,
"grad_norm": 0.4955669641494751,
"learning_rate": 1.721616635921516e-05,
"loss": 0.0233,
"step": 8350
},
{
"epoch": 1.4075067024128687,
"grad_norm": 0.4925636053085327,
"learning_rate": 1.719939627704176e-05,
"loss": 0.0241,
"step": 8400
},
{
"epoch": 1.4158847184986596,
"grad_norm": 0.4332300126552582,
"learning_rate": 1.7182626194868357e-05,
"loss": 0.023,
"step": 8450
},
{
"epoch": 1.4242627345844503,
"grad_norm": 0.5092645883560181,
"learning_rate": 1.7165856112694953e-05,
"loss": 0.0247,
"step": 8500
},
{
"epoch": 1.4326407506702412,
"grad_norm": 0.4245849847793579,
"learning_rate": 1.714908603052155e-05,
"loss": 0.0229,
"step": 8550
},
{
"epoch": 1.4410187667560321,
"grad_norm": 0.5392746329307556,
"learning_rate": 1.7132315948348147e-05,
"loss": 0.0243,
"step": 8600
},
{
"epoch": 1.449396782841823,
"grad_norm": 0.4076955020427704,
"learning_rate": 1.7115545866174745e-05,
"loss": 0.023,
"step": 8650
},
{
"epoch": 1.457774798927614,
"grad_norm": 0.39265647530555725,
"learning_rate": 1.709877578400134e-05,
"loss": 0.023,
"step": 8700
},
{
"epoch": 1.4661528150134049,
"grad_norm": 0.49499258399009705,
"learning_rate": 1.708200570182794e-05,
"loss": 0.025,
"step": 8750
},
{
"epoch": 1.4745308310991958,
"grad_norm": 0.38666218519210815,
"learning_rate": 1.7065235619654538e-05,
"loss": 0.0222,
"step": 8800
},
{
"epoch": 1.4829088471849867,
"grad_norm": 0.4817696809768677,
"learning_rate": 1.7048465537481137e-05,
"loss": 0.0248,
"step": 8850
},
{
"epoch": 1.4912868632707774,
"grad_norm": 0.5351291298866272,
"learning_rate": 1.7031695455307732e-05,
"loss": 0.0249,
"step": 8900
},
{
"epoch": 1.4996648793565683,
"grad_norm": 0.37309539318084717,
"learning_rate": 1.701492537313433e-05,
"loss": 0.0236,
"step": 8950
},
{
"epoch": 1.5080428954423593,
"grad_norm": 0.3458901345729828,
"learning_rate": 1.6998155290960926e-05,
"loss": 0.0234,
"step": 9000
},
{
"epoch": 1.5164209115281502,
"grad_norm": 0.4059881269931793,
"learning_rate": 1.6981385208787525e-05,
"loss": 0.0241,
"step": 9050
},
{
"epoch": 1.5247989276139409,
"grad_norm": 0.3939747214317322,
"learning_rate": 1.696461512661412e-05,
"loss": 0.0232,
"step": 9100
},
{
"epoch": 1.5331769436997318,
"grad_norm": 0.43895846605300903,
"learning_rate": 1.694784504444072e-05,
"loss": 0.0233,
"step": 9150
},
{
"epoch": 1.5415549597855227,
"grad_norm": 0.48546019196510315,
"learning_rate": 1.6931074962267314e-05,
"loss": 0.0231,
"step": 9200
},
{
"epoch": 1.5499329758713136,
"grad_norm": 0.7542991638183594,
"learning_rate": 1.6914304880093916e-05,
"loss": 0.0227,
"step": 9250
},
{
"epoch": 1.5583109919571045,
"grad_norm": 0.5190153121948242,
"learning_rate": 1.6897534797920512e-05,
"loss": 0.0257,
"step": 9300
},
{
"epoch": 1.5666890080428955,
"grad_norm": 0.36860191822052,
"learning_rate": 1.688076471574711e-05,
"loss": 0.0237,
"step": 9350
},
{
"epoch": 1.5750670241286864,
"grad_norm": 0.4412299394607544,
"learning_rate": 1.6863994633573706e-05,
"loss": 0.023,
"step": 9400
},
{
"epoch": 1.5834450402144773,
"grad_norm": 0.44900405406951904,
"learning_rate": 1.6847224551400304e-05,
"loss": 0.0219,
"step": 9450
},
{
"epoch": 1.5918230563002682,
"grad_norm": 0.4734587073326111,
"learning_rate": 1.68304544692269e-05,
"loss": 0.0244,
"step": 9500
},
{
"epoch": 1.6002010723860591,
"grad_norm": 0.4086250364780426,
"learning_rate": 1.68136843870535e-05,
"loss": 0.0233,
"step": 9550
},
{
"epoch": 1.6085790884718498,
"grad_norm": 0.33544018864631653,
"learning_rate": 1.6796914304880094e-05,
"loss": 0.0228,
"step": 9600
},
{
"epoch": 1.6169571045576407,
"grad_norm": 0.5728262066841125,
"learning_rate": 1.6780144222706692e-05,
"loss": 0.0247,
"step": 9650
},
{
"epoch": 1.6253351206434317,
"grad_norm": 0.40683993697166443,
"learning_rate": 1.676337414053329e-05,
"loss": 0.0236,
"step": 9700
},
{
"epoch": 1.6337131367292224,
"grad_norm": 0.5062201619148254,
"learning_rate": 1.6746604058359887e-05,
"loss": 0.0227,
"step": 9750
},
{
"epoch": 1.6420911528150133,
"grad_norm": 0.6048519015312195,
"learning_rate": 1.6729833976186485e-05,
"loss": 0.024,
"step": 9800
},
{
"epoch": 1.6504691689008042,
"grad_norm": 0.42808210849761963,
"learning_rate": 1.671306389401308e-05,
"loss": 0.0214,
"step": 9850
},
{
"epoch": 1.6588471849865951,
"grad_norm": 0.452362060546875,
"learning_rate": 1.669629381183968e-05,
"loss": 0.025,
"step": 9900
},
{
"epoch": 1.667225201072386,
"grad_norm": 0.45486265420913696,
"learning_rate": 1.6679523729666275e-05,
"loss": 0.0241,
"step": 9950
},
{
"epoch": 1.675603217158177,
"grad_norm": 0.39436477422714233,
"learning_rate": 1.6662753647492873e-05,
"loss": 0.0238,
"step": 10000
},
{
"epoch": 1.675603217158177,
"eval_loss": 0.042236872017383575,
"eval_runtime": 0.3154,
"eval_samples_per_second": 63.415,
"eval_steps_per_second": 3.171,
"step": 10000
},
{
"epoch": 1.6839812332439679,
"grad_norm": 0.3792615234851837,
"learning_rate": 1.6645983565319472e-05,
"loss": 0.0236,
"step": 10050
},
{
"epoch": 1.6923592493297588,
"grad_norm": 0.4568031132221222,
"learning_rate": 1.662921348314607e-05,
"loss": 0.0245,
"step": 10100
},
{
"epoch": 1.7007372654155497,
"grad_norm": 0.30801689624786377,
"learning_rate": 1.6612443400972666e-05,
"loss": 0.0242,
"step": 10150
},
{
"epoch": 1.7091152815013406,
"grad_norm": 0.404593825340271,
"learning_rate": 1.6595673318799265e-05,
"loss": 0.0244,
"step": 10200
},
{
"epoch": 1.7174932975871313,
"grad_norm": 0.41899484395980835,
"learning_rate": 1.657890323662586e-05,
"loss": 0.0237,
"step": 10250
},
{
"epoch": 1.7258713136729222,
"grad_norm": 0.7052549719810486,
"learning_rate": 1.656213315445246e-05,
"loss": 0.0219,
"step": 10300
},
{
"epoch": 1.7342493297587132,
"grad_norm": 0.6043505072593689,
"learning_rate": 1.6545363072279054e-05,
"loss": 0.023,
"step": 10350
},
{
"epoch": 1.742627345844504,
"grad_norm": 0.47521620988845825,
"learning_rate": 1.6528592990105653e-05,
"loss": 0.0213,
"step": 10400
},
{
"epoch": 1.7510053619302948,
"grad_norm": 0.43603143095970154,
"learning_rate": 1.651182290793225e-05,
"loss": 0.0222,
"step": 10450
},
{
"epoch": 1.7593833780160857,
"grad_norm": 0.5093141794204712,
"learning_rate": 1.6495052825758847e-05,
"loss": 0.0241,
"step": 10500
},
{
"epoch": 1.7677613941018766,
"grad_norm": 0.4269144535064697,
"learning_rate": 1.6478282743585445e-05,
"loss": 0.0225,
"step": 10550
},
{
"epoch": 1.7761394101876675,
"grad_norm": 0.3798427879810333,
"learning_rate": 1.646151266141204e-05,
"loss": 0.0246,
"step": 10600
},
{
"epoch": 1.7845174262734584,
"grad_norm": 0.35155215859413147,
"learning_rate": 1.644474257923864e-05,
"loss": 0.0223,
"step": 10650
},
{
"epoch": 1.7928954423592494,
"grad_norm": 0.3362865447998047,
"learning_rate": 1.6427972497065238e-05,
"loss": 0.0222,
"step": 10700
},
{
"epoch": 1.8012734584450403,
"grad_norm": 0.4176539480686188,
"learning_rate": 1.6411202414891834e-05,
"loss": 0.0221,
"step": 10750
},
{
"epoch": 1.8096514745308312,
"grad_norm": 0.38857483863830566,
"learning_rate": 1.6394432332718432e-05,
"loss": 0.0231,
"step": 10800
},
{
"epoch": 1.8180294906166221,
"grad_norm": 0.5508946776390076,
"learning_rate": 1.637766225054503e-05,
"loss": 0.0244,
"step": 10850
},
{
"epoch": 1.826407506702413,
"grad_norm": 0.23041021823883057,
"learning_rate": 1.6360892168371626e-05,
"loss": 0.0214,
"step": 10900
},
{
"epoch": 1.8347855227882037,
"grad_norm": 0.4554728865623474,
"learning_rate": 1.6344122086198225e-05,
"loss": 0.0216,
"step": 10950
},
{
"epoch": 1.8431635388739946,
"grad_norm": 0.3926387131214142,
"learning_rate": 1.632735200402482e-05,
"loss": 0.0231,
"step": 11000
},
{
"epoch": 1.8515415549597856,
"grad_norm": 0.4310173988342285,
"learning_rate": 1.631058192185142e-05,
"loss": 0.0217,
"step": 11050
},
{
"epoch": 1.8599195710455763,
"grad_norm": 0.5301809310913086,
"learning_rate": 1.6293811839678014e-05,
"loss": 0.0233,
"step": 11100
},
{
"epoch": 1.8682975871313672,
"grad_norm": 0.5201212167739868,
"learning_rate": 1.6277041757504613e-05,
"loss": 0.0238,
"step": 11150
},
{
"epoch": 1.876675603217158,
"grad_norm": 0.5420696139335632,
"learning_rate": 1.626027167533121e-05,
"loss": 0.0228,
"step": 11200
},
{
"epoch": 1.885053619302949,
"grad_norm": 0.449569433927536,
"learning_rate": 1.624350159315781e-05,
"loss": 0.0229,
"step": 11250
},
{
"epoch": 1.89343163538874,
"grad_norm": 0.41790249943733215,
"learning_rate": 1.6226731510984406e-05,
"loss": 0.0211,
"step": 11300
},
{
"epoch": 1.9018096514745308,
"grad_norm": 0.49417269229888916,
"learning_rate": 1.6209961428811004e-05,
"loss": 0.0238,
"step": 11350
},
{
"epoch": 1.9101876675603218,
"grad_norm": 0.7904441952705383,
"learning_rate": 1.61931913466376e-05,
"loss": 0.0238,
"step": 11400
},
{
"epoch": 1.9185656836461127,
"grad_norm": 0.5102431178092957,
"learning_rate": 1.61764212644642e-05,
"loss": 0.0234,
"step": 11450
},
{
"epoch": 1.9269436997319036,
"grad_norm": 0.5872859954833984,
"learning_rate": 1.6159651182290794e-05,
"loss": 0.023,
"step": 11500
},
{
"epoch": 1.9353217158176945,
"grad_norm": 0.4397691488265991,
"learning_rate": 1.6142881100117393e-05,
"loss": 0.0225,
"step": 11550
},
{
"epoch": 1.9436997319034852,
"grad_norm": 0.5159376263618469,
"learning_rate": 1.6126111017943988e-05,
"loss": 0.0236,
"step": 11600
},
{
"epoch": 1.9520777479892761,
"grad_norm": 0.5699421763420105,
"learning_rate": 1.6109340935770587e-05,
"loss": 0.0221,
"step": 11650
},
{
"epoch": 1.960455764075067,
"grad_norm": 0.5751481056213379,
"learning_rate": 1.6092570853597185e-05,
"loss": 0.0238,
"step": 11700
},
{
"epoch": 1.9688337801608577,
"grad_norm": 0.4952080249786377,
"learning_rate": 1.607580077142378e-05,
"loss": 0.0245,
"step": 11750
},
{
"epoch": 1.9772117962466487,
"grad_norm": 0.3852183520793915,
"learning_rate": 1.605903068925038e-05,
"loss": 0.0237,
"step": 11800
},
{
"epoch": 1.9855898123324396,
"grad_norm": 0.5378175973892212,
"learning_rate": 1.6042260607076975e-05,
"loss": 0.0217,
"step": 11850
},
{
"epoch": 1.9939678284182305,
"grad_norm": 0.48786741495132446,
"learning_rate": 1.6025490524903573e-05,
"loss": 0.0212,
"step": 11900
},
{
"epoch": 2.0023458445040214,
"grad_norm": 0.2508140206336975,
"learning_rate": 1.600872044273017e-05,
"loss": 0.0221,
"step": 11950
},
{
"epoch": 2.0107238605898123,
"grad_norm": 0.32956379652023315,
"learning_rate": 1.5991950360556767e-05,
"loss": 0.0115,
"step": 12000
},
{
"epoch": 2.0191018766756033,
"grad_norm": 0.22912301123142242,
"learning_rate": 1.5975180278383366e-05,
"loss": 0.0127,
"step": 12050
},
{
"epoch": 2.027479892761394,
"grad_norm": 0.29201629757881165,
"learning_rate": 1.5958410196209965e-05,
"loss": 0.0103,
"step": 12100
},
{
"epoch": 2.035857908847185,
"grad_norm": 0.3595946431159973,
"learning_rate": 1.594164011403656e-05,
"loss": 0.0105,
"step": 12150
},
{
"epoch": 2.044235924932976,
"grad_norm": 0.22679433226585388,
"learning_rate": 1.592487003186316e-05,
"loss": 0.0107,
"step": 12200
},
{
"epoch": 2.052613941018767,
"grad_norm": 0.40025532245635986,
"learning_rate": 1.5908099949689754e-05,
"loss": 0.0117,
"step": 12250
},
{
"epoch": 2.060991957104558,
"grad_norm": 0.32900357246398926,
"learning_rate": 1.5891329867516353e-05,
"loss": 0.0115,
"step": 12300
},
{
"epoch": 2.0693699731903483,
"grad_norm": 0.2236577868461609,
"learning_rate": 1.5874559785342948e-05,
"loss": 0.0123,
"step": 12350
},
{
"epoch": 2.0777479892761392,
"grad_norm": 0.3712753355503082,
"learning_rate": 1.5857789703169547e-05,
"loss": 0.0114,
"step": 12400
},
{
"epoch": 2.08612600536193,
"grad_norm": 0.4136362373828888,
"learning_rate": 1.5841019620996142e-05,
"loss": 0.011,
"step": 12450
},
{
"epoch": 2.094504021447721,
"grad_norm": 0.3658868074417114,
"learning_rate": 1.582424953882274e-05,
"loss": 0.0124,
"step": 12500
},
{
"epoch": 2.102882037533512,
"grad_norm": 0.44573381543159485,
"learning_rate": 1.580747945664934e-05,
"loss": 0.0112,
"step": 12550
},
{
"epoch": 2.111260053619303,
"grad_norm": 0.4188709557056427,
"learning_rate": 1.5790709374475935e-05,
"loss": 0.0115,
"step": 12600
},
{
"epoch": 2.119638069705094,
"grad_norm": 0.3570314645767212,
"learning_rate": 1.5773939292302534e-05,
"loss": 0.0112,
"step": 12650
},
{
"epoch": 2.1280160857908847,
"grad_norm": 0.3598877191543579,
"learning_rate": 1.5757169210129132e-05,
"loss": 0.0118,
"step": 12700
},
{
"epoch": 2.1363941018766757,
"grad_norm": 0.3769216239452362,
"learning_rate": 1.5740399127955728e-05,
"loss": 0.0107,
"step": 12750
},
{
"epoch": 2.1447721179624666,
"grad_norm": 0.2821277678012848,
"learning_rate": 1.5723629045782326e-05,
"loss": 0.0118,
"step": 12800
},
{
"epoch": 2.1531501340482575,
"grad_norm": 0.26597416400909424,
"learning_rate": 1.570685896360892e-05,
"loss": 0.0113,
"step": 12850
},
{
"epoch": 2.1615281501340484,
"grad_norm": 0.26788029074668884,
"learning_rate": 1.569008888143552e-05,
"loss": 0.0119,
"step": 12900
},
{
"epoch": 2.1699061662198393,
"grad_norm": 0.4225537180900574,
"learning_rate": 1.567331879926212e-05,
"loss": 0.0111,
"step": 12950
},
{
"epoch": 2.17828418230563,
"grad_norm": 0.2967151403427124,
"learning_rate": 1.5656548717088714e-05,
"loss": 0.0116,
"step": 13000
},
{
"epoch": 2.1866621983914207,
"grad_norm": 0.37873271107673645,
"learning_rate": 1.5639778634915313e-05,
"loss": 0.0122,
"step": 13050
},
{
"epoch": 2.1950402144772116,
"grad_norm": 0.3496306836605072,
"learning_rate": 1.562300855274191e-05,
"loss": 0.0115,
"step": 13100
},
{
"epoch": 2.2034182305630026,
"grad_norm": 0.2340189516544342,
"learning_rate": 1.5606238470568507e-05,
"loss": 0.0114,
"step": 13150
},
{
"epoch": 2.2117962466487935,
"grad_norm": 0.34111320972442627,
"learning_rate": 1.5589468388395102e-05,
"loss": 0.0106,
"step": 13200
},
{
"epoch": 2.2201742627345844,
"grad_norm": 0.4557114839553833,
"learning_rate": 1.55726983062217e-05,
"loss": 0.0124,
"step": 13250
},
{
"epoch": 2.2285522788203753,
"grad_norm": 0.3776351511478424,
"learning_rate": 1.55559282240483e-05,
"loss": 0.0109,
"step": 13300
},
{
"epoch": 2.2369302949061662,
"grad_norm": 0.49314960837364197,
"learning_rate": 1.55391581418749e-05,
"loss": 0.0127,
"step": 13350
},
{
"epoch": 2.245308310991957,
"grad_norm": 0.2994402348995209,
"learning_rate": 1.5522388059701494e-05,
"loss": 0.0123,
"step": 13400
},
{
"epoch": 2.253686327077748,
"grad_norm": 0.6113381385803223,
"learning_rate": 1.5505617977528093e-05,
"loss": 0.0122,
"step": 13450
},
{
"epoch": 2.262064343163539,
"grad_norm": 0.43357163667678833,
"learning_rate": 1.5488847895354688e-05,
"loss": 0.0109,
"step": 13500
},
{
"epoch": 2.27044235924933,
"grad_norm": 0.5021244287490845,
"learning_rate": 1.5472077813181287e-05,
"loss": 0.013,
"step": 13550
},
{
"epoch": 2.278820375335121,
"grad_norm": 0.4794227182865143,
"learning_rate": 1.5455307731007882e-05,
"loss": 0.0125,
"step": 13600
},
{
"epoch": 2.2871983914209117,
"grad_norm": 0.2409118264913559,
"learning_rate": 1.543853764883448e-05,
"loss": 0.012,
"step": 13650
},
{
"epoch": 2.295576407506702,
"grad_norm": 0.36879080533981323,
"learning_rate": 1.542176756666108e-05,
"loss": 0.0101,
"step": 13700
},
{
"epoch": 2.303954423592493,
"grad_norm": 0.2825350761413574,
"learning_rate": 1.5404997484487675e-05,
"loss": 0.0113,
"step": 13750
},
{
"epoch": 2.312332439678284,
"grad_norm": 0.5339875221252441,
"learning_rate": 1.5388227402314273e-05,
"loss": 0.0115,
"step": 13800
},
{
"epoch": 2.320710455764075,
"grad_norm": 0.5463636517524719,
"learning_rate": 1.537145732014087e-05,
"loss": 0.0127,
"step": 13850
},
{
"epoch": 2.329088471849866,
"grad_norm": 0.37746766209602356,
"learning_rate": 1.5354687237967467e-05,
"loss": 0.0116,
"step": 13900
},
{
"epoch": 2.337466487935657,
"grad_norm": 0.6131693124771118,
"learning_rate": 1.5337917155794063e-05,
"loss": 0.0115,
"step": 13950
},
{
"epoch": 2.3458445040214477,
"grad_norm": 0.3330284059047699,
"learning_rate": 1.532114707362066e-05,
"loss": 0.0111,
"step": 14000
},
{
"epoch": 2.3542225201072386,
"grad_norm": 0.47551050782203674,
"learning_rate": 1.530437699144726e-05,
"loss": 0.0117,
"step": 14050
},
{
"epoch": 2.3626005361930296,
"grad_norm": 0.5559821128845215,
"learning_rate": 1.528760690927386e-05,
"loss": 0.0134,
"step": 14100
},
{
"epoch": 2.3709785522788205,
"grad_norm": 0.32303518056869507,
"learning_rate": 1.5270836827100454e-05,
"loss": 0.012,
"step": 14150
},
{
"epoch": 2.3793565683646114,
"grad_norm": 0.4595315754413605,
"learning_rate": 1.5254066744927051e-05,
"loss": 0.012,
"step": 14200
},
{
"epoch": 2.3877345844504023,
"grad_norm": 0.5437060594558716,
"learning_rate": 1.5237296662753648e-05,
"loss": 0.0124,
"step": 14250
},
{
"epoch": 2.396112600536193,
"grad_norm": 0.3886863589286804,
"learning_rate": 1.5220526580580245e-05,
"loss": 0.0129,
"step": 14300
},
{
"epoch": 2.4044906166219837,
"grad_norm": 0.5083261132240295,
"learning_rate": 1.5203756498406844e-05,
"loss": 0.0134,
"step": 14350
},
{
"epoch": 2.4128686327077746,
"grad_norm": 0.35092031955718994,
"learning_rate": 1.5186986416233441e-05,
"loss": 0.0116,
"step": 14400
},
{
"epoch": 2.4212466487935655,
"grad_norm": 0.4511415660381317,
"learning_rate": 1.5170216334060038e-05,
"loss": 0.013,
"step": 14450
},
{
"epoch": 2.4296246648793565,
"grad_norm": 0.5314837694168091,
"learning_rate": 1.5153446251886637e-05,
"loss": 0.0128,
"step": 14500
},
{
"epoch": 2.4380026809651474,
"grad_norm": 0.3129260540008545,
"learning_rate": 1.5136676169713234e-05,
"loss": 0.0121,
"step": 14550
},
{
"epoch": 2.4463806970509383,
"grad_norm": 0.3153856694698334,
"learning_rate": 1.511990608753983e-05,
"loss": 0.0138,
"step": 14600
},
{
"epoch": 2.454758713136729,
"grad_norm": 0.8036394715309143,
"learning_rate": 1.5103136005366428e-05,
"loss": 0.0121,
"step": 14650
},
{
"epoch": 2.46313672922252,
"grad_norm": 0.50925213098526,
"learning_rate": 1.5086365923193025e-05,
"loss": 0.0124,
"step": 14700
},
{
"epoch": 2.471514745308311,
"grad_norm": 0.5606102347373962,
"learning_rate": 1.5069595841019622e-05,
"loss": 0.0124,
"step": 14750
},
{
"epoch": 2.479892761394102,
"grad_norm": 0.5037418603897095,
"learning_rate": 1.5052825758846219e-05,
"loss": 0.0136,
"step": 14800
},
{
"epoch": 2.488270777479893,
"grad_norm": 0.3871222138404846,
"learning_rate": 1.5036055676672816e-05,
"loss": 0.0129,
"step": 14850
},
{
"epoch": 2.496648793565684,
"grad_norm": 0.5839509963989258,
"learning_rate": 1.5019285594499416e-05,
"loss": 0.0123,
"step": 14900
},
{
"epoch": 2.5050268096514747,
"grad_norm": 0.7268586754798889,
"learning_rate": 1.5002515512326013e-05,
"loss": 0.012,
"step": 14950
},
{
"epoch": 2.5134048257372656,
"grad_norm": 0.3473876118659973,
"learning_rate": 1.498574543015261e-05,
"loss": 0.0126,
"step": 15000
},
{
"epoch": 2.5217828418230566,
"grad_norm": 0.49601665139198303,
"learning_rate": 1.4968975347979207e-05,
"loss": 0.0121,
"step": 15050
},
{
"epoch": 2.530160857908847,
"grad_norm": 0.23973305523395538,
"learning_rate": 1.4952205265805804e-05,
"loss": 0.0111,
"step": 15100
},
{
"epoch": 2.538538873994638,
"grad_norm": 0.5663930177688599,
"learning_rate": 1.4935435183632401e-05,
"loss": 0.0106,
"step": 15150
},
{
"epoch": 2.546916890080429,
"grad_norm": 0.24828468263149261,
"learning_rate": 1.4918665101458998e-05,
"loss": 0.0132,
"step": 15200
},
{
"epoch": 2.55529490616622,
"grad_norm": 0.17071287333965302,
"learning_rate": 1.4901895019285595e-05,
"loss": 0.0132,
"step": 15250
},
{
"epoch": 2.5636729222520107,
"grad_norm": 0.5064595937728882,
"learning_rate": 1.4885124937112192e-05,
"loss": 0.013,
"step": 15300
},
{
"epoch": 2.5720509383378016,
"grad_norm": 0.48466721177101135,
"learning_rate": 1.4868354854938791e-05,
"loss": 0.0112,
"step": 15350
},
{
"epoch": 2.5804289544235925,
"grad_norm": 0.6913251876831055,
"learning_rate": 1.4851584772765388e-05,
"loss": 0.0129,
"step": 15400
},
{
"epoch": 2.5888069705093835,
"grad_norm": 0.4608655869960785,
"learning_rate": 1.4834814690591985e-05,
"loss": 0.0125,
"step": 15450
},
{
"epoch": 2.5971849865951744,
"grad_norm": 0.5575762391090393,
"learning_rate": 1.4818044608418582e-05,
"loss": 0.0122,
"step": 15500
},
{
"epoch": 2.6055630026809653,
"grad_norm": 0.3975880444049835,
"learning_rate": 1.4801274526245179e-05,
"loss": 0.0115,
"step": 15550
},
{
"epoch": 2.6139410187667558,
"grad_norm": 0.4161764681339264,
"learning_rate": 1.4784504444071776e-05,
"loss": 0.0143,
"step": 15600
},
{
"epoch": 2.6223190348525467,
"grad_norm": 0.5338849425315857,
"learning_rate": 1.4767734361898373e-05,
"loss": 0.0127,
"step": 15650
},
{
"epoch": 2.6306970509383376,
"grad_norm": 0.3128230571746826,
"learning_rate": 1.475096427972497e-05,
"loss": 0.0135,
"step": 15700
},
{
"epoch": 2.6390750670241285,
"grad_norm": 0.5028887987136841,
"learning_rate": 1.473419419755157e-05,
"loss": 0.0117,
"step": 15750
},
{
"epoch": 2.6474530831099194,
"grad_norm": 0.3744266927242279,
"learning_rate": 1.4717424115378167e-05,
"loss": 0.0115,
"step": 15800
},
{
"epoch": 2.6558310991957104,
"grad_norm": 0.4230741560459137,
"learning_rate": 1.4700654033204764e-05,
"loss": 0.013,
"step": 15850
},
{
"epoch": 2.6642091152815013,
"grad_norm": 0.49401816725730896,
"learning_rate": 1.4683883951031361e-05,
"loss": 0.0115,
"step": 15900
},
{
"epoch": 2.672587131367292,
"grad_norm": 0.4584721028804779,
"learning_rate": 1.4667113868857958e-05,
"loss": 0.0133,
"step": 15950
},
{
"epoch": 2.680965147453083,
"grad_norm": 0.760981559753418,
"learning_rate": 1.4650343786684555e-05,
"loss": 0.0127,
"step": 16000
},
{
"epoch": 2.689343163538874,
"grad_norm": 0.37186485528945923,
"learning_rate": 1.4633573704511152e-05,
"loss": 0.0134,
"step": 16050
},
{
"epoch": 2.697721179624665,
"grad_norm": 0.62066251039505,
"learning_rate": 1.461680362233775e-05,
"loss": 0.0141,
"step": 16100
},
{
"epoch": 2.706099195710456,
"grad_norm": 0.3157498240470886,
"learning_rate": 1.4600033540164348e-05,
"loss": 0.0118,
"step": 16150
},
{
"epoch": 2.714477211796247,
"grad_norm": 0.4527428448200226,
"learning_rate": 1.4583263457990945e-05,
"loss": 0.0134,
"step": 16200
},
{
"epoch": 2.7228552278820377,
"grad_norm": 0.31555086374282837,
"learning_rate": 1.4566493375817544e-05,
"loss": 0.0132,
"step": 16250
},
{
"epoch": 2.7312332439678286,
"grad_norm": 0.44448813796043396,
"learning_rate": 1.4549723293644141e-05,
"loss": 0.0124,
"step": 16300
},
{
"epoch": 2.7396112600536195,
"grad_norm": 0.4281978905200958,
"learning_rate": 1.4532953211470738e-05,
"loss": 0.0122,
"step": 16350
},
{
"epoch": 2.7479892761394105,
"grad_norm": 0.45892074704170227,
"learning_rate": 1.4516183129297335e-05,
"loss": 0.012,
"step": 16400
},
{
"epoch": 2.756367292225201,
"grad_norm": 0.30029842257499695,
"learning_rate": 1.4499413047123932e-05,
"loss": 0.0119,
"step": 16450
},
{
"epoch": 2.764745308310992,
"grad_norm": 0.3950155973434448,
"learning_rate": 1.4482642964950529e-05,
"loss": 0.0136,
"step": 16500
},
{
"epoch": 2.7731233243967828,
"grad_norm": 0.4550629258155823,
"learning_rate": 1.4465872882777128e-05,
"loss": 0.0122,
"step": 16550
},
{
"epoch": 2.7815013404825737,
"grad_norm": 0.5514039397239685,
"learning_rate": 1.4449102800603725e-05,
"loss": 0.0135,
"step": 16600
},
{
"epoch": 2.7898793565683646,
"grad_norm": 0.5131493806838989,
"learning_rate": 1.4432332718430322e-05,
"loss": 0.0132,
"step": 16650
},
{
"epoch": 2.7982573726541555,
"grad_norm": 0.39987483620643616,
"learning_rate": 1.4415562636256919e-05,
"loss": 0.0126,
"step": 16700
},
{
"epoch": 2.8066353887399464,
"grad_norm": 0.5557750463485718,
"learning_rate": 1.4398792554083516e-05,
"loss": 0.0121,
"step": 16750
},
{
"epoch": 2.8150134048257374,
"grad_norm": 0.34864020347595215,
"learning_rate": 1.4382022471910113e-05,
"loss": 0.0129,
"step": 16800
},
{
"epoch": 2.8233914209115283,
"grad_norm": 0.4396969974040985,
"learning_rate": 1.436525238973671e-05,
"loss": 0.0137,
"step": 16850
},
{
"epoch": 2.831769436997319,
"grad_norm": 0.4104606509208679,
"learning_rate": 1.4348482307563307e-05,
"loss": 0.0146,
"step": 16900
},
{
"epoch": 2.8401474530831097,
"grad_norm": 0.6937008500099182,
"learning_rate": 1.4331712225389907e-05,
"loss": 0.0123,
"step": 16950
},
{
"epoch": 2.8485254691689006,
"grad_norm": 0.5880556106567383,
"learning_rate": 1.4314942143216504e-05,
"loss": 0.0131,
"step": 17000
},
{
"epoch": 2.8569034852546915,
"grad_norm": 0.4264618158340454,
"learning_rate": 1.4298172061043101e-05,
"loss": 0.0133,
"step": 17050
},
{
"epoch": 2.8652815013404824,
"grad_norm": 0.5207853317260742,
"learning_rate": 1.4281401978869698e-05,
"loss": 0.0137,
"step": 17100
},
{
"epoch": 2.8736595174262733,
"grad_norm": 0.4656062424182892,
"learning_rate": 1.4264631896696295e-05,
"loss": 0.0145,
"step": 17150
},
{
"epoch": 2.8820375335120643,
"grad_norm": 0.38702937960624695,
"learning_rate": 1.4247861814522892e-05,
"loss": 0.0122,
"step": 17200
},
{
"epoch": 2.890415549597855,
"grad_norm": 0.246555358171463,
"learning_rate": 1.423109173234949e-05,
"loss": 0.0135,
"step": 17250
},
{
"epoch": 2.898793565683646,
"grad_norm": 0.2863421142101288,
"learning_rate": 1.4214321650176086e-05,
"loss": 0.0132,
"step": 17300
},
{
"epoch": 2.907171581769437,
"grad_norm": 0.31063777208328247,
"learning_rate": 1.4197551568002685e-05,
"loss": 0.0122,
"step": 17350
},
{
"epoch": 2.915549597855228,
"grad_norm": 0.5885173082351685,
"learning_rate": 1.4180781485829282e-05,
"loss": 0.0117,
"step": 17400
},
{
"epoch": 2.923927613941019,
"grad_norm": 0.41046226024627686,
"learning_rate": 1.4164011403655879e-05,
"loss": 0.0136,
"step": 17450
},
{
"epoch": 2.9323056300268098,
"grad_norm": 0.45641854405403137,
"learning_rate": 1.4147241321482476e-05,
"loss": 0.0136,
"step": 17500
},
{
"epoch": 2.9406836461126007,
"grad_norm": 0.3291575610637665,
"learning_rate": 1.4130471239309073e-05,
"loss": 0.0128,
"step": 17550
},
{
"epoch": 2.9490616621983916,
"grad_norm": 0.4031969904899597,
"learning_rate": 1.411370115713567e-05,
"loss": 0.0144,
"step": 17600
},
{
"epoch": 2.9574396782841825,
"grad_norm": 0.4048541486263275,
"learning_rate": 1.4096931074962267e-05,
"loss": 0.0131,
"step": 17650
},
{
"epoch": 2.9658176943699734,
"grad_norm": 0.42356961965560913,
"learning_rate": 1.4080160992788866e-05,
"loss": 0.0136,
"step": 17700
},
{
"epoch": 2.974195710455764,
"grad_norm": 0.499991774559021,
"learning_rate": 1.4063390910615464e-05,
"loss": 0.0129,
"step": 17750
},
{
"epoch": 2.982573726541555,
"grad_norm": 0.4582955837249756,
"learning_rate": 1.4046620828442061e-05,
"loss": 0.0128,
"step": 17800
},
{
"epoch": 2.9909517426273458,
"grad_norm": 0.40763500332832336,
"learning_rate": 1.4029850746268658e-05,
"loss": 0.0125,
"step": 17850
},
{
"epoch": 2.9993297587131367,
"grad_norm": 0.2882692515850067,
"learning_rate": 1.4013080664095256e-05,
"loss": 0.0131,
"step": 17900
},
{
"epoch": 3.0077077747989276,
"grad_norm": 0.25621238350868225,
"learning_rate": 1.3996310581921853e-05,
"loss": 0.007,
"step": 17950
},
{
"epoch": 3.0160857908847185,
"grad_norm": 0.2496500015258789,
"learning_rate": 1.397954049974845e-05,
"loss": 0.0058,
"step": 18000
},
{
"epoch": 3.0244638069705094,
"grad_norm": 0.5392020344734192,
"learning_rate": 1.3962770417575047e-05,
"loss": 0.0064,
"step": 18050
},
{
"epoch": 3.0328418230563003,
"grad_norm": 0.142150416970253,
"learning_rate": 1.3946000335401644e-05,
"loss": 0.0049,
"step": 18100
},
{
"epoch": 3.0412198391420913,
"grad_norm": 0.2814841866493225,
"learning_rate": 1.392923025322824e-05,
"loss": 0.0048,
"step": 18150
},
{
"epoch": 3.049597855227882,
"grad_norm": 0.9062692523002625,
"learning_rate": 1.3912460171054841e-05,
"loss": 0.0054,
"step": 18200
},
{
"epoch": 3.057975871313673,
"grad_norm": 0.17520900070667267,
"learning_rate": 1.3895690088881438e-05,
"loss": 0.0047,
"step": 18250
},
{
"epoch": 3.066353887399464,
"grad_norm": 0.2684191167354584,
"learning_rate": 1.3878920006708035e-05,
"loss": 0.0055,
"step": 18300
},
{
"epoch": 3.0747319034852545,
"grad_norm": 0.2762264013290405,
"learning_rate": 1.3862149924534632e-05,
"loss": 0.006,
"step": 18350
},
{
"epoch": 3.0831099195710454,
"grad_norm": 0.16580019891262054,
"learning_rate": 1.3845379842361229e-05,
"loss": 0.0053,
"step": 18400
},
{
"epoch": 3.0914879356568363,
"grad_norm": 0.30021271109580994,
"learning_rate": 1.3828609760187826e-05,
"loss": 0.0052,
"step": 18450
},
{
"epoch": 3.0998659517426272,
"grad_norm": 0.3511424958705902,
"learning_rate": 1.3811839678014423e-05,
"loss": 0.0053,
"step": 18500
},
{
"epoch": 3.108243967828418,
"grad_norm": 0.2431810349225998,
"learning_rate": 1.379506959584102e-05,
"loss": 0.0054,
"step": 18550
},
{
"epoch": 3.116621983914209,
"grad_norm": 0.2419600486755371,
"learning_rate": 1.3778299513667619e-05,
"loss": 0.0053,
"step": 18600
},
{
"epoch": 3.125,
"grad_norm": 0.3268046975135803,
"learning_rate": 1.3761529431494216e-05,
"loss": 0.0055,
"step": 18650
},
{
"epoch": 3.133378016085791,
"grad_norm": 0.38957932591438293,
"learning_rate": 1.3744759349320813e-05,
"loss": 0.0051,
"step": 18700
},
{
"epoch": 3.141756032171582,
"grad_norm": 0.31418824195861816,
"learning_rate": 1.372798926714741e-05,
"loss": 0.0052,
"step": 18750
},
{
"epoch": 3.1501340482573728,
"grad_norm": 0.3322865068912506,
"learning_rate": 1.3711219184974007e-05,
"loss": 0.0051,
"step": 18800
},
{
"epoch": 3.1585120643431637,
"grad_norm": 0.22010941803455353,
"learning_rate": 1.3694449102800604e-05,
"loss": 0.0053,
"step": 18850
},
{
"epoch": 3.1668900804289546,
"grad_norm": 0.23425912857055664,
"learning_rate": 1.3677679020627201e-05,
"loss": 0.0055,
"step": 18900
},
{
"epoch": 3.1752680965147455,
"grad_norm": 0.30269861221313477,
"learning_rate": 1.3660908938453798e-05,
"loss": 0.0051,
"step": 18950
},
{
"epoch": 3.1836461126005364,
"grad_norm": 0.28305545449256897,
"learning_rate": 1.3644138856280398e-05,
"loss": 0.0057,
"step": 19000
},
{
"epoch": 3.192024128686327,
"grad_norm": 0.313149631023407,
"learning_rate": 1.3627368774106995e-05,
"loss": 0.0048,
"step": 19050
},
{
"epoch": 3.200402144772118,
"grad_norm": 0.30681276321411133,
"learning_rate": 1.3610598691933592e-05,
"loss": 0.0056,
"step": 19100
},
{
"epoch": 3.2087801608579087,
"grad_norm": 0.17815206944942474,
"learning_rate": 1.359382860976019e-05,
"loss": 0.0055,
"step": 19150
},
{
"epoch": 3.2171581769436997,
"grad_norm": 0.29173994064331055,
"learning_rate": 1.3577058527586786e-05,
"loss": 0.0057,
"step": 19200
},
{
"epoch": 3.2255361930294906,
"grad_norm": 0.3214263916015625,
"learning_rate": 1.3560288445413383e-05,
"loss": 0.0063,
"step": 19250
},
{
"epoch": 3.2339142091152815,
"grad_norm": 0.2251535803079605,
"learning_rate": 1.354351836323998e-05,
"loss": 0.006,
"step": 19300
},
{
"epoch": 3.2422922252010724,
"grad_norm": 0.4358842372894287,
"learning_rate": 1.3526748281066577e-05,
"loss": 0.0047,
"step": 19350
},
{
"epoch": 3.2506702412868633,
"grad_norm": 0.23471078276634216,
"learning_rate": 1.3509978198893176e-05,
"loss": 0.0058,
"step": 19400
},
{
"epoch": 3.2590482573726542,
"grad_norm": 0.28291311860084534,
"learning_rate": 1.3493208116719773e-05,
"loss": 0.005,
"step": 19450
},
{
"epoch": 3.267426273458445,
"grad_norm": 0.23490838706493378,
"learning_rate": 1.347643803454637e-05,
"loss": 0.0055,
"step": 19500
},
{
"epoch": 3.275804289544236,
"grad_norm": 0.3439931571483612,
"learning_rate": 1.3459667952372967e-05,
"loss": 0.0053,
"step": 19550
},
{
"epoch": 3.284182305630027,
"grad_norm": 0.19748039543628693,
"learning_rate": 1.3442897870199564e-05,
"loss": 0.0054,
"step": 19600
},
{
"epoch": 3.2925603217158175,
"grad_norm": 0.3718995749950409,
"learning_rate": 1.3426127788026163e-05,
"loss": 0.0051,
"step": 19650
},
{
"epoch": 3.3009383378016084,
"grad_norm": 0.49980103969573975,
"learning_rate": 1.340935770585276e-05,
"loss": 0.0056,
"step": 19700
},
{
"epoch": 3.3093163538873993,
"grad_norm": 0.5253378748893738,
"learning_rate": 1.3392587623679357e-05,
"loss": 0.0052,
"step": 19750
},
{
"epoch": 3.3176943699731902,
"grad_norm": 0.14330442249774933,
"learning_rate": 1.3375817541505956e-05,
"loss": 0.0058,
"step": 19800
},
{
"epoch": 3.326072386058981,
"grad_norm": 0.2218172401189804,
"learning_rate": 1.3359047459332553e-05,
"loss": 0.006,
"step": 19850
},
{
"epoch": 3.334450402144772,
"grad_norm": 0.4102313816547394,
"learning_rate": 1.334227737715915e-05,
"loss": 0.0062,
"step": 19900
},
{
"epoch": 3.342828418230563,
"grad_norm": 0.37395352125167847,
"learning_rate": 1.3325507294985747e-05,
"loss": 0.0061,
"step": 19950
},
{
"epoch": 3.351206434316354,
"grad_norm": 0.2626063823699951,
"learning_rate": 1.3308737212812344e-05,
"loss": 0.0053,
"step": 20000
},
{
"epoch": 3.351206434316354,
"eval_loss": 0.03919154778122902,
"eval_runtime": 0.3143,
"eval_samples_per_second": 63.636,
"eval_steps_per_second": 3.182,
"step": 20000
},
{
"epoch": 3.359584450402145,
"grad_norm": 0.1779392808675766,
"learning_rate": 1.329196713063894e-05,
"loss": 0.0059,
"step": 20050
},
{
"epoch": 3.3679624664879357,
"grad_norm": 0.37775570154190063,
"learning_rate": 1.3275197048465538e-05,
"loss": 0.0064,
"step": 20100
},
{
"epoch": 3.3763404825737267,
"grad_norm": 0.26373809576034546,
"learning_rate": 1.3258426966292135e-05,
"loss": 0.0064,
"step": 20150
},
{
"epoch": 3.3847184986595176,
"grad_norm": 0.330445259809494,
"learning_rate": 1.3241656884118735e-05,
"loss": 0.0063,
"step": 20200
},
{
"epoch": 3.3930965147453085,
"grad_norm": 0.294837087392807,
"learning_rate": 1.3224886801945332e-05,
"loss": 0.0054,
"step": 20250
},
{
"epoch": 3.4014745308310994,
"grad_norm": 0.16401290893554688,
"learning_rate": 1.3208116719771929e-05,
"loss": 0.0057,
"step": 20300
},
{
"epoch": 3.4098525469168903,
"grad_norm": 0.5002830624580383,
"learning_rate": 1.3191346637598526e-05,
"loss": 0.0058,
"step": 20350
},
{
"epoch": 3.418230563002681,
"grad_norm": 0.321429580450058,
"learning_rate": 1.3174576555425123e-05,
"loss": 0.0067,
"step": 20400
},
{
"epoch": 3.4266085790884717,
"grad_norm": 0.28548842668533325,
"learning_rate": 1.315780647325172e-05,
"loss": 0.0055,
"step": 20450
},
{
"epoch": 3.4349865951742626,
"grad_norm": 0.20685793459415436,
"learning_rate": 1.3141036391078317e-05,
"loss": 0.0057,
"step": 20500
},
{
"epoch": 3.4433646112600536,
"grad_norm": 0.30995652079582214,
"learning_rate": 1.3124266308904914e-05,
"loss": 0.0055,
"step": 20550
},
{
"epoch": 3.4517426273458445,
"grad_norm": 0.41536813974380493,
"learning_rate": 1.3107496226731513e-05,
"loss": 0.0062,
"step": 20600
},
{
"epoch": 3.4601206434316354,
"grad_norm": 0.29047590494155884,
"learning_rate": 1.309072614455811e-05,
"loss": 0.0063,
"step": 20650
},
{
"epoch": 3.4684986595174263,
"grad_norm": 0.23248636722564697,
"learning_rate": 1.3073956062384707e-05,
"loss": 0.006,
"step": 20700
},
{
"epoch": 3.4768766756032172,
"grad_norm": 0.2018858790397644,
"learning_rate": 1.3057185980211304e-05,
"loss": 0.0055,
"step": 20750
},
{
"epoch": 3.485254691689008,
"grad_norm": 0.3976786732673645,
"learning_rate": 1.3040415898037901e-05,
"loss": 0.0053,
"step": 20800
},
{
"epoch": 3.493632707774799,
"grad_norm": 0.34822383522987366,
"learning_rate": 1.3023645815864498e-05,
"loss": 0.0065,
"step": 20850
},
{
"epoch": 3.5020107238605895,
"grad_norm": 0.23607690632343292,
"learning_rate": 1.3006875733691095e-05,
"loss": 0.0059,
"step": 20900
},
{
"epoch": 3.5103887399463805,
"grad_norm": 0.40768417716026306,
"learning_rate": 1.2990105651517692e-05,
"loss": 0.0057,
"step": 20950
},
{
"epoch": 3.5187667560321714,
"grad_norm": 0.42595741152763367,
"learning_rate": 1.2973335569344292e-05,
"loss": 0.0066,
"step": 21000
},
{
"epoch": 3.5271447721179623,
"grad_norm": 0.4516412615776062,
"learning_rate": 1.295656548717089e-05,
"loss": 0.0062,
"step": 21050
},
{
"epoch": 3.535522788203753,
"grad_norm": 0.42684000730514526,
"learning_rate": 1.2939795404997486e-05,
"loss": 0.0059,
"step": 21100
},
{
"epoch": 3.543900804289544,
"grad_norm": 0.5775489211082458,
"learning_rate": 1.2923025322824083e-05,
"loss": 0.0054,
"step": 21150
},
{
"epoch": 3.552278820375335,
"grad_norm": 0.7901192307472229,
"learning_rate": 1.290625524065068e-05,
"loss": 0.0052,
"step": 21200
},
{
"epoch": 3.560656836461126,
"grad_norm": 0.2339819371700287,
"learning_rate": 1.2889485158477277e-05,
"loss": 0.006,
"step": 21250
},
{
"epoch": 3.569034852546917,
"grad_norm": 0.34695181250572205,
"learning_rate": 1.2872715076303874e-05,
"loss": 0.005,
"step": 21300
},
{
"epoch": 3.577412868632708,
"grad_norm": 0.3339728116989136,
"learning_rate": 1.2855944994130471e-05,
"loss": 0.0056,
"step": 21350
},
{
"epoch": 3.5857908847184987,
"grad_norm": 0.22279733419418335,
"learning_rate": 1.2839174911957068e-05,
"loss": 0.0062,
"step": 21400
},
{
"epoch": 3.5941689008042896,
"grad_norm": 0.2896275222301483,
"learning_rate": 1.2822404829783667e-05,
"loss": 0.0062,
"step": 21450
},
{
"epoch": 3.6025469168900806,
"grad_norm": 0.4125616252422333,
"learning_rate": 1.2805634747610264e-05,
"loss": 0.0064,
"step": 21500
},
{
"epoch": 3.6109249329758715,
"grad_norm": 0.3267725110054016,
"learning_rate": 1.2788864665436861e-05,
"loss": 0.0065,
"step": 21550
},
{
"epoch": 3.6193029490616624,
"grad_norm": 0.4519464373588562,
"learning_rate": 1.277209458326346e-05,
"loss": 0.0062,
"step": 21600
},
{
"epoch": 3.6276809651474533,
"grad_norm": 0.4503564238548279,
"learning_rate": 1.2755324501090057e-05,
"loss": 0.0068,
"step": 21650
},
{
"epoch": 3.6360589812332442,
"grad_norm": 0.14587004482746124,
"learning_rate": 1.2738554418916654e-05,
"loss": 0.0056,
"step": 21700
},
{
"epoch": 3.6444369973190347,
"grad_norm": 0.3932003378868103,
"learning_rate": 1.2721784336743251e-05,
"loss": 0.0055,
"step": 21750
},
{
"epoch": 3.6528150134048256,
"grad_norm": 0.41266146302223206,
"learning_rate": 1.2705014254569848e-05,
"loss": 0.0058,
"step": 21800
},
{
"epoch": 3.6611930294906165,
"grad_norm": 0.40585076808929443,
"learning_rate": 1.2688244172396447e-05,
"loss": 0.0057,
"step": 21850
},
{
"epoch": 3.6695710455764075,
"grad_norm": 0.4181327819824219,
"learning_rate": 1.2671474090223044e-05,
"loss": 0.0069,
"step": 21900
},
{
"epoch": 3.6779490616621984,
"grad_norm": 0.20495828986167908,
"learning_rate": 1.265470400804964e-05,
"loss": 0.0066,
"step": 21950
},
{
"epoch": 3.6863270777479893,
"grad_norm": 0.22110895812511444,
"learning_rate": 1.2637933925876238e-05,
"loss": 0.006,
"step": 22000
},
{
"epoch": 3.69470509383378,
"grad_norm": 0.43901216983795166,
"learning_rate": 1.2621163843702835e-05,
"loss": 0.0064,
"step": 22050
},
{
"epoch": 3.703083109919571,
"grad_norm": 0.34933629631996155,
"learning_rate": 1.2604393761529432e-05,
"loss": 0.0062,
"step": 22100
},
{
"epoch": 3.711461126005362,
"grad_norm": 0.3028928339481354,
"learning_rate": 1.2587623679356029e-05,
"loss": 0.0065,
"step": 22150
},
{
"epoch": 3.719839142091153,
"grad_norm": 0.25583240389823914,
"learning_rate": 1.2570853597182626e-05,
"loss": 0.0067,
"step": 22200
},
{
"epoch": 3.7282171581769434,
"grad_norm": 0.15429948270320892,
"learning_rate": 1.2554083515009226e-05,
"loss": 0.0067,
"step": 22250
},
{
"epoch": 3.7365951742627344,
"grad_norm": 0.35330894589424133,
"learning_rate": 1.2537313432835823e-05,
"loss": 0.0073,
"step": 22300
},
{
"epoch": 3.7449731903485253,
"grad_norm": 0.5058137774467468,
"learning_rate": 1.252054335066242e-05,
"loss": 0.0061,
"step": 22350
},
{
"epoch": 3.753351206434316,
"grad_norm": 0.27442070841789246,
"learning_rate": 1.2503773268489017e-05,
"loss": 0.0072,
"step": 22400
},
{
"epoch": 3.761729222520107,
"grad_norm": 0.34210237860679626,
"learning_rate": 1.2487003186315614e-05,
"loss": 0.0059,
"step": 22450
},
{
"epoch": 3.770107238605898,
"grad_norm": 0.36563077569007874,
"learning_rate": 1.2470233104142211e-05,
"loss": 0.0064,
"step": 22500
},
{
"epoch": 3.778485254691689,
"grad_norm": 0.3370627760887146,
"learning_rate": 1.2453463021968808e-05,
"loss": 0.0054,
"step": 22550
},
{
"epoch": 3.78686327077748,
"grad_norm": 0.17612957954406738,
"learning_rate": 1.2436692939795405e-05,
"loss": 0.0071,
"step": 22600
},
{
"epoch": 3.795241286863271,
"grad_norm": 0.33844587206840515,
"learning_rate": 1.2419922857622004e-05,
"loss": 0.0059,
"step": 22650
},
{
"epoch": 3.8036193029490617,
"grad_norm": 0.22707916796207428,
"learning_rate": 1.2403152775448601e-05,
"loss": 0.0062,
"step": 22700
},
{
"epoch": 3.8119973190348526,
"grad_norm": 0.2754456102848053,
"learning_rate": 1.2386382693275198e-05,
"loss": 0.0067,
"step": 22750
},
{
"epoch": 3.8203753351206435,
"grad_norm": 0.411072313785553,
"learning_rate": 1.2369612611101795e-05,
"loss": 0.0066,
"step": 22800
},
{
"epoch": 3.8287533512064345,
"grad_norm": 0.3319416344165802,
"learning_rate": 1.2352842528928392e-05,
"loss": 0.0061,
"step": 22850
},
{
"epoch": 3.8371313672922254,
"grad_norm": 0.20878171920776367,
"learning_rate": 1.2336072446754989e-05,
"loss": 0.0059,
"step": 22900
},
{
"epoch": 3.8455093833780163,
"grad_norm": 0.1912664771080017,
"learning_rate": 1.2319302364581586e-05,
"loss": 0.0059,
"step": 22950
},
{
"epoch": 3.853887399463807,
"grad_norm": 0.3744626045227051,
"learning_rate": 1.2302532282408185e-05,
"loss": 0.006,
"step": 23000
},
{
"epoch": 3.862265415549598,
"grad_norm": 0.37646523118019104,
"learning_rate": 1.2285762200234783e-05,
"loss": 0.0056,
"step": 23050
},
{
"epoch": 3.8706434316353886,
"grad_norm": 0.17005406320095062,
"learning_rate": 1.226899211806138e-05,
"loss": 0.0062,
"step": 23100
},
{
"epoch": 3.8790214477211795,
"grad_norm": 0.35684868693351746,
"learning_rate": 1.2252222035887977e-05,
"loss": 0.0069,
"step": 23150
},
{
"epoch": 3.8873994638069704,
"grad_norm": 0.37645256519317627,
"learning_rate": 1.2235451953714574e-05,
"loss": 0.0071,
"step": 23200
},
{
"epoch": 3.8957774798927614,
"grad_norm": 0.5175814032554626,
"learning_rate": 1.2218681871541171e-05,
"loss": 0.0059,
"step": 23250
},
{
"epoch": 3.9041554959785523,
"grad_norm": 0.3317829966545105,
"learning_rate": 1.2201911789367768e-05,
"loss": 0.0064,
"step": 23300
},
{
"epoch": 3.912533512064343,
"grad_norm": 0.2565181255340576,
"learning_rate": 1.2185141707194365e-05,
"loss": 0.0069,
"step": 23350
},
{
"epoch": 3.920911528150134,
"grad_norm": 0.38244709372520447,
"learning_rate": 1.2168371625020963e-05,
"loss": 0.0068,
"step": 23400
},
{
"epoch": 3.929289544235925,
"grad_norm": 0.4136451184749603,
"learning_rate": 1.2151601542847561e-05,
"loss": 0.0067,
"step": 23450
},
{
"epoch": 3.937667560321716,
"grad_norm": 0.2662147581577301,
"learning_rate": 1.213483146067416e-05,
"loss": 0.0061,
"step": 23500
},
{
"epoch": 3.946045576407507,
"grad_norm": 0.291955828666687,
"learning_rate": 1.2118061378500757e-05,
"loss": 0.0058,
"step": 23550
},
{
"epoch": 3.9544235924932973,
"grad_norm": 0.30662792921066284,
"learning_rate": 1.2101291296327354e-05,
"loss": 0.0071,
"step": 23600
},
{
"epoch": 3.9628016085790883,
"grad_norm": 0.5435032844543457,
"learning_rate": 1.2084521214153951e-05,
"loss": 0.0071,
"step": 23650
},
{
"epoch": 3.971179624664879,
"grad_norm": 0.2924433648586273,
"learning_rate": 1.2067751131980548e-05,
"loss": 0.0074,
"step": 23700
},
{
"epoch": 3.97955764075067,
"grad_norm": 0.47101885080337524,
"learning_rate": 1.2050981049807145e-05,
"loss": 0.0065,
"step": 23750
},
{
"epoch": 3.987935656836461,
"grad_norm": 0.39184531569480896,
"learning_rate": 1.2034210967633742e-05,
"loss": 0.0065,
"step": 23800
},
{
"epoch": 3.996313672922252,
"grad_norm": 0.27226710319519043,
"learning_rate": 1.201744088546034e-05,
"loss": 0.0058,
"step": 23850
},
{
"epoch": 4.004691689008043,
"grad_norm": 0.37524715065956116,
"learning_rate": 1.2000670803286938e-05,
"loss": 0.0045,
"step": 23900
},
{
"epoch": 4.013069705093834,
"grad_norm": 0.0983668640255928,
"learning_rate": 1.1983900721113535e-05,
"loss": 0.0023,
"step": 23950
},
{
"epoch": 4.021447721179625,
"grad_norm": 0.32168978452682495,
"learning_rate": 1.1967130638940132e-05,
"loss": 0.0024,
"step": 24000
},
{
"epoch": 4.029825737265416,
"grad_norm": 0.2205764651298523,
"learning_rate": 1.1950360556766729e-05,
"loss": 0.0022,
"step": 24050
},
{
"epoch": 4.0382037533512065,
"grad_norm": 0.21505975723266602,
"learning_rate": 1.1933590474593326e-05,
"loss": 0.0023,
"step": 24100
},
{
"epoch": 4.046581769436997,
"grad_norm": 0.0701180100440979,
"learning_rate": 1.1916820392419923e-05,
"loss": 0.0019,
"step": 24150
},
{
"epoch": 4.054959785522788,
"grad_norm": 0.3256973624229431,
"learning_rate": 1.190005031024652e-05,
"loss": 0.0026,
"step": 24200
},
{
"epoch": 4.063337801608579,
"grad_norm": 0.308699369430542,
"learning_rate": 1.1883280228073117e-05,
"loss": 0.0024,
"step": 24250
},
{
"epoch": 4.07171581769437,
"grad_norm": 0.29565149545669556,
"learning_rate": 1.1866510145899717e-05,
"loss": 0.0026,
"step": 24300
},
{
"epoch": 4.080093833780161,
"grad_norm": 0.23600581288337708,
"learning_rate": 1.1849740063726314e-05,
"loss": 0.0021,
"step": 24350
},
{
"epoch": 4.088471849865952,
"grad_norm": 0.188632994890213,
"learning_rate": 1.1832969981552911e-05,
"loss": 0.0025,
"step": 24400
},
{
"epoch": 4.096849865951743,
"grad_norm": 0.35330700874328613,
"learning_rate": 1.1816199899379508e-05,
"loss": 0.0025,
"step": 24450
},
{
"epoch": 4.105227882037534,
"grad_norm": 0.14944002032279968,
"learning_rate": 1.1799429817206105e-05,
"loss": 0.0026,
"step": 24500
},
{
"epoch": 4.113605898123325,
"grad_norm": 0.23015423119068146,
"learning_rate": 1.1782659735032702e-05,
"loss": 0.0023,
"step": 24550
},
{
"epoch": 4.121983914209116,
"grad_norm": 0.43203842639923096,
"learning_rate": 1.17658896528593e-05,
"loss": 0.0024,
"step": 24600
},
{
"epoch": 4.130361930294907,
"grad_norm": 0.16286316514015198,
"learning_rate": 1.1749119570685896e-05,
"loss": 0.0024,
"step": 24650
},
{
"epoch": 4.138739946380697,
"grad_norm": 0.44359683990478516,
"learning_rate": 1.1732349488512495e-05,
"loss": 0.0024,
"step": 24700
},
{
"epoch": 4.147117962466488,
"grad_norm": 0.07397326827049255,
"learning_rate": 1.1715579406339092e-05,
"loss": 0.0027,
"step": 24750
},
{
"epoch": 4.1554959785522785,
"grad_norm": 0.07963547110557556,
"learning_rate": 1.1698809324165689e-05,
"loss": 0.0026,
"step": 24800
},
{
"epoch": 4.163873994638069,
"grad_norm": 0.27886438369750977,
"learning_rate": 1.1682039241992286e-05,
"loss": 0.0026,
"step": 24850
},
{
"epoch": 4.17225201072386,
"grad_norm": 0.11975943297147751,
"learning_rate": 1.1665269159818883e-05,
"loss": 0.0025,
"step": 24900
},
{
"epoch": 4.180630026809651,
"grad_norm": 0.1815500408411026,
"learning_rate": 1.1648499077645482e-05,
"loss": 0.0024,
"step": 24950
},
{
"epoch": 4.189008042895442,
"grad_norm": 0.7938502430915833,
"learning_rate": 1.1631728995472079e-05,
"loss": 0.0024,
"step": 25000
},
{
"epoch": 4.197386058981233,
"grad_norm": 0.32321181893348694,
"learning_rate": 1.1614958913298676e-05,
"loss": 0.0026,
"step": 25050
},
{
"epoch": 4.205764075067024,
"grad_norm": 0.09507790207862854,
"learning_rate": 1.1598188831125274e-05,
"loss": 0.0023,
"step": 25100
},
{
"epoch": 4.214142091152815,
"grad_norm": 0.31748858094215393,
"learning_rate": 1.1581418748951872e-05,
"loss": 0.0025,
"step": 25150
},
{
"epoch": 4.222520107238606,
"grad_norm": 0.39395052194595337,
"learning_rate": 1.1564648666778469e-05,
"loss": 0.0027,
"step": 25200
},
{
"epoch": 4.230898123324397,
"grad_norm": 0.12810911238193512,
"learning_rate": 1.1547878584605066e-05,
"loss": 0.0023,
"step": 25250
},
{
"epoch": 4.239276139410188,
"grad_norm": 0.18891964852809906,
"learning_rate": 1.1531108502431663e-05,
"loss": 0.0024,
"step": 25300
},
{
"epoch": 4.247654155495979,
"grad_norm": 0.16266460716724396,
"learning_rate": 1.151433842025826e-05,
"loss": 0.0032,
"step": 25350
},
{
"epoch": 4.2560321715817695,
"grad_norm": 0.3381274342536926,
"learning_rate": 1.1497568338084857e-05,
"loss": 0.0027,
"step": 25400
},
{
"epoch": 4.26441018766756,
"grad_norm": 0.7718698978424072,
"learning_rate": 1.1480798255911454e-05,
"loss": 0.0026,
"step": 25450
},
{
"epoch": 4.272788203753351,
"grad_norm": 0.7734161615371704,
"learning_rate": 1.1464028173738054e-05,
"loss": 0.0029,
"step": 25500
},
{
"epoch": 4.281166219839142,
"grad_norm": 0.24889783561229706,
"learning_rate": 1.1447258091564651e-05,
"loss": 0.0027,
"step": 25550
},
{
"epoch": 4.289544235924933,
"grad_norm": 0.1416139155626297,
"learning_rate": 1.1430488009391248e-05,
"loss": 0.0023,
"step": 25600
},
{
"epoch": 4.297922252010724,
"grad_norm": 0.2628386318683624,
"learning_rate": 1.1413717927217845e-05,
"loss": 0.0026,
"step": 25650
},
{
"epoch": 4.306300268096515,
"grad_norm": 0.1891651153564453,
"learning_rate": 1.1396947845044442e-05,
"loss": 0.0021,
"step": 25700
},
{
"epoch": 4.314678284182306,
"grad_norm": 0.33927446603775024,
"learning_rate": 1.1380177762871039e-05,
"loss": 0.0023,
"step": 25750
},
{
"epoch": 4.323056300268097,
"grad_norm": 0.2871659994125366,
"learning_rate": 1.1363407680697636e-05,
"loss": 0.0025,
"step": 25800
},
{
"epoch": 4.331434316353888,
"grad_norm": 0.16000057756900787,
"learning_rate": 1.1346637598524233e-05,
"loss": 0.0028,
"step": 25850
},
{
"epoch": 4.339812332439679,
"grad_norm": 0.2464749813079834,
"learning_rate": 1.1329867516350832e-05,
"loss": 0.0027,
"step": 25900
},
{
"epoch": 4.348190348525469,
"grad_norm": 0.22312916815280914,
"learning_rate": 1.1313097434177429e-05,
"loss": 0.0021,
"step": 25950
},
{
"epoch": 4.35656836461126,
"grad_norm": 0.04916452243924141,
"learning_rate": 1.1296327352004026e-05,
"loss": 0.0028,
"step": 26000
},
{
"epoch": 4.3649463806970505,
"grad_norm": 0.09145969897508621,
"learning_rate": 1.1279557269830623e-05,
"loss": 0.0024,
"step": 26050
},
{
"epoch": 4.3733243967828415,
"grad_norm": 0.15269909799098969,
"learning_rate": 1.126278718765722e-05,
"loss": 0.0027,
"step": 26100
},
{
"epoch": 4.381702412868632,
"grad_norm": 0.14302955567836761,
"learning_rate": 1.1246017105483817e-05,
"loss": 0.0026,
"step": 26150
},
{
"epoch": 4.390080428954423,
"grad_norm": 0.38677042722702026,
"learning_rate": 1.1229247023310414e-05,
"loss": 0.0025,
"step": 26200
},
{
"epoch": 4.398458445040214,
"grad_norm": 0.18998374044895172,
"learning_rate": 1.1212476941137011e-05,
"loss": 0.0026,
"step": 26250
},
{
"epoch": 4.406836461126005,
"grad_norm": 0.07754815369844437,
"learning_rate": 1.1195706858963611e-05,
"loss": 0.0031,
"step": 26300
},
{
"epoch": 4.415214477211796,
"grad_norm": 0.31846073269844055,
"learning_rate": 1.1178936776790208e-05,
"loss": 0.0024,
"step": 26350
},
{
"epoch": 4.423592493297587,
"grad_norm": 0.3100847601890564,
"learning_rate": 1.1162166694616805e-05,
"loss": 0.0029,
"step": 26400
},
{
"epoch": 4.431970509383378,
"grad_norm": 0.08093760907649994,
"learning_rate": 1.1145396612443402e-05,
"loss": 0.0023,
"step": 26450
},
{
"epoch": 4.440348525469169,
"grad_norm": 0.12065700441598892,
"learning_rate": 1.112862653027e-05,
"loss": 0.0024,
"step": 26500
},
{
"epoch": 4.44872654155496,
"grad_norm": 0.18668776750564575,
"learning_rate": 1.1111856448096596e-05,
"loss": 0.0021,
"step": 26550
},
{
"epoch": 4.457104557640751,
"grad_norm": 0.06442166119813919,
"learning_rate": 1.1095086365923193e-05,
"loss": 0.0022,
"step": 26600
},
{
"epoch": 4.465482573726542,
"grad_norm": 0.3723543584346771,
"learning_rate": 1.107831628374979e-05,
"loss": 0.0026,
"step": 26650
},
{
"epoch": 4.4738605898123325,
"grad_norm": 0.17430204153060913,
"learning_rate": 1.1061546201576389e-05,
"loss": 0.0025,
"step": 26700
},
{
"epoch": 4.482238605898123,
"grad_norm": 0.24499832093715668,
"learning_rate": 1.1044776119402986e-05,
"loss": 0.0026,
"step": 26750
},
{
"epoch": 4.490616621983914,
"grad_norm": 0.12160493433475494,
"learning_rate": 1.1028006037229583e-05,
"loss": 0.0027,
"step": 26800
},
{
"epoch": 4.498994638069705,
"grad_norm": 0.13980576395988464,
"learning_rate": 1.101123595505618e-05,
"loss": 0.0025,
"step": 26850
},
{
"epoch": 4.507372654155496,
"grad_norm": 0.9759100079536438,
"learning_rate": 1.0994465872882779e-05,
"loss": 0.0027,
"step": 26900
},
{
"epoch": 4.515750670241287,
"grad_norm": 0.26894333958625793,
"learning_rate": 1.0977695790709376e-05,
"loss": 0.0025,
"step": 26950
},
{
"epoch": 4.524128686327078,
"grad_norm": 0.17025631666183472,
"learning_rate": 1.0960925708535973e-05,
"loss": 0.0026,
"step": 27000
},
{
"epoch": 4.532506702412869,
"grad_norm": 0.38619130849838257,
"learning_rate": 1.094415562636257e-05,
"loss": 0.0027,
"step": 27050
},
{
"epoch": 4.54088471849866,
"grad_norm": 0.2916272282600403,
"learning_rate": 1.0927385544189169e-05,
"loss": 0.003,
"step": 27100
},
{
"epoch": 4.549262734584451,
"grad_norm": 0.26764917373657227,
"learning_rate": 1.0910615462015766e-05,
"loss": 0.0024,
"step": 27150
},
{
"epoch": 4.557640750670242,
"grad_norm": 0.1611230969429016,
"learning_rate": 1.0893845379842363e-05,
"loss": 0.003,
"step": 27200
},
{
"epoch": 4.566018766756033,
"grad_norm": 0.23360604047775269,
"learning_rate": 1.087707529766896e-05,
"loss": 0.003,
"step": 27250
},
{
"epoch": 4.5743967828418235,
"grad_norm": 0.19292519986629486,
"learning_rate": 1.0860305215495557e-05,
"loss": 0.0026,
"step": 27300
},
{
"epoch": 4.582774798927614,
"grad_norm": 0.4826861023902893,
"learning_rate": 1.0843535133322154e-05,
"loss": 0.0029,
"step": 27350
},
{
"epoch": 4.591152815013404,
"grad_norm": 0.11598275601863861,
"learning_rate": 1.082676505114875e-05,
"loss": 0.003,
"step": 27400
},
{
"epoch": 4.599530831099195,
"grad_norm": 0.2721264958381653,
"learning_rate": 1.0809994968975348e-05,
"loss": 0.0032,
"step": 27450
},
{
"epoch": 4.607908847184986,
"grad_norm": 0.1644926518201828,
"learning_rate": 1.0793224886801945e-05,
"loss": 0.003,
"step": 27500
},
{
"epoch": 4.616286863270777,
"grad_norm": 0.17666374146938324,
"learning_rate": 1.0776454804628545e-05,
"loss": 0.0028,
"step": 27550
},
{
"epoch": 4.624664879356568,
"grad_norm": 0.21726448833942413,
"learning_rate": 1.0759684722455142e-05,
"loss": 0.0025,
"step": 27600
},
{
"epoch": 4.633042895442359,
"grad_norm": 0.21972903609275818,
"learning_rate": 1.0742914640281739e-05,
"loss": 0.0028,
"step": 27650
},
{
"epoch": 4.64142091152815,
"grad_norm": 0.2247893065214157,
"learning_rate": 1.0726144558108336e-05,
"loss": 0.0026,
"step": 27700
},
{
"epoch": 4.649798927613941,
"grad_norm": 0.1289321780204773,
"learning_rate": 1.0709374475934933e-05,
"loss": 0.0032,
"step": 27750
},
{
"epoch": 4.658176943699732,
"grad_norm": 0.4954499304294586,
"learning_rate": 1.069260439376153e-05,
"loss": 0.0028,
"step": 27800
},
{
"epoch": 4.666554959785523,
"grad_norm": 0.18240614235401154,
"learning_rate": 1.0675834311588127e-05,
"loss": 0.0027,
"step": 27850
},
{
"epoch": 4.674932975871314,
"grad_norm": 0.1933482438325882,
"learning_rate": 1.0659064229414724e-05,
"loss": 0.0031,
"step": 27900
},
{
"epoch": 4.6833109919571045,
"grad_norm": 0.11678989976644516,
"learning_rate": 1.0642294147241323e-05,
"loss": 0.0032,
"step": 27950
},
{
"epoch": 4.6916890080428955,
"grad_norm": 0.18577493727207184,
"learning_rate": 1.062552406506792e-05,
"loss": 0.0026,
"step": 28000
},
{
"epoch": 4.700067024128686,
"grad_norm": 0.07589305937290192,
"learning_rate": 1.0608753982894517e-05,
"loss": 0.003,
"step": 28050
},
{
"epoch": 4.708445040214477,
"grad_norm": 0.20295588672161102,
"learning_rate": 1.0591983900721114e-05,
"loss": 0.0028,
"step": 28100
},
{
"epoch": 4.716823056300268,
"grad_norm": 0.27186664938926697,
"learning_rate": 1.0575213818547711e-05,
"loss": 0.0033,
"step": 28150
},
{
"epoch": 4.725201072386059,
"grad_norm": 0.3505285382270813,
"learning_rate": 1.0558443736374308e-05,
"loss": 0.0029,
"step": 28200
},
{
"epoch": 4.73357908847185,
"grad_norm": 0.4643058776855469,
"learning_rate": 1.0541673654200905e-05,
"loss": 0.0033,
"step": 28250
},
{
"epoch": 4.741957104557641,
"grad_norm": 0.29970914125442505,
"learning_rate": 1.0524903572027502e-05,
"loss": 0.0029,
"step": 28300
},
{
"epoch": 4.750335120643432,
"grad_norm": 0.3563650846481323,
"learning_rate": 1.0508133489854102e-05,
"loss": 0.0026,
"step": 28350
},
{
"epoch": 4.758713136729223,
"grad_norm": 0.2816406190395355,
"learning_rate": 1.04913634076807e-05,
"loss": 0.0033,
"step": 28400
},
{
"epoch": 4.767091152815014,
"grad_norm": 0.3998458981513977,
"learning_rate": 1.0474593325507296e-05,
"loss": 0.0033,
"step": 28450
},
{
"epoch": 4.775469168900805,
"grad_norm": 0.29152771830558777,
"learning_rate": 1.0457823243333893e-05,
"loss": 0.0031,
"step": 28500
},
{
"epoch": 4.783847184986596,
"grad_norm": 0.3733079731464386,
"learning_rate": 1.044105316116049e-05,
"loss": 0.0031,
"step": 28550
},
{
"epoch": 4.792225201072386,
"grad_norm": 0.2442307472229004,
"learning_rate": 1.0424283078987087e-05,
"loss": 0.0032,
"step": 28600
},
{
"epoch": 4.8006032171581765,
"grad_norm": 0.6178602576255798,
"learning_rate": 1.0407512996813684e-05,
"loss": 0.0029,
"step": 28650
},
{
"epoch": 4.808981233243967,
"grad_norm": 0.3169240951538086,
"learning_rate": 1.0390742914640281e-05,
"loss": 0.0029,
"step": 28700
},
{
"epoch": 4.817359249329758,
"grad_norm": 0.21497473120689392,
"learning_rate": 1.037397283246688e-05,
"loss": 0.0032,
"step": 28750
},
{
"epoch": 4.825737265415549,
"grad_norm": 0.4647163450717926,
"learning_rate": 1.0357202750293477e-05,
"loss": 0.0027,
"step": 28800
},
{
"epoch": 4.83411528150134,
"grad_norm": 0.18522508442401886,
"learning_rate": 1.0340432668120076e-05,
"loss": 0.0026,
"step": 28850
},
{
"epoch": 4.842493297587131,
"grad_norm": 0.201819509267807,
"learning_rate": 1.0323662585946673e-05,
"loss": 0.0025,
"step": 28900
},
{
"epoch": 4.850871313672922,
"grad_norm": 0.2343200445175171,
"learning_rate": 1.030689250377327e-05,
"loss": 0.0027,
"step": 28950
},
{
"epoch": 4.859249329758713,
"grad_norm": 0.164067804813385,
"learning_rate": 1.0290122421599867e-05,
"loss": 0.0032,
"step": 29000
},
{
"epoch": 4.867627345844504,
"grad_norm": 1.0820327997207642,
"learning_rate": 1.0273352339426464e-05,
"loss": 0.0033,
"step": 29050
},
{
"epoch": 4.876005361930295,
"grad_norm": 0.18911249935626984,
"learning_rate": 1.0256582257253061e-05,
"loss": 0.0024,
"step": 29100
},
{
"epoch": 4.884383378016086,
"grad_norm": 0.1403694599866867,
"learning_rate": 1.023981217507966e-05,
"loss": 0.0028,
"step": 29150
},
{
"epoch": 4.892761394101877,
"grad_norm": 0.20968593657016754,
"learning_rate": 1.0223042092906257e-05,
"loss": 0.0034,
"step": 29200
},
{
"epoch": 4.9011394101876675,
"grad_norm": 0.26832762360572815,
"learning_rate": 1.0206272010732854e-05,
"loss": 0.0035,
"step": 29250
},
{
"epoch": 4.909517426273458,
"grad_norm": 0.26747608184814453,
"learning_rate": 1.018950192855945e-05,
"loss": 0.0034,
"step": 29300
},
{
"epoch": 4.917895442359249,
"grad_norm": 0.999813437461853,
"learning_rate": 1.0172731846386048e-05,
"loss": 0.0029,
"step": 29350
},
{
"epoch": 4.92627345844504,
"grad_norm": 0.33220162987709045,
"learning_rate": 1.0155961764212645e-05,
"loss": 0.0033,
"step": 29400
},
{
"epoch": 4.934651474530831,
"grad_norm": 0.13821391761302948,
"learning_rate": 1.0139191682039242e-05,
"loss": 0.003,
"step": 29450
},
{
"epoch": 4.943029490616622,
"grad_norm": 0.33970770239830017,
"learning_rate": 1.0122421599865839e-05,
"loss": 0.0028,
"step": 29500
},
{
"epoch": 4.951407506702413,
"grad_norm": 0.09418370574712753,
"learning_rate": 1.0105651517692439e-05,
"loss": 0.0029,
"step": 29550
},
{
"epoch": 4.959785522788204,
"grad_norm": 0.10416509956121445,
"learning_rate": 1.0088881435519036e-05,
"loss": 0.0031,
"step": 29600
},
{
"epoch": 4.968163538873995,
"grad_norm": 0.7082052230834961,
"learning_rate": 1.0072111353345633e-05,
"loss": 0.0025,
"step": 29650
},
{
"epoch": 4.976541554959786,
"grad_norm": 0.3095639944076538,
"learning_rate": 1.005534127117223e-05,
"loss": 0.0035,
"step": 29700
},
{
"epoch": 4.984919571045577,
"grad_norm": 0.119889035820961,
"learning_rate": 1.0038571188998827e-05,
"loss": 0.0033,
"step": 29750
},
{
"epoch": 4.993297587131368,
"grad_norm": 0.29492849111557007,
"learning_rate": 1.0021801106825424e-05,
"loss": 0.0029,
"step": 29800
},
{
"epoch": 5.0016756032171585,
"grad_norm": 0.0865137055516243,
"learning_rate": 1.0005031024652021e-05,
"loss": 0.0028,
"step": 29850
},
{
"epoch": 5.0100536193029495,
"grad_norm": 0.22691671550273895,
"learning_rate": 9.98826094247862e-06,
"loss": 0.0011,
"step": 29900
},
{
"epoch": 5.01843163538874,
"grad_norm": 0.0516495518386364,
"learning_rate": 9.971490860305217e-06,
"loss": 0.0012,
"step": 29950
},
{
"epoch": 5.02680965147453,
"grad_norm": 0.08190739154815674,
"learning_rate": 9.954720778131814e-06,
"loss": 0.0011,
"step": 30000
},
{
"epoch": 5.02680965147453,
"eval_loss": 0.05592558532953262,
"eval_runtime": 0.3145,
"eval_samples_per_second": 63.586,
"eval_steps_per_second": 3.179,
"step": 30000
},
{
"epoch": 5.035187667560321,
"grad_norm": 0.025882409885525703,
"learning_rate": 9.937950695958411e-06,
"loss": 0.0008,
"step": 30050
},
{
"epoch": 5.043565683646112,
"grad_norm": 0.12556754052639008,
"learning_rate": 9.921180613785008e-06,
"loss": 0.001,
"step": 30100
},
{
"epoch": 5.051943699731903,
"grad_norm": 0.09527916461229324,
"learning_rate": 9.904410531611605e-06,
"loss": 0.0011,
"step": 30150
},
{
"epoch": 5.060321715817694,
"grad_norm": 0.30216673016548157,
"learning_rate": 9.887640449438202e-06,
"loss": 0.0013,
"step": 30200
},
{
"epoch": 5.068699731903485,
"grad_norm": 0.030431820079684258,
"learning_rate": 9.8708703672648e-06,
"loss": 0.0011,
"step": 30250
},
{
"epoch": 5.077077747989276,
"grad_norm": 0.08387458324432373,
"learning_rate": 9.854100285091398e-06,
"loss": 0.0008,
"step": 30300
},
{
"epoch": 5.085455764075067,
"grad_norm": 0.12422385066747665,
"learning_rate": 9.837330202917995e-06,
"loss": 0.001,
"step": 30350
},
{
"epoch": 5.093833780160858,
"grad_norm": 0.05497809499502182,
"learning_rate": 9.820560120744592e-06,
"loss": 0.001,
"step": 30400
},
{
"epoch": 5.102211796246649,
"grad_norm": 0.21757960319519043,
"learning_rate": 9.803790038571189e-06,
"loss": 0.0011,
"step": 30450
},
{
"epoch": 5.11058981233244,
"grad_norm": 0.2624013423919678,
"learning_rate": 9.787019956397787e-06,
"loss": 0.0012,
"step": 30500
},
{
"epoch": 5.1189678284182305,
"grad_norm": 0.12058177590370178,
"learning_rate": 9.770249874224384e-06,
"loss": 0.0011,
"step": 30550
},
{
"epoch": 5.127345844504021,
"grad_norm": 0.09058215469121933,
"learning_rate": 9.753479792050981e-06,
"loss": 0.001,
"step": 30600
},
{
"epoch": 5.135723860589812,
"grad_norm": 0.07257585972547531,
"learning_rate": 9.736709709877578e-06,
"loss": 0.0011,
"step": 30650
},
{
"epoch": 5.144101876675603,
"grad_norm": 0.07394664734601974,
"learning_rate": 9.719939627704177e-06,
"loss": 0.0014,
"step": 30700
},
{
"epoch": 5.152479892761394,
"grad_norm": 0.08835545182228088,
"learning_rate": 9.703169545530774e-06,
"loss": 0.0011,
"step": 30750
},
{
"epoch": 5.160857908847185,
"grad_norm": 0.05140378698706627,
"learning_rate": 9.686399463357371e-06,
"loss": 0.0011,
"step": 30800
},
{
"epoch": 5.169235924932976,
"grad_norm": 0.11998426169157028,
"learning_rate": 9.669629381183968e-06,
"loss": 0.0013,
"step": 30850
},
{
"epoch": 5.177613941018767,
"grad_norm": 0.11740259826183319,
"learning_rate": 9.652859299010567e-06,
"loss": 0.0011,
"step": 30900
},
{
"epoch": 5.185991957104558,
"grad_norm": 0.3295953869819641,
"learning_rate": 9.636089216837164e-06,
"loss": 0.0011,
"step": 30950
},
{
"epoch": 5.194369973190349,
"grad_norm": 0.21252810955047607,
"learning_rate": 9.619319134663761e-06,
"loss": 0.0012,
"step": 31000
},
{
"epoch": 5.20274798927614,
"grad_norm": 0.1684710681438446,
"learning_rate": 9.602549052490358e-06,
"loss": 0.001,
"step": 31050
},
{
"epoch": 5.211126005361931,
"grad_norm": 0.30938273668289185,
"learning_rate": 9.585778970316955e-06,
"loss": 0.0013,
"step": 31100
},
{
"epoch": 5.2195040214477215,
"grad_norm": 0.13435423374176025,
"learning_rate": 9.569008888143552e-06,
"loss": 0.0012,
"step": 31150
},
{
"epoch": 5.227882037533512,
"grad_norm": 0.24395543336868286,
"learning_rate": 9.552238805970149e-06,
"loss": 0.001,
"step": 31200
},
{
"epoch": 5.236260053619303,
"grad_norm": 0.07691800594329834,
"learning_rate": 9.535468723796748e-06,
"loss": 0.0011,
"step": 31250
},
{
"epoch": 5.244638069705093,
"grad_norm": 0.07506980746984482,
"learning_rate": 9.518698641623345e-06,
"loss": 0.001,
"step": 31300
},
{
"epoch": 5.253016085790884,
"grad_norm": 0.09802160412073135,
"learning_rate": 9.501928559449942e-06,
"loss": 0.0009,
"step": 31350
},
{
"epoch": 5.261394101876675,
"grad_norm": 0.08386828005313873,
"learning_rate": 9.485158477276539e-06,
"loss": 0.0011,
"step": 31400
},
{
"epoch": 5.269772117962466,
"grad_norm": 0.23838509619235992,
"learning_rate": 9.468388395103136e-06,
"loss": 0.0011,
"step": 31450
},
{
"epoch": 5.278150134048257,
"grad_norm": 0.06687796860933304,
"learning_rate": 9.451618312929734e-06,
"loss": 0.001,
"step": 31500
},
{
"epoch": 5.286528150134048,
"grad_norm": 0.06589027494192123,
"learning_rate": 9.434848230756332e-06,
"loss": 0.0011,
"step": 31550
},
{
"epoch": 5.294906166219839,
"grad_norm": 0.21493591368198395,
"learning_rate": 9.418078148582929e-06,
"loss": 0.0012,
"step": 31600
},
{
"epoch": 5.30328418230563,
"grad_norm": 0.20591777563095093,
"learning_rate": 9.401308066409526e-06,
"loss": 0.0012,
"step": 31650
},
{
"epoch": 5.311662198391421,
"grad_norm": 0.16145972907543182,
"learning_rate": 9.384537984236124e-06,
"loss": 0.0012,
"step": 31700
},
{
"epoch": 5.320040214477212,
"grad_norm": 0.07270830124616623,
"learning_rate": 9.367767902062721e-06,
"loss": 0.001,
"step": 31750
},
{
"epoch": 5.328418230563003,
"grad_norm": 0.12921959161758423,
"learning_rate": 9.350997819889318e-06,
"loss": 0.0012,
"step": 31800
},
{
"epoch": 5.3367962466487935,
"grad_norm": 0.24689610302448273,
"learning_rate": 9.334227737715915e-06,
"loss": 0.0011,
"step": 31850
},
{
"epoch": 5.345174262734584,
"grad_norm": 0.14935541152954102,
"learning_rate": 9.317457655542514e-06,
"loss": 0.001,
"step": 31900
},
{
"epoch": 5.353552278820375,
"grad_norm": 0.05026477575302124,
"learning_rate": 9.300687573369111e-06,
"loss": 0.0009,
"step": 31950
},
{
"epoch": 5.361930294906166,
"grad_norm": 0.11298377066850662,
"learning_rate": 9.283917491195708e-06,
"loss": 0.0012,
"step": 32000
},
{
"epoch": 5.370308310991957,
"grad_norm": 0.04898526519536972,
"learning_rate": 9.267147409022305e-06,
"loss": 0.0015,
"step": 32050
},
{
"epoch": 5.378686327077748,
"grad_norm": 0.07678736001253128,
"learning_rate": 9.250377326848902e-06,
"loss": 0.0013,
"step": 32100
},
{
"epoch": 5.387064343163539,
"grad_norm": 0.08714163303375244,
"learning_rate": 9.233607244675499e-06,
"loss": 0.001,
"step": 32150
},
{
"epoch": 5.39544235924933,
"grad_norm": 0.14269877970218658,
"learning_rate": 9.216837162502098e-06,
"loss": 0.001,
"step": 32200
},
{
"epoch": 5.403820375335121,
"grad_norm": 0.0840928927063942,
"learning_rate": 9.200067080328695e-06,
"loss": 0.0012,
"step": 32250
},
{
"epoch": 5.412198391420912,
"grad_norm": 0.049975261092185974,
"learning_rate": 9.183296998155292e-06,
"loss": 0.0013,
"step": 32300
},
{
"epoch": 5.420576407506703,
"grad_norm": 0.10942261666059494,
"learning_rate": 9.166526915981889e-06,
"loss": 0.0013,
"step": 32350
},
{
"epoch": 5.428954423592494,
"grad_norm": 0.09510983526706696,
"learning_rate": 9.149756833808486e-06,
"loss": 0.0013,
"step": 32400
},
{
"epoch": 5.4373324396782845,
"grad_norm": 0.07059191167354584,
"learning_rate": 9.132986751635083e-06,
"loss": 0.0013,
"step": 32450
},
{
"epoch": 5.445710455764075,
"grad_norm": 0.09271156042814255,
"learning_rate": 9.116216669461682e-06,
"loss": 0.0014,
"step": 32500
},
{
"epoch": 5.454088471849866,
"grad_norm": 0.5445387363433838,
"learning_rate": 9.099446587288279e-06,
"loss": 0.0017,
"step": 32550
},
{
"epoch": 5.462466487935657,
"grad_norm": 0.5922443270683289,
"learning_rate": 9.082676505114876e-06,
"loss": 0.0012,
"step": 32600
},
{
"epoch": 5.470844504021448,
"grad_norm": 0.08508787304162979,
"learning_rate": 9.065906422941473e-06,
"loss": 0.0013,
"step": 32650
},
{
"epoch": 5.479222520107238,
"grad_norm": 0.10297244787216187,
"learning_rate": 9.049136340768071e-06,
"loss": 0.0015,
"step": 32700
},
{
"epoch": 5.487600536193029,
"grad_norm": 0.20003701746463776,
"learning_rate": 9.032366258594668e-06,
"loss": 0.001,
"step": 32750
},
{
"epoch": 5.49597855227882,
"grad_norm": 0.33047032356262207,
"learning_rate": 9.015596176421265e-06,
"loss": 0.0011,
"step": 32800
},
{
"epoch": 5.504356568364611,
"grad_norm": 0.12265091389417648,
"learning_rate": 8.998826094247862e-06,
"loss": 0.0012,
"step": 32850
},
{
"epoch": 5.512734584450402,
"grad_norm": 0.1573624163866043,
"learning_rate": 8.982056012074461e-06,
"loss": 0.0013,
"step": 32900
},
{
"epoch": 5.521112600536193,
"grad_norm": 0.13570530712604523,
"learning_rate": 8.965285929901058e-06,
"loss": 0.0012,
"step": 32950
},
{
"epoch": 5.529490616621984,
"grad_norm": 0.1362573504447937,
"learning_rate": 8.948515847727655e-06,
"loss": 0.0015,
"step": 33000
},
{
"epoch": 5.537868632707775,
"grad_norm": 0.16211983561515808,
"learning_rate": 8.931745765554252e-06,
"loss": 0.0013,
"step": 33050
},
{
"epoch": 5.5462466487935655,
"grad_norm": 0.1816491037607193,
"learning_rate": 8.914975683380849e-06,
"loss": 0.0013,
"step": 33100
},
{
"epoch": 5.5546246648793565,
"grad_norm": 0.12222578376531601,
"learning_rate": 8.898205601207446e-06,
"loss": 0.0012,
"step": 33150
},
{
"epoch": 5.563002680965147,
"grad_norm": 0.24587097764015198,
"learning_rate": 8.881435519034045e-06,
"loss": 0.0014,
"step": 33200
},
{
"epoch": 5.571380697050938,
"grad_norm": 0.13261163234710693,
"learning_rate": 8.864665436860642e-06,
"loss": 0.0012,
"step": 33250
},
{
"epoch": 5.579758713136729,
"grad_norm": 0.1050226092338562,
"learning_rate": 8.847895354687239e-06,
"loss": 0.0013,
"step": 33300
},
{
"epoch": 5.58813672922252,
"grad_norm": 0.1119270995259285,
"learning_rate": 8.831125272513836e-06,
"loss": 0.0013,
"step": 33350
},
{
"epoch": 5.596514745308311,
"grad_norm": 0.41666361689567566,
"learning_rate": 8.814355190340433e-06,
"loss": 0.0013,
"step": 33400
},
{
"epoch": 5.604892761394102,
"grad_norm": 0.21499872207641602,
"learning_rate": 8.79758510816703e-06,
"loss": 0.0013,
"step": 33450
},
{
"epoch": 5.613270777479893,
"grad_norm": 0.1437048465013504,
"learning_rate": 8.780815025993627e-06,
"loss": 0.0013,
"step": 33500
},
{
"epoch": 5.621648793565684,
"grad_norm": 0.10052605718374252,
"learning_rate": 8.764044943820226e-06,
"loss": 0.0016,
"step": 33550
},
{
"epoch": 5.630026809651475,
"grad_norm": 0.16671398282051086,
"learning_rate": 8.747274861646823e-06,
"loss": 0.0013,
"step": 33600
},
{
"epoch": 5.638404825737266,
"grad_norm": 0.1302991360425949,
"learning_rate": 8.73050477947342e-06,
"loss": 0.0013,
"step": 33650
},
{
"epoch": 5.646782841823057,
"grad_norm": 0.07595470547676086,
"learning_rate": 8.713734697300017e-06,
"loss": 0.0013,
"step": 33700
},
{
"epoch": 5.6551608579088475,
"grad_norm": 0.08618602156639099,
"learning_rate": 8.696964615126615e-06,
"loss": 0.0011,
"step": 33750
},
{
"epoch": 5.663538873994638,
"grad_norm": 0.18707716464996338,
"learning_rate": 8.680194532953212e-06,
"loss": 0.0014,
"step": 33800
},
{
"epoch": 5.671916890080429,
"grad_norm": 0.167672261595726,
"learning_rate": 8.66342445077981e-06,
"loss": 0.0015,
"step": 33850
},
{
"epoch": 5.680294906166219,
"grad_norm": 0.15763333439826965,
"learning_rate": 8.646654368606406e-06,
"loss": 0.0015,
"step": 33900
},
{
"epoch": 5.68867292225201,
"grad_norm": 0.1692523956298828,
"learning_rate": 8.629884286433005e-06,
"loss": 0.0014,
"step": 33950
},
{
"epoch": 5.697050938337801,
"grad_norm": 0.06355728209018707,
"learning_rate": 8.613114204259602e-06,
"loss": 0.0016,
"step": 34000
},
{
"epoch": 5.705428954423592,
"grad_norm": 0.06145229935646057,
"learning_rate": 8.596344122086199e-06,
"loss": 0.0014,
"step": 34050
},
{
"epoch": 5.713806970509383,
"grad_norm": 0.1229192316532135,
"learning_rate": 8.579574039912796e-06,
"loss": 0.0016,
"step": 34100
},
{
"epoch": 5.722184986595174,
"grad_norm": 0.07723601907491684,
"learning_rate": 8.562803957739395e-06,
"loss": 0.0012,
"step": 34150
},
{
"epoch": 5.730563002680965,
"grad_norm": 0.16651087999343872,
"learning_rate": 8.546033875565992e-06,
"loss": 0.0012,
"step": 34200
},
{
"epoch": 5.738941018766756,
"grad_norm": 0.19212378561496735,
"learning_rate": 8.529263793392589e-06,
"loss": 0.0015,
"step": 34250
},
{
"epoch": 5.747319034852547,
"grad_norm": 0.14563122391700745,
"learning_rate": 8.512493711219186e-06,
"loss": 0.0013,
"step": 34300
},
{
"epoch": 5.755697050938338,
"grad_norm": 0.14981712400913239,
"learning_rate": 8.495723629045783e-06,
"loss": 0.0013,
"step": 34350
},
{
"epoch": 5.7640750670241285,
"grad_norm": 0.3324640989303589,
"learning_rate": 8.47895354687238e-06,
"loss": 0.0014,
"step": 34400
},
{
"epoch": 5.7724530831099194,
"grad_norm": 0.2827085256576538,
"learning_rate": 8.462183464698977e-06,
"loss": 0.0014,
"step": 34450
},
{
"epoch": 5.78083109919571,
"grad_norm": 0.3784811198711395,
"learning_rate": 8.445413382525574e-06,
"loss": 0.0015,
"step": 34500
},
{
"epoch": 5.789209115281501,
"grad_norm": 0.08754808455705643,
"learning_rate": 8.428643300352173e-06,
"loss": 0.0013,
"step": 34550
},
{
"epoch": 5.797587131367292,
"grad_norm": 0.2719215750694275,
"learning_rate": 8.41187321817877e-06,
"loss": 0.0014,
"step": 34600
},
{
"epoch": 5.805965147453083,
"grad_norm": 0.17088165879249573,
"learning_rate": 8.395103136005367e-06,
"loss": 0.0016,
"step": 34650
},
{
"epoch": 5.814343163538874,
"grad_norm": 0.3564954400062561,
"learning_rate": 8.378333053831964e-06,
"loss": 0.0013,
"step": 34700
},
{
"epoch": 5.822721179624665,
"grad_norm": 0.1912204474210739,
"learning_rate": 8.361562971658562e-06,
"loss": 0.0017,
"step": 34750
},
{
"epoch": 5.831099195710456,
"grad_norm": 0.2299826443195343,
"learning_rate": 8.34479288948516e-06,
"loss": 0.0011,
"step": 34800
},
{
"epoch": 5.839477211796247,
"grad_norm": 0.07874714583158493,
"learning_rate": 8.328022807311756e-06,
"loss": 0.0016,
"step": 34850
},
{
"epoch": 5.847855227882038,
"grad_norm": 0.1063261479139328,
"learning_rate": 8.311252725138353e-06,
"loss": 0.0014,
"step": 34900
},
{
"epoch": 5.856233243967829,
"grad_norm": 0.07933440804481506,
"learning_rate": 8.294482642964952e-06,
"loss": 0.0013,
"step": 34950
},
{
"epoch": 5.8646112600536195,
"grad_norm": 0.14268645644187927,
"learning_rate": 8.277712560791549e-06,
"loss": 0.0014,
"step": 35000
},
{
"epoch": 5.8729892761394105,
"grad_norm": 0.24926510453224182,
"learning_rate": 8.260942478618146e-06,
"loss": 0.0014,
"step": 35050
},
{
"epoch": 5.881367292225201,
"grad_norm": 0.12582330405712128,
"learning_rate": 8.244172396444743e-06,
"loss": 0.0013,
"step": 35100
},
{
"epoch": 5.889745308310992,
"grad_norm": 0.04589623957872391,
"learning_rate": 8.227402314271342e-06,
"loss": 0.0012,
"step": 35150
},
{
"epoch": 5.898123324396783,
"grad_norm": 0.3229510188102722,
"learning_rate": 8.210632232097939e-06,
"loss": 0.0011,
"step": 35200
},
{
"epoch": 5.906501340482574,
"grad_norm": 0.12953703105449677,
"learning_rate": 8.193862149924536e-06,
"loss": 0.0013,
"step": 35250
},
{
"epoch": 5.914879356568365,
"grad_norm": 0.31099674105644226,
"learning_rate": 8.177092067751133e-06,
"loss": 0.0013,
"step": 35300
},
{
"epoch": 5.923257372654156,
"grad_norm": 0.17244383692741394,
"learning_rate": 8.16032198557773e-06,
"loss": 0.0012,
"step": 35350
},
{
"epoch": 5.931635388739946,
"grad_norm": 0.6794390678405762,
"learning_rate": 8.143551903404327e-06,
"loss": 0.0011,
"step": 35400
},
{
"epoch": 5.940013404825737,
"grad_norm": 0.15421807765960693,
"learning_rate": 8.126781821230924e-06,
"loss": 0.0015,
"step": 35450
},
{
"epoch": 5.948391420911528,
"grad_norm": 0.09070286899805069,
"learning_rate": 8.110011739057521e-06,
"loss": 0.0015,
"step": 35500
},
{
"epoch": 5.956769436997319,
"grad_norm": 0.13311493396759033,
"learning_rate": 8.09324165688412e-06,
"loss": 0.0014,
"step": 35550
},
{
"epoch": 5.96514745308311,
"grad_norm": 0.3226371705532074,
"learning_rate": 8.076471574710717e-06,
"loss": 0.0015,
"step": 35600
},
{
"epoch": 5.973525469168901,
"grad_norm": 0.25139835476875305,
"learning_rate": 8.059701492537314e-06,
"loss": 0.0016,
"step": 35650
},
{
"epoch": 5.9819034852546915,
"grad_norm": 0.14944802224636078,
"learning_rate": 8.04293141036391e-06,
"loss": 0.0013,
"step": 35700
},
{
"epoch": 5.990281501340482,
"grad_norm": 0.210645392537117,
"learning_rate": 8.02616132819051e-06,
"loss": 0.0014,
"step": 35750
},
{
"epoch": 5.998659517426273,
"grad_norm": 0.3287517726421356,
"learning_rate": 8.009391246017106e-06,
"loss": 0.0013,
"step": 35800
},
{
"epoch": 6.007037533512064,
"grad_norm": 0.08051057904958725,
"learning_rate": 7.992621163843703e-06,
"loss": 0.0007,
"step": 35850
},
{
"epoch": 6.015415549597855,
"grad_norm": 0.03923693299293518,
"learning_rate": 7.9758510816703e-06,
"loss": 0.0006,
"step": 35900
},
{
"epoch": 6.023793565683646,
"grad_norm": 0.03783218562602997,
"learning_rate": 7.959080999496899e-06,
"loss": 0.0009,
"step": 35950
},
{
"epoch": 6.032171581769437,
"grad_norm": 0.251902312040329,
"learning_rate": 7.942310917323496e-06,
"loss": 0.0007,
"step": 36000
},
{
"epoch": 6.040549597855228,
"grad_norm": 0.04270997270941734,
"learning_rate": 7.925540835150093e-06,
"loss": 0.0006,
"step": 36050
},
{
"epoch": 6.048927613941019,
"grad_norm": 0.07284736633300781,
"learning_rate": 7.90877075297669e-06,
"loss": 0.0007,
"step": 36100
},
{
"epoch": 6.05730563002681,
"grad_norm": 0.06689571589231491,
"learning_rate": 7.892000670803289e-06,
"loss": 0.0007,
"step": 36150
},
{
"epoch": 6.065683646112601,
"grad_norm": 0.024217478930950165,
"learning_rate": 7.875230588629886e-06,
"loss": 0.0006,
"step": 36200
},
{
"epoch": 6.074061662198392,
"grad_norm": 0.09656205028295517,
"learning_rate": 7.858460506456483e-06,
"loss": 0.0006,
"step": 36250
},
{
"epoch": 6.0824396782841825,
"grad_norm": 0.030422423034906387,
"learning_rate": 7.84169042428308e-06,
"loss": 0.0007,
"step": 36300
},
{
"epoch": 6.0908176943699734,
"grad_norm": 0.08347397297620773,
"learning_rate": 7.824920342109677e-06,
"loss": 0.0006,
"step": 36350
},
{
"epoch": 6.099195710455764,
"grad_norm": 0.027809837833046913,
"learning_rate": 7.808150259936274e-06,
"loss": 0.0006,
"step": 36400
},
{
"epoch": 6.107573726541555,
"grad_norm": 0.2227599024772644,
"learning_rate": 7.791380177762871e-06,
"loss": 0.0007,
"step": 36450
},
{
"epoch": 6.115951742627346,
"grad_norm": 0.1868954300880432,
"learning_rate": 7.774610095589468e-06,
"loss": 0.0007,
"step": 36500
},
{
"epoch": 6.124329758713137,
"grad_norm": 0.1395631730556488,
"learning_rate": 7.757840013416067e-06,
"loss": 0.0005,
"step": 36550
},
{
"epoch": 6.132707774798928,
"grad_norm": 0.029958348721265793,
"learning_rate": 7.741069931242664e-06,
"loss": 0.0005,
"step": 36600
},
{
"epoch": 6.141085790884718,
"grad_norm": 0.04532192647457123,
"learning_rate": 7.72429984906926e-06,
"loss": 0.0007,
"step": 36650
},
{
"epoch": 6.149463806970509,
"grad_norm": 0.0366247221827507,
"learning_rate": 7.707529766895858e-06,
"loss": 0.0005,
"step": 36700
},
{
"epoch": 6.1578418230563,
"grad_norm": 0.07616298645734787,
"learning_rate": 7.690759684722455e-06,
"loss": 0.0007,
"step": 36750
},
{
"epoch": 6.166219839142091,
"grad_norm": 0.05013656988739967,
"learning_rate": 7.673989602549053e-06,
"loss": 0.0007,
"step": 36800
},
{
"epoch": 6.174597855227882,
"grad_norm": 0.08419755846261978,
"learning_rate": 7.65721952037565e-06,
"loss": 0.0005,
"step": 36850
},
{
"epoch": 6.182975871313673,
"grad_norm": 0.11614430695772171,
"learning_rate": 7.640449438202247e-06,
"loss": 0.0004,
"step": 36900
},
{
"epoch": 6.191353887399464,
"grad_norm": 0.1487479954957962,
"learning_rate": 7.6236793560288445e-06,
"loss": 0.0007,
"step": 36950
},
{
"epoch": 6.1997319034852545,
"grad_norm": 0.1483132392168045,
"learning_rate": 7.606909273855443e-06,
"loss": 0.0006,
"step": 37000
},
{
"epoch": 6.208109919571045,
"grad_norm": 0.034125398844480515,
"learning_rate": 7.59013919168204e-06,
"loss": 0.0005,
"step": 37050
},
{
"epoch": 6.216487935656836,
"grad_norm": 0.0705786794424057,
"learning_rate": 7.573369109508637e-06,
"loss": 0.0007,
"step": 37100
},
{
"epoch": 6.224865951742627,
"grad_norm": 0.03942383453249931,
"learning_rate": 7.556599027335234e-06,
"loss": 0.0007,
"step": 37150
},
{
"epoch": 6.233243967828418,
"grad_norm": 0.045597631484270096,
"learning_rate": 7.539828945161832e-06,
"loss": 0.0006,
"step": 37200
},
{
"epoch": 6.241621983914209,
"grad_norm": 0.06973922997713089,
"learning_rate": 7.523058862988429e-06,
"loss": 0.0005,
"step": 37250
},
{
"epoch": 6.25,
"grad_norm": 0.042143791913986206,
"learning_rate": 7.506288780815026e-06,
"loss": 0.0006,
"step": 37300
},
{
"epoch": 6.258378016085791,
"grad_norm": 0.02553519792854786,
"learning_rate": 7.489518698641623e-06,
"loss": 0.0005,
"step": 37350
},
{
"epoch": 6.266756032171582,
"grad_norm": 0.07629157602787018,
"learning_rate": 7.472748616468222e-06,
"loss": 0.0005,
"step": 37400
},
{
"epoch": 6.275134048257373,
"grad_norm": 0.13083019852638245,
"learning_rate": 7.455978534294819e-06,
"loss": 0.0006,
"step": 37450
},
{
"epoch": 6.283512064343164,
"grad_norm": 0.024578507989645004,
"learning_rate": 7.439208452121416e-06,
"loss": 0.0005,
"step": 37500
},
{
"epoch": 6.291890080428955,
"grad_norm": 0.2308337688446045,
"learning_rate": 7.422438369948013e-06,
"loss": 0.0005,
"step": 37550
},
{
"epoch": 6.3002680965147455,
"grad_norm": 0.09496274590492249,
"learning_rate": 7.405668287774611e-06,
"loss": 0.0005,
"step": 37600
},
{
"epoch": 6.308646112600536,
"grad_norm": 0.06111710146069527,
"learning_rate": 7.388898205601209e-06,
"loss": 0.0007,
"step": 37650
},
{
"epoch": 6.317024128686327,
"grad_norm": 0.30509406328201294,
"learning_rate": 7.372128123427806e-06,
"loss": 0.0006,
"step": 37700
},
{
"epoch": 6.325402144772118,
"grad_norm": 0.041681017726659775,
"learning_rate": 7.355358041254403e-06,
"loss": 0.0006,
"step": 37750
},
{
"epoch": 6.333780160857909,
"grad_norm": 0.05730760842561722,
"learning_rate": 7.3385879590810005e-06,
"loss": 0.0005,
"step": 37800
},
{
"epoch": 6.3421581769437,
"grad_norm": 0.03064553625881672,
"learning_rate": 7.3218178769075975e-06,
"loss": 0.0006,
"step": 37850
},
{
"epoch": 6.350536193029491,
"grad_norm": 0.0326654389500618,
"learning_rate": 7.3050477947341945e-06,
"loss": 0.0006,
"step": 37900
},
{
"epoch": 6.358914209115282,
"grad_norm": 0.058062855154275894,
"learning_rate": 7.2882777125607915e-06,
"loss": 0.0007,
"step": 37950
},
{
"epoch": 6.367292225201073,
"grad_norm": 0.1029849499464035,
"learning_rate": 7.27150763038739e-06,
"loss": 0.0005,
"step": 38000
},
{
"epoch": 6.375670241286863,
"grad_norm": 0.05858965218067169,
"learning_rate": 7.254737548213987e-06,
"loss": 0.0006,
"step": 38050
},
{
"epoch": 6.384048257372654,
"grad_norm": 0.0990440845489502,
"learning_rate": 7.237967466040584e-06,
"loss": 0.0005,
"step": 38100
},
{
"epoch": 6.392426273458445,
"grad_norm": 0.18513937294483185,
"learning_rate": 7.221197383867181e-06,
"loss": 0.0005,
"step": 38150
},
{
"epoch": 6.400804289544236,
"grad_norm": 0.045414622873067856,
"learning_rate": 7.204427301693779e-06,
"loss": 0.0006,
"step": 38200
},
{
"epoch": 6.4091823056300266,
"grad_norm": 0.14755046367645264,
"learning_rate": 7.187657219520376e-06,
"loss": 0.0006,
"step": 38250
},
{
"epoch": 6.4175603217158175,
"grad_norm": 0.03956648334860802,
"learning_rate": 7.170887137346973e-06,
"loss": 0.0005,
"step": 38300
},
{
"epoch": 6.425938337801608,
"grad_norm": 0.03931158035993576,
"learning_rate": 7.154117055173571e-06,
"loss": 0.0006,
"step": 38350
},
{
"epoch": 6.434316353887399,
"grad_norm": 0.030736852437257767,
"learning_rate": 7.137346973000169e-06,
"loss": 0.0006,
"step": 38400
},
{
"epoch": 6.44269436997319,
"grad_norm": 0.04715625196695328,
"learning_rate": 7.120576890826766e-06,
"loss": 0.0006,
"step": 38450
},
{
"epoch": 6.451072386058981,
"grad_norm": 0.03468763083219528,
"learning_rate": 7.103806808653363e-06,
"loss": 0.0006,
"step": 38500
},
{
"epoch": 6.459450402144772,
"grad_norm": 0.04370501637458801,
"learning_rate": 7.08703672647996e-06,
"loss": 0.0008,
"step": 38550
},
{
"epoch": 6.467828418230563,
"grad_norm": 0.08410083502531052,
"learning_rate": 7.070266644306559e-06,
"loss": 0.0007,
"step": 38600
},
{
"epoch": 6.476206434316354,
"grad_norm": 0.07396062463521957,
"learning_rate": 7.053496562133156e-06,
"loss": 0.001,
"step": 38650
},
{
"epoch": 6.484584450402145,
"grad_norm": 0.09718171507120132,
"learning_rate": 7.036726479959753e-06,
"loss": 0.0006,
"step": 38700
},
{
"epoch": 6.492962466487936,
"grad_norm": 0.20317842066287994,
"learning_rate": 7.01995639778635e-06,
"loss": 0.0007,
"step": 38750
},
{
"epoch": 6.501340482573727,
"grad_norm": 0.1610729843378067,
"learning_rate": 7.0031863156129475e-06,
"loss": 0.0007,
"step": 38800
},
{
"epoch": 6.509718498659518,
"grad_norm": 0.2847572863101959,
"learning_rate": 6.9864162334395445e-06,
"loss": 0.0006,
"step": 38850
},
{
"epoch": 6.5180965147453085,
"grad_norm": 0.05428579822182655,
"learning_rate": 6.9696461512661415e-06,
"loss": 0.0007,
"step": 38900
},
{
"epoch": 6.526474530831099,
"grad_norm": 0.03034658171236515,
"learning_rate": 6.9528760690927385e-06,
"loss": 0.0006,
"step": 38950
},
{
"epoch": 6.53485254691689,
"grad_norm": 0.08986043930053711,
"learning_rate": 6.936105986919337e-06,
"loss": 0.0005,
"step": 39000
},
{
"epoch": 6.543230563002681,
"grad_norm": 0.05553920567035675,
"learning_rate": 6.919335904745934e-06,
"loss": 0.0009,
"step": 39050
},
{
"epoch": 6.551608579088472,
"grad_norm": 0.018183773383498192,
"learning_rate": 6.902565822572531e-06,
"loss": 0.0008,
"step": 39100
},
{
"epoch": 6.559986595174263,
"grad_norm": 0.06645216047763824,
"learning_rate": 6.885795740399128e-06,
"loss": 0.0008,
"step": 39150
},
{
"epoch": 6.568364611260054,
"grad_norm": 0.04686279594898224,
"learning_rate": 6.869025658225726e-06,
"loss": 0.0006,
"step": 39200
},
{
"epoch": 6.576742627345844,
"grad_norm": 0.14523954689502716,
"learning_rate": 6.852255576052323e-06,
"loss": 0.0007,
"step": 39250
},
{
"epoch": 6.585120643431635,
"grad_norm": 0.07832646369934082,
"learning_rate": 6.83548549387892e-06,
"loss": 0.0007,
"step": 39300
},
{
"epoch": 6.593498659517426,
"grad_norm": 0.07805398851633072,
"learning_rate": 6.818715411705518e-06,
"loss": 0.0006,
"step": 39350
},
{
"epoch": 6.601876675603217,
"grad_norm": 0.07783017307519913,
"learning_rate": 6.801945329532115e-06,
"loss": 0.0006,
"step": 39400
},
{
"epoch": 6.610254691689008,
"grad_norm": 0.044575657695531845,
"learning_rate": 6.785175247358713e-06,
"loss": 0.0006,
"step": 39450
},
{
"epoch": 6.618632707774799,
"grad_norm": 0.31225234270095825,
"learning_rate": 6.76840516518531e-06,
"loss": 0.0006,
"step": 39500
},
{
"epoch": 6.6270107238605895,
"grad_norm": 0.08033174276351929,
"learning_rate": 6.751635083011907e-06,
"loss": 0.0008,
"step": 39550
},
{
"epoch": 6.6353887399463805,
"grad_norm": 0.06084591895341873,
"learning_rate": 6.734865000838504e-06,
"loss": 0.0007,
"step": 39600
},
{
"epoch": 6.643766756032171,
"grad_norm": 0.05018865689635277,
"learning_rate": 6.718094918665103e-06,
"loss": 0.0008,
"step": 39650
},
{
"epoch": 6.652144772117962,
"grad_norm": 0.05032634735107422,
"learning_rate": 6.7013248364917e-06,
"loss": 0.0006,
"step": 39700
},
{
"epoch": 6.660522788203753,
"grad_norm": 0.08206313848495483,
"learning_rate": 6.684554754318297e-06,
"loss": 0.0009,
"step": 39750
},
{
"epoch": 6.668900804289544,
"grad_norm": 0.1278487741947174,
"learning_rate": 6.667784672144894e-06,
"loss": 0.0006,
"step": 39800
},
{
"epoch": 6.677278820375335,
"grad_norm": 0.05226191505789757,
"learning_rate": 6.6510145899714915e-06,
"loss": 0.0008,
"step": 39850
},
{
"epoch": 6.685656836461126,
"grad_norm": 0.04356776922941208,
"learning_rate": 6.6342445077980886e-06,
"loss": 0.0006,
"step": 39900
},
{
"epoch": 6.694034852546917,
"grad_norm": 0.06333254277706146,
"learning_rate": 6.6174744256246856e-06,
"loss": 0.0007,
"step": 39950
},
{
"epoch": 6.702412868632708,
"grad_norm": 0.08492754399776459,
"learning_rate": 6.600704343451283e-06,
"loss": 0.0007,
"step": 40000
},
{
"epoch": 6.702412868632708,
"eval_loss": 0.061279989778995514,
"eval_runtime": 0.3146,
"eval_samples_per_second": 63.58,
"eval_steps_per_second": 3.179,
"step": 40000
},
{
"epoch": 6.710790884718499,
"grad_norm": 0.3515622317790985,
"learning_rate": 6.583934261277881e-06,
"loss": 0.0009,
"step": 40050
},
{
"epoch": 6.71916890080429,
"grad_norm": 0.10627135634422302,
"learning_rate": 6.567164179104478e-06,
"loss": 0.0007,
"step": 40100
},
{
"epoch": 6.7275469168900806,
"grad_norm": 0.17090724408626556,
"learning_rate": 6.550394096931075e-06,
"loss": 0.0008,
"step": 40150
},
{
"epoch": 6.7359249329758715,
"grad_norm": 0.03962019085884094,
"learning_rate": 6.533624014757672e-06,
"loss": 0.0005,
"step": 40200
},
{
"epoch": 6.744302949061662,
"grad_norm": 0.5243228077888489,
"learning_rate": 6.51685393258427e-06,
"loss": 0.0007,
"step": 40250
},
{
"epoch": 6.752680965147453,
"grad_norm": 0.04279276728630066,
"learning_rate": 6.500083850410868e-06,
"loss": 0.0007,
"step": 40300
},
{
"epoch": 6.761058981233244,
"grad_norm": 0.05610975995659828,
"learning_rate": 6.483313768237465e-06,
"loss": 0.0006,
"step": 40350
},
{
"epoch": 6.769436997319035,
"grad_norm": 0.05965403839945793,
"learning_rate": 6.466543686064062e-06,
"loss": 0.0007,
"step": 40400
},
{
"epoch": 6.777815013404826,
"grad_norm": 0.1390516757965088,
"learning_rate": 6.44977360389066e-06,
"loss": 0.0008,
"step": 40450
},
{
"epoch": 6.786193029490617,
"grad_norm": 0.06495050340890884,
"learning_rate": 6.433003521717257e-06,
"loss": 0.0007,
"step": 40500
},
{
"epoch": 6.794571045576408,
"grad_norm": 0.031072689220309258,
"learning_rate": 6.416233439543854e-06,
"loss": 0.0007,
"step": 40550
},
{
"epoch": 6.802949061662199,
"grad_norm": 0.07859810441732407,
"learning_rate": 6.399463357370451e-06,
"loss": 0.0006,
"step": 40600
},
{
"epoch": 6.81132707774799,
"grad_norm": 0.032027170062065125,
"learning_rate": 6.38269327519705e-06,
"loss": 0.0006,
"step": 40650
},
{
"epoch": 6.819705093833781,
"grad_norm": 0.04607260972261429,
"learning_rate": 6.365923193023647e-06,
"loss": 0.0006,
"step": 40700
},
{
"epoch": 6.828083109919571,
"grad_norm": 0.3754972815513611,
"learning_rate": 6.349153110850244e-06,
"loss": 0.0008,
"step": 40750
},
{
"epoch": 6.836461126005362,
"grad_norm": 0.2285059541463852,
"learning_rate": 6.332383028676841e-06,
"loss": 0.0008,
"step": 40800
},
{
"epoch": 6.8448391420911525,
"grad_norm": 0.0579165481030941,
"learning_rate": 6.3156129465034386e-06,
"loss": 0.0007,
"step": 40850
},
{
"epoch": 6.853217158176943,
"grad_norm": 0.045123569667339325,
"learning_rate": 6.298842864330036e-06,
"loss": 0.0009,
"step": 40900
},
{
"epoch": 6.861595174262734,
"grad_norm": 0.05418705940246582,
"learning_rate": 6.282072782156633e-06,
"loss": 0.0007,
"step": 40950
},
{
"epoch": 6.869973190348525,
"grad_norm": 0.09083729982376099,
"learning_rate": 6.2653026999832305e-06,
"loss": 0.0007,
"step": 41000
},
{
"epoch": 6.878351206434316,
"grad_norm": 0.04620116204023361,
"learning_rate": 6.248532617809828e-06,
"loss": 0.0008,
"step": 41050
},
{
"epoch": 6.886729222520107,
"grad_norm": 0.18256771564483643,
"learning_rate": 6.231762535636425e-06,
"loss": 0.0006,
"step": 41100
},
{
"epoch": 6.895107238605898,
"grad_norm": 0.09917750209569931,
"learning_rate": 6.214992453463022e-06,
"loss": 0.0006,
"step": 41150
},
{
"epoch": 6.903485254691689,
"grad_norm": 0.07544329017400742,
"learning_rate": 6.198222371289619e-06,
"loss": 0.0008,
"step": 41200
},
{
"epoch": 6.91186327077748,
"grad_norm": 0.23120667040348053,
"learning_rate": 6.181452289116217e-06,
"loss": 0.0006,
"step": 41250
},
{
"epoch": 6.920241286863271,
"grad_norm": 0.030208513140678406,
"learning_rate": 6.164682206942815e-06,
"loss": 0.0009,
"step": 41300
},
{
"epoch": 6.928619302949062,
"grad_norm": 0.06649070233106613,
"learning_rate": 6.147912124769412e-06,
"loss": 0.0007,
"step": 41350
},
{
"epoch": 6.936997319034853,
"grad_norm": 0.0900665819644928,
"learning_rate": 6.131142042596009e-06,
"loss": 0.0006,
"step": 41400
},
{
"epoch": 6.9453753351206435,
"grad_norm": 0.046752411872148514,
"learning_rate": 6.114371960422607e-06,
"loss": 0.0006,
"step": 41450
},
{
"epoch": 6.9537533512064345,
"grad_norm": 0.1204705610871315,
"learning_rate": 6.097601878249204e-06,
"loss": 0.0007,
"step": 41500
},
{
"epoch": 6.962131367292225,
"grad_norm": 0.0734005719423294,
"learning_rate": 6.080831796075801e-06,
"loss": 0.0007,
"step": 41550
},
{
"epoch": 6.970509383378016,
"grad_norm": 0.0475836880505085,
"learning_rate": 6.064061713902398e-06,
"loss": 0.0008,
"step": 41600
},
{
"epoch": 6.978887399463807,
"grad_norm": 0.08627843111753464,
"learning_rate": 6.047291631728997e-06,
"loss": 0.0007,
"step": 41650
},
{
"epoch": 6.987265415549598,
"grad_norm": 0.04399965703487396,
"learning_rate": 6.030521549555594e-06,
"loss": 0.0007,
"step": 41700
},
{
"epoch": 6.995643431635389,
"grad_norm": 0.09367845952510834,
"learning_rate": 6.013751467382191e-06,
"loss": 0.0007,
"step": 41750
},
{
"epoch": 7.00402144772118,
"grad_norm": 0.042545393109321594,
"learning_rate": 5.996981385208788e-06,
"loss": 0.0006,
"step": 41800
},
{
"epoch": 7.012399463806971,
"grad_norm": 0.06873136013746262,
"learning_rate": 5.980211303035386e-06,
"loss": 0.0003,
"step": 41850
},
{
"epoch": 7.020777479892762,
"grad_norm": 0.016868956387043,
"learning_rate": 5.963441220861983e-06,
"loss": 0.0003,
"step": 41900
},
{
"epoch": 7.029155495978553,
"grad_norm": 0.040632057934999466,
"learning_rate": 5.94667113868858e-06,
"loss": 0.0004,
"step": 41950
},
{
"epoch": 7.037533512064343,
"grad_norm": 0.02409088797867298,
"learning_rate": 5.9299010565151775e-06,
"loss": 0.0003,
"step": 42000
},
{
"epoch": 7.045911528150134,
"grad_norm": 0.04467145353555679,
"learning_rate": 5.913130974341775e-06,
"loss": 0.0003,
"step": 42050
},
{
"epoch": 7.054289544235925,
"grad_norm": 0.03071122244000435,
"learning_rate": 5.896360892168372e-06,
"loss": 0.0003,
"step": 42100
},
{
"epoch": 7.0626675603217155,
"grad_norm": 0.028979197144508362,
"learning_rate": 5.879590809994969e-06,
"loss": 0.0003,
"step": 42150
},
{
"epoch": 7.071045576407506,
"grad_norm": 0.01919564977288246,
"learning_rate": 5.862820727821566e-06,
"loss": 0.0004,
"step": 42200
},
{
"epoch": 7.079423592493297,
"grad_norm": 0.027090469375252724,
"learning_rate": 5.846050645648163e-06,
"loss": 0.0003,
"step": 42250
},
{
"epoch": 7.087801608579088,
"grad_norm": 0.02643194980919361,
"learning_rate": 5.829280563474762e-06,
"loss": 0.0003,
"step": 42300
},
{
"epoch": 7.096179624664879,
"grad_norm": 0.021015044301748276,
"learning_rate": 5.812510481301359e-06,
"loss": 0.0003,
"step": 42350
},
{
"epoch": 7.10455764075067,
"grad_norm": 0.030943244695663452,
"learning_rate": 5.795740399127956e-06,
"loss": 0.0003,
"step": 42400
},
{
"epoch": 7.112935656836461,
"grad_norm": 0.027034178376197815,
"learning_rate": 5.778970316954553e-06,
"loss": 0.0004,
"step": 42450
},
{
"epoch": 7.121313672922252,
"grad_norm": 0.022702839225530624,
"learning_rate": 5.762200234781151e-06,
"loss": 0.0003,
"step": 42500
},
{
"epoch": 7.129691689008043,
"grad_norm": 0.030643748119473457,
"learning_rate": 5.745430152607748e-06,
"loss": 0.0004,
"step": 42550
},
{
"epoch": 7.138069705093834,
"grad_norm": 0.07582689076662064,
"learning_rate": 5.728660070434345e-06,
"loss": 0.0003,
"step": 42600
},
{
"epoch": 7.146447721179625,
"grad_norm": 0.024221094325184822,
"learning_rate": 5.711889988260942e-06,
"loss": 0.0003,
"step": 42650
},
{
"epoch": 7.154825737265416,
"grad_norm": 0.04585973173379898,
"learning_rate": 5.695119906087541e-06,
"loss": 0.0003,
"step": 42700
},
{
"epoch": 7.1632037533512065,
"grad_norm": 0.1931953877210617,
"learning_rate": 5.678349823914138e-06,
"loss": 0.0004,
"step": 42750
},
{
"epoch": 7.171581769436997,
"grad_norm": 0.02779720537364483,
"learning_rate": 5.661579741740735e-06,
"loss": 0.0002,
"step": 42800
},
{
"epoch": 7.179959785522788,
"grad_norm": 0.0313730388879776,
"learning_rate": 5.644809659567332e-06,
"loss": 0.0004,
"step": 42850
},
{
"epoch": 7.188337801608579,
"grad_norm": 0.0504007451236248,
"learning_rate": 5.62803957739393e-06,
"loss": 0.0006,
"step": 42900
},
{
"epoch": 7.19671581769437,
"grad_norm": 0.019938312470912933,
"learning_rate": 5.6112694952205275e-06,
"loss": 0.0003,
"step": 42950
},
{
"epoch": 7.205093833780161,
"grad_norm": 0.020967137068510056,
"learning_rate": 5.5944994130471245e-06,
"loss": 0.0003,
"step": 43000
},
{
"epoch": 7.213471849865952,
"grad_norm": 0.02640225552022457,
"learning_rate": 5.5777293308737215e-06,
"loss": 0.0003,
"step": 43050
},
{
"epoch": 7.221849865951743,
"grad_norm": 0.02011190541088581,
"learning_rate": 5.560959248700319e-06,
"loss": 0.0003,
"step": 43100
},
{
"epoch": 7.230227882037534,
"grad_norm": 0.05346396192908287,
"learning_rate": 5.544189166526916e-06,
"loss": 0.0003,
"step": 43150
},
{
"epoch": 7.238605898123325,
"grad_norm": 0.038962822407484055,
"learning_rate": 5.5274190843535134e-06,
"loss": 0.0003,
"step": 43200
},
{
"epoch": 7.246983914209116,
"grad_norm": 0.031206265091896057,
"learning_rate": 5.5106490021801104e-06,
"loss": 0.0003,
"step": 43250
},
{
"epoch": 7.255361930294907,
"grad_norm": 0.0827702060341835,
"learning_rate": 5.493878920006709e-06,
"loss": 0.0004,
"step": 43300
},
{
"epoch": 7.263739946380697,
"grad_norm": 0.023083705455064774,
"learning_rate": 5.477108837833306e-06,
"loss": 0.0004,
"step": 43350
},
{
"epoch": 7.272117962466488,
"grad_norm": 0.024041956290602684,
"learning_rate": 5.460338755659903e-06,
"loss": 0.0003,
"step": 43400
},
{
"epoch": 7.2804959785522785,
"grad_norm": 0.07957682758569717,
"learning_rate": 5.4435686734865e-06,
"loss": 0.0003,
"step": 43450
},
{
"epoch": 7.288873994638069,
"grad_norm": 0.18736758828163147,
"learning_rate": 5.426798591313098e-06,
"loss": 0.0005,
"step": 43500
},
{
"epoch": 7.29725201072386,
"grad_norm": 0.04302476719021797,
"learning_rate": 5.410028509139695e-06,
"loss": 0.0003,
"step": 43550
},
{
"epoch": 7.305630026809651,
"grad_norm": 0.35610461235046387,
"learning_rate": 5.393258426966292e-06,
"loss": 0.0005,
"step": 43600
},
{
"epoch": 7.314008042895442,
"grad_norm": 0.023378223180770874,
"learning_rate": 5.376488344792889e-06,
"loss": 0.0003,
"step": 43650
},
{
"epoch": 7.322386058981233,
"grad_norm": 0.031664662063121796,
"learning_rate": 5.359718262619488e-06,
"loss": 0.0003,
"step": 43700
},
{
"epoch": 7.330764075067024,
"grad_norm": 0.01903984695672989,
"learning_rate": 5.342948180446085e-06,
"loss": 0.0003,
"step": 43750
},
{
"epoch": 7.339142091152815,
"grad_norm": 0.031017929315567017,
"learning_rate": 5.326178098272682e-06,
"loss": 0.0004,
"step": 43800
},
{
"epoch": 7.347520107238606,
"grad_norm": 0.028874006122350693,
"learning_rate": 5.309408016099279e-06,
"loss": 0.0003,
"step": 43850
},
{
"epoch": 7.355898123324397,
"grad_norm": 0.016122756525874138,
"learning_rate": 5.292637933925877e-06,
"loss": 0.0003,
"step": 43900
},
{
"epoch": 7.364276139410188,
"grad_norm": 0.037943582981824875,
"learning_rate": 5.2758678517524745e-06,
"loss": 0.0003,
"step": 43950
},
{
"epoch": 7.372654155495979,
"grad_norm": 0.02277122251689434,
"learning_rate": 5.2590977695790716e-06,
"loss": 0.0004,
"step": 44000
},
{
"epoch": 7.3810321715817695,
"grad_norm": 0.049923092126846313,
"learning_rate": 5.2423276874056686e-06,
"loss": 0.0003,
"step": 44050
},
{
"epoch": 7.38941018766756,
"grad_norm": 0.04292990267276764,
"learning_rate": 5.2255576052322664e-06,
"loss": 0.0005,
"step": 44100
},
{
"epoch": 7.397788203753351,
"grad_norm": 0.04316338151693344,
"learning_rate": 5.2087875230588634e-06,
"loss": 0.0003,
"step": 44150
},
{
"epoch": 7.406166219839142,
"grad_norm": 0.0165548212826252,
"learning_rate": 5.1920174408854605e-06,
"loss": 0.0003,
"step": 44200
},
{
"epoch": 7.414544235924933,
"grad_norm": 0.041038576513528824,
"learning_rate": 5.1752473587120575e-06,
"loss": 0.0004,
"step": 44250
},
{
"epoch": 7.422922252010724,
"grad_norm": 0.07028800249099731,
"learning_rate": 5.158477276538656e-06,
"loss": 0.0003,
"step": 44300
},
{
"epoch": 7.431300268096515,
"grad_norm": 0.03722773492336273,
"learning_rate": 5.141707194365253e-06,
"loss": 0.0003,
"step": 44350
},
{
"epoch": 7.439678284182306,
"grad_norm": 0.015645667910575867,
"learning_rate": 5.12493711219185e-06,
"loss": 0.0004,
"step": 44400
},
{
"epoch": 7.448056300268097,
"grad_norm": 0.18982096016407013,
"learning_rate": 5.108167030018447e-06,
"loss": 0.0005,
"step": 44450
},
{
"epoch": 7.456434316353888,
"grad_norm": 0.02524687349796295,
"learning_rate": 5.091396947845045e-06,
"loss": 0.0004,
"step": 44500
},
{
"epoch": 7.464812332439679,
"grad_norm": 0.02892642468214035,
"learning_rate": 5.074626865671642e-06,
"loss": 0.0006,
"step": 44550
},
{
"epoch": 7.473190348525469,
"grad_norm": 0.04107584059238434,
"learning_rate": 5.057856783498239e-06,
"loss": 0.0007,
"step": 44600
},
{
"epoch": 7.48156836461126,
"grad_norm": 0.05140475928783417,
"learning_rate": 5.041086701324837e-06,
"loss": 0.0003,
"step": 44650
},
{
"epoch": 7.4899463806970505,
"grad_norm": 0.054712191224098206,
"learning_rate": 5.024316619151435e-06,
"loss": 0.0004,
"step": 44700
},
{
"epoch": 7.4983243967828415,
"grad_norm": 0.022500043734908104,
"learning_rate": 5.007546536978032e-06,
"loss": 0.0004,
"step": 44750
},
{
"epoch": 7.506702412868632,
"grad_norm": 0.6084216833114624,
"learning_rate": 4.990776454804629e-06,
"loss": 0.0006,
"step": 44800
},
{
"epoch": 7.515080428954423,
"grad_norm": 0.07291937619447708,
"learning_rate": 4.974006372631227e-06,
"loss": 0.0004,
"step": 44850
},
{
"epoch": 7.523458445040214,
"grad_norm": 0.06688908487558365,
"learning_rate": 4.957236290457824e-06,
"loss": 0.0005,
"step": 44900
},
{
"epoch": 7.531836461126005,
"grad_norm": 0.025029698386788368,
"learning_rate": 4.940466208284422e-06,
"loss": 0.0005,
"step": 44950
},
{
"epoch": 7.540214477211796,
"grad_norm": 0.08454358577728271,
"learning_rate": 4.923696126111019e-06,
"loss": 0.0003,
"step": 45000
},
{
"epoch": 7.548592493297587,
"grad_norm": 0.0553942508995533,
"learning_rate": 4.906926043937616e-06,
"loss": 0.0003,
"step": 45050
},
{
"epoch": 7.556970509383378,
"grad_norm": 0.05681919679045677,
"learning_rate": 4.890155961764213e-06,
"loss": 0.0005,
"step": 45100
},
{
"epoch": 7.565348525469169,
"grad_norm": 0.019724005833268166,
"learning_rate": 4.8733858795908105e-06,
"loss": 0.0004,
"step": 45150
},
{
"epoch": 7.57372654155496,
"grad_norm": 0.027729319408535957,
"learning_rate": 4.8566157974174075e-06,
"loss": 0.0003,
"step": 45200
},
{
"epoch": 7.582104557640751,
"grad_norm": 0.05744357779622078,
"learning_rate": 4.8398457152440045e-06,
"loss": 0.0004,
"step": 45250
},
{
"epoch": 7.590482573726542,
"grad_norm": 0.061757415533065796,
"learning_rate": 4.823075633070602e-06,
"loss": 0.0003,
"step": 45300
},
{
"epoch": 7.5988605898123325,
"grad_norm": 0.04031449928879738,
"learning_rate": 4.806305550897199e-06,
"loss": 0.0005,
"step": 45350
},
{
"epoch": 7.607238605898123,
"grad_norm": 0.10854317247867584,
"learning_rate": 4.789535468723797e-06,
"loss": 0.0004,
"step": 45400
},
{
"epoch": 7.615616621983914,
"grad_norm": 0.01701934076845646,
"learning_rate": 4.772765386550394e-06,
"loss": 0.0003,
"step": 45450
},
{
"epoch": 7.623994638069705,
"grad_norm": 0.04535774141550064,
"learning_rate": 4.755995304376992e-06,
"loss": 0.0003,
"step": 45500
},
{
"epoch": 7.632372654155496,
"grad_norm": 0.03864599019289017,
"learning_rate": 4.739225222203589e-06,
"loss": 0.0005,
"step": 45550
},
{
"epoch": 7.640750670241287,
"grad_norm": 0.03809565305709839,
"learning_rate": 4.722455140030187e-06,
"loss": 0.0004,
"step": 45600
},
{
"epoch": 7.649128686327078,
"grad_norm": 0.043824635446071625,
"learning_rate": 4.705685057856784e-06,
"loss": 0.0004,
"step": 45650
},
{
"epoch": 7.657506702412869,
"grad_norm": 0.02916356548666954,
"learning_rate": 4.688914975683381e-06,
"loss": 0.0004,
"step": 45700
},
{
"epoch": 7.66588471849866,
"grad_norm": 0.1915924996137619,
"learning_rate": 4.672144893509978e-06,
"loss": 0.0004,
"step": 45750
},
{
"epoch": 7.674262734584451,
"grad_norm": 0.07420273870229721,
"learning_rate": 4.655374811336576e-06,
"loss": 0.0005,
"step": 45800
},
{
"epoch": 7.682640750670242,
"grad_norm": 0.026272999122738838,
"learning_rate": 4.638604729163173e-06,
"loss": 0.0004,
"step": 45850
},
{
"epoch": 7.691018766756033,
"grad_norm": 0.3181280493736267,
"learning_rate": 4.621834646989771e-06,
"loss": 0.0004,
"step": 45900
},
{
"epoch": 7.6993967828418235,
"grad_norm": 0.035441432148218155,
"learning_rate": 4.605064564816368e-06,
"loss": 0.0004,
"step": 45950
},
{
"epoch": 7.707774798927614,
"grad_norm": 0.01749766804277897,
"learning_rate": 4.588294482642966e-06,
"loss": 0.0004,
"step": 46000
},
{
"epoch": 7.716152815013404,
"grad_norm": 0.07927963137626648,
"learning_rate": 4.571524400469563e-06,
"loss": 0.0004,
"step": 46050
},
{
"epoch": 7.724530831099195,
"grad_norm": 0.03702886402606964,
"learning_rate": 4.5547543182961605e-06,
"loss": 0.0004,
"step": 46100
},
{
"epoch": 7.732908847184986,
"grad_norm": 0.04682036116719246,
"learning_rate": 4.5379842361227575e-06,
"loss": 0.0003,
"step": 46150
},
{
"epoch": 7.741286863270777,
"grad_norm": 0.013787736184895039,
"learning_rate": 4.5212141539493545e-06,
"loss": 0.0004,
"step": 46200
},
{
"epoch": 7.749664879356568,
"grad_norm": 0.029385261237621307,
"learning_rate": 4.5044440717759515e-06,
"loss": 0.0005,
"step": 46250
},
{
"epoch": 7.758042895442359,
"grad_norm": 0.018814504146575928,
"learning_rate": 4.487673989602549e-06,
"loss": 0.0004,
"step": 46300
},
{
"epoch": 7.76642091152815,
"grad_norm": 0.017043303698301315,
"learning_rate": 4.470903907429146e-06,
"loss": 0.0004,
"step": 46350
},
{
"epoch": 7.774798927613941,
"grad_norm": 0.12016864866018295,
"learning_rate": 4.454133825255744e-06,
"loss": 0.0004,
"step": 46400
},
{
"epoch": 7.783176943699732,
"grad_norm": 0.051153432577848434,
"learning_rate": 4.437363743082341e-06,
"loss": 0.0003,
"step": 46450
},
{
"epoch": 7.791554959785523,
"grad_norm": 0.030375001952052116,
"learning_rate": 4.420593660908939e-06,
"loss": 0.0004,
"step": 46500
},
{
"epoch": 7.799932975871314,
"grad_norm": 0.060530513525009155,
"learning_rate": 4.403823578735536e-06,
"loss": 0.0004,
"step": 46550
},
{
"epoch": 7.8083109919571045,
"grad_norm": 0.034854013472795486,
"learning_rate": 4.387053496562134e-06,
"loss": 0.0004,
"step": 46600
},
{
"epoch": 7.8166890080428955,
"grad_norm": 0.03328954800963402,
"learning_rate": 4.370283414388731e-06,
"loss": 0.0004,
"step": 46650
},
{
"epoch": 7.825067024128686,
"grad_norm": 0.04314300790429115,
"learning_rate": 4.353513332215328e-06,
"loss": 0.0004,
"step": 46700
},
{
"epoch": 7.833445040214477,
"grad_norm": 0.09990786761045456,
"learning_rate": 4.336743250041925e-06,
"loss": 0.0004,
"step": 46750
},
{
"epoch": 7.841823056300268,
"grad_norm": 0.035922013223171234,
"learning_rate": 4.319973167868523e-06,
"loss": 0.0003,
"step": 46800
},
{
"epoch": 7.850201072386059,
"grad_norm": 0.05447731912136078,
"learning_rate": 4.30320308569512e-06,
"loss": 0.0004,
"step": 46850
},
{
"epoch": 7.85857908847185,
"grad_norm": 0.01950427144765854,
"learning_rate": 4.286433003521718e-06,
"loss": 0.0004,
"step": 46900
},
{
"epoch": 7.866957104557641,
"grad_norm": 0.04952532425522804,
"learning_rate": 4.269662921348315e-06,
"loss": 0.0004,
"step": 46950
},
{
"epoch": 7.875335120643432,
"grad_norm": 0.04123789444565773,
"learning_rate": 4.252892839174913e-06,
"loss": 0.0004,
"step": 47000
},
{
"epoch": 7.883713136729223,
"grad_norm": 0.0161293838173151,
"learning_rate": 4.23612275700151e-06,
"loss": 0.0003,
"step": 47050
},
{
"epoch": 7.892091152815014,
"grad_norm": 0.039569880813360214,
"learning_rate": 4.2193526748281075e-06,
"loss": 0.0004,
"step": 47100
},
{
"epoch": 7.900469168900805,
"grad_norm": 0.209671750664711,
"learning_rate": 4.2025825926547045e-06,
"loss": 0.0004,
"step": 47150
},
{
"epoch": 7.908847184986596,
"grad_norm": 0.049620840698480606,
"learning_rate": 4.1858125104813016e-06,
"loss": 0.0005,
"step": 47200
},
{
"epoch": 7.917225201072386,
"grad_norm": 0.03689347580075264,
"learning_rate": 4.1690424283078986e-06,
"loss": 0.0004,
"step": 47250
},
{
"epoch": 7.9256032171581765,
"grad_norm": 0.05554811283946037,
"learning_rate": 4.1522723461344964e-06,
"loss": 0.0004,
"step": 47300
},
{
"epoch": 7.933981233243967,
"grad_norm": 0.040197305381298065,
"learning_rate": 4.1355022639610934e-06,
"loss": 0.0003,
"step": 47350
},
{
"epoch": 7.942359249329758,
"grad_norm": 0.01716030202805996,
"learning_rate": 4.118732181787691e-06,
"loss": 0.0003,
"step": 47400
},
{
"epoch": 7.950737265415549,
"grad_norm": 0.0522179938852787,
"learning_rate": 4.101962099614288e-06,
"loss": 0.0003,
"step": 47450
},
{
"epoch": 7.95911528150134,
"grad_norm": 0.3440731167793274,
"learning_rate": 4.085192017440886e-06,
"loss": 0.0004,
"step": 47500
},
{
"epoch": 7.967493297587131,
"grad_norm": 0.03338254243135452,
"learning_rate": 4.068421935267483e-06,
"loss": 0.0004,
"step": 47550
},
{
"epoch": 7.975871313672922,
"grad_norm": 0.061764348298311234,
"learning_rate": 4.051651853094081e-06,
"loss": 0.0004,
"step": 47600
},
{
"epoch": 7.984249329758713,
"grad_norm": 0.046575699001550674,
"learning_rate": 4.034881770920678e-06,
"loss": 0.0004,
"step": 47650
},
{
"epoch": 7.992627345844504,
"grad_norm": 0.062236297875642776,
"learning_rate": 4.018111688747275e-06,
"loss": 0.0004,
"step": 47700
},
{
"epoch": 8.001005361930295,
"grad_norm": 0.028931235894560814,
"learning_rate": 4.001341606573872e-06,
"loss": 0.0003,
"step": 47750
},
{
"epoch": 8.009383378016086,
"grad_norm": 0.01811792142689228,
"learning_rate": 3.98457152440047e-06,
"loss": 0.0002,
"step": 47800
},
{
"epoch": 8.017761394101877,
"grad_norm": 0.01822470873594284,
"learning_rate": 3.967801442227067e-06,
"loss": 0.0002,
"step": 47850
},
{
"epoch": 8.026139410187668,
"grad_norm": 0.04903008043766022,
"learning_rate": 3.951031360053665e-06,
"loss": 0.0002,
"step": 47900
},
{
"epoch": 8.034517426273458,
"grad_norm": 0.022891085594892502,
"learning_rate": 3.934261277880262e-06,
"loss": 0.0002,
"step": 47950
},
{
"epoch": 8.04289544235925,
"grad_norm": 0.015373194590210915,
"learning_rate": 3.91749119570686e-06,
"loss": 0.0002,
"step": 48000
},
{
"epoch": 8.05127345844504,
"grad_norm": 0.01921216771006584,
"learning_rate": 3.900721113533457e-06,
"loss": 0.0002,
"step": 48050
},
{
"epoch": 8.059651474530831,
"grad_norm": 0.01771024614572525,
"learning_rate": 3.8839510313600546e-06,
"loss": 0.0002,
"step": 48100
},
{
"epoch": 8.068029490616622,
"grad_norm": 0.012290588580071926,
"learning_rate": 3.867180949186652e-06,
"loss": 0.0002,
"step": 48150
},
{
"epoch": 8.076407506702413,
"grad_norm": 0.013452921062707901,
"learning_rate": 3.850410867013249e-06,
"loss": 0.0002,
"step": 48200
},
{
"epoch": 8.084785522788204,
"grad_norm": 0.024924032390117645,
"learning_rate": 3.833640784839846e-06,
"loss": 0.0002,
"step": 48250
},
{
"epoch": 8.093163538873995,
"grad_norm": 0.010622446425259113,
"learning_rate": 3.8168707026664435e-06,
"loss": 0.0002,
"step": 48300
},
{
"epoch": 8.101541554959786,
"grad_norm": 0.02651936188340187,
"learning_rate": 3.800100620493041e-06,
"loss": 0.0003,
"step": 48350
},
{
"epoch": 8.109919571045577,
"grad_norm": 0.011863762512803078,
"learning_rate": 3.783330538319638e-06,
"loss": 0.0002,
"step": 48400
},
{
"epoch": 8.118297587131368,
"grad_norm": 0.02366674318909645,
"learning_rate": 3.7665604561462354e-06,
"loss": 0.0002,
"step": 48450
},
{
"epoch": 8.126675603217159,
"grad_norm": 0.026062361896038055,
"learning_rate": 3.7497903739728324e-06,
"loss": 0.0002,
"step": 48500
},
{
"epoch": 8.13505361930295,
"grad_norm": 0.016329048201441765,
"learning_rate": 3.7330202917994302e-06,
"loss": 0.0002,
"step": 48550
},
{
"epoch": 8.14343163538874,
"grad_norm": 0.09319298714399338,
"learning_rate": 3.7162502096260272e-06,
"loss": 0.0002,
"step": 48600
},
{
"epoch": 8.151809651474531,
"grad_norm": 0.014463214203715324,
"learning_rate": 3.6994801274526247e-06,
"loss": 0.0002,
"step": 48650
},
{
"epoch": 8.160187667560322,
"grad_norm": 0.030192028731107712,
"learning_rate": 3.682710045279222e-06,
"loss": 0.0004,
"step": 48700
},
{
"epoch": 8.168565683646113,
"grad_norm": 0.014410781674087048,
"learning_rate": 3.6659399631058195e-06,
"loss": 0.0002,
"step": 48750
},
{
"epoch": 8.176943699731904,
"grad_norm": 0.028254050761461258,
"learning_rate": 3.6491698809324166e-06,
"loss": 0.0002,
"step": 48800
},
{
"epoch": 8.185321715817695,
"grad_norm": 0.022153843194246292,
"learning_rate": 3.6323997987590144e-06,
"loss": 0.0002,
"step": 48850
},
{
"epoch": 8.193699731903486,
"grad_norm": 0.01866259053349495,
"learning_rate": 3.6156297165856114e-06,
"loss": 0.0002,
"step": 48900
},
{
"epoch": 8.202077747989277,
"grad_norm": 0.014017355628311634,
"learning_rate": 3.598859634412209e-06,
"loss": 0.0002,
"step": 48950
},
{
"epoch": 8.210455764075068,
"grad_norm": 0.017173465341329575,
"learning_rate": 3.582089552238806e-06,
"loss": 0.0002,
"step": 49000
},
{
"epoch": 8.218833780160859,
"grad_norm": 0.20775650441646576,
"learning_rate": 3.5653194700654037e-06,
"loss": 0.0002,
"step": 49050
},
{
"epoch": 8.22721179624665,
"grad_norm": 0.014167393557727337,
"learning_rate": 3.5485493878920008e-06,
"loss": 0.0002,
"step": 49100
},
{
"epoch": 8.23558981233244,
"grad_norm": 0.02049107290804386,
"learning_rate": 3.531779305718598e-06,
"loss": 0.0002,
"step": 49150
},
{
"epoch": 8.243967828418231,
"grad_norm": 0.026173189282417297,
"learning_rate": 3.5150092235451956e-06,
"loss": 0.0002,
"step": 49200
},
{
"epoch": 8.25234584450402,
"grad_norm": 0.02050282247364521,
"learning_rate": 3.498239141371793e-06,
"loss": 0.0002,
"step": 49250
},
{
"epoch": 8.260723860589813,
"grad_norm": 0.015291319228708744,
"learning_rate": 3.48146905919839e-06,
"loss": 0.0002,
"step": 49300
},
{
"epoch": 8.269101876675602,
"grad_norm": 0.026897624135017395,
"learning_rate": 3.464698977024988e-06,
"loss": 0.0002,
"step": 49350
},
{
"epoch": 8.277479892761393,
"grad_norm": 0.016716543585062027,
"learning_rate": 3.447928894851585e-06,
"loss": 0.0002,
"step": 49400
},
{
"epoch": 8.285857908847184,
"grad_norm": 0.024870146065950394,
"learning_rate": 3.4311588126781824e-06,
"loss": 0.0002,
"step": 49450
},
{
"epoch": 8.294235924932975,
"grad_norm": 0.017461460083723068,
"learning_rate": 3.4143887305047794e-06,
"loss": 0.0002,
"step": 49500
},
{
"epoch": 8.302613941018766,
"grad_norm": 0.014370834454894066,
"learning_rate": 3.3976186483313773e-06,
"loss": 0.0002,
"step": 49550
},
{
"epoch": 8.310991957104557,
"grad_norm": 0.026379108428955078,
"learning_rate": 3.3808485661579743e-06,
"loss": 0.0003,
"step": 49600
},
{
"epoch": 8.319369973190348,
"grad_norm": 0.012285185977816582,
"learning_rate": 3.364078483984572e-06,
"loss": 0.0002,
"step": 49650
},
{
"epoch": 8.327747989276139,
"grad_norm": 0.02005821093916893,
"learning_rate": 3.347308401811169e-06,
"loss": 0.0002,
"step": 49700
},
{
"epoch": 8.33612600536193,
"grad_norm": 0.016585618257522583,
"learning_rate": 3.3305383196377666e-06,
"loss": 0.0003,
"step": 49750
},
{
"epoch": 8.34450402144772,
"grad_norm": 0.023517385125160217,
"learning_rate": 3.3137682374643636e-06,
"loss": 0.0002,
"step": 49800
},
{
"epoch": 8.352882037533512,
"grad_norm": 0.013065959326922894,
"learning_rate": 3.2969981552909615e-06,
"loss": 0.0002,
"step": 49850
},
{
"epoch": 8.361260053619302,
"grad_norm": 0.028112584725022316,
"learning_rate": 3.2802280731175585e-06,
"loss": 0.0002,
"step": 49900
},
{
"epoch": 8.369638069705093,
"grad_norm": 0.011142577044665813,
"learning_rate": 3.263457990944156e-06,
"loss": 0.0002,
"step": 49950
},
{
"epoch": 8.378016085790884,
"grad_norm": 0.02244596742093563,
"learning_rate": 3.246687908770753e-06,
"loss": 0.0002,
"step": 50000
},
{
"epoch": 8.378016085790884,
"eval_loss": 0.050992656499147415,
"eval_runtime": 0.3133,
"eval_samples_per_second": 63.832,
"eval_steps_per_second": 3.192,
"step": 50000
}
],
"logging_steps": 50,
"max_steps": 59680,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0705940805709087e+20,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}