{ "best_metric": 0.03919154778122902, "best_model_checkpoint": null, "epoch": 8.378016085790884, "eval_steps": 10000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008378016085790885, "grad_norm": 0.1641591489315033, "learning_rate": 2e-05, "loss": 0.2693, "step": 50 }, { "epoch": 0.01675603217158177, "grad_norm": 0.12203536182641983, "learning_rate": 1.99832299178266e-05, "loss": 0.1092, "step": 100 }, { "epoch": 0.025134048257372654, "grad_norm": 0.1460294872522354, "learning_rate": 1.9966459835653196e-05, "loss": 0.0971, "step": 150 }, { "epoch": 0.03351206434316354, "grad_norm": 0.18384236097335815, "learning_rate": 1.9949689753479794e-05, "loss": 0.0891, "step": 200 }, { "epoch": 0.041890080428954424, "grad_norm": 0.14532588422298431, "learning_rate": 1.993291967130639e-05, "loss": 0.0825, "step": 250 }, { "epoch": 0.05026809651474531, "grad_norm": 0.1657487004995346, "learning_rate": 1.991614958913299e-05, "loss": 0.085, "step": 300 }, { "epoch": 0.05864611260053619, "grad_norm": 0.16286590695381165, "learning_rate": 1.9899379506959587e-05, "loss": 0.074, "step": 350 }, { "epoch": 0.06702412868632708, "grad_norm": 0.17318418622016907, "learning_rate": 1.9882609424786182e-05, "loss": 0.0723, "step": 400 }, { "epoch": 0.07540214477211796, "grad_norm": 0.20146086812019348, "learning_rate": 1.986583934261278e-05, "loss": 0.0726, "step": 450 }, { "epoch": 0.08378016085790885, "grad_norm": 0.25428423285484314, "learning_rate": 1.9849069260439376e-05, "loss": 0.0745, "step": 500 }, { "epoch": 0.09215817694369974, "grad_norm": 0.25297069549560547, "learning_rate": 1.9832299178265975e-05, "loss": 0.0686, "step": 550 }, { "epoch": 0.10053619302949061, "grad_norm": 0.24197077751159668, "learning_rate": 1.981552909609257e-05, "loss": 0.0689, "step": 600 }, { "epoch": 0.1089142091152815, "grad_norm": 0.2025458812713623, "learning_rate": 1.979875901391917e-05, "loss": 0.064, "step": 650 }, { "epoch": 0.11729222520107238, "grad_norm": 0.2370821088552475, "learning_rate": 1.9781988931745768e-05, "loss": 0.0661, "step": 700 }, { "epoch": 0.12567024128686327, "grad_norm": 0.23466931283473969, "learning_rate": 1.9765218849572367e-05, "loss": 0.0628, "step": 750 }, { "epoch": 0.13404825737265416, "grad_norm": 0.27670082449913025, "learning_rate": 1.9748448767398962e-05, "loss": 0.0608, "step": 800 }, { "epoch": 0.14242627345844505, "grad_norm": 0.25532266497612, "learning_rate": 1.973167868522556e-05, "loss": 0.0578, "step": 850 }, { "epoch": 0.15080428954423591, "grad_norm": 0.2733491063117981, "learning_rate": 1.9714908603052156e-05, "loss": 0.0586, "step": 900 }, { "epoch": 0.1591823056300268, "grad_norm": 0.31975990533828735, "learning_rate": 1.9698138520878755e-05, "loss": 0.063, "step": 950 }, { "epoch": 0.1675603217158177, "grad_norm": 0.2980721592903137, "learning_rate": 1.968136843870535e-05, "loss": 0.0615, "step": 1000 }, { "epoch": 0.17593833780160859, "grad_norm": 0.2662040591239929, "learning_rate": 1.966459835653195e-05, "loss": 0.0573, "step": 1050 }, { "epoch": 0.18431635388739948, "grad_norm": 0.24934068322181702, "learning_rate": 1.9647828274358547e-05, "loss": 0.0578, "step": 1100 }, { "epoch": 0.19269436997319034, "grad_norm": 0.35513001680374146, "learning_rate": 1.9631058192185143e-05, "loss": 0.0567, "step": 1150 }, { "epoch": 0.20107238605898123, "grad_norm": 0.2941363453865051, "learning_rate": 1.961428811001174e-05, "loss": 0.0613, "step": 1200 }, { "epoch": 0.20945040214477212, "grad_norm": 0.2334873378276825, "learning_rate": 1.9597518027838337e-05, "loss": 0.0564, "step": 1250 }, { "epoch": 0.217828418230563, "grad_norm": 0.3162507116794586, "learning_rate": 1.9580747945664935e-05, "loss": 0.0546, "step": 1300 }, { "epoch": 0.2262064343163539, "grad_norm": 0.3287353217601776, "learning_rate": 1.956397786349153e-05, "loss": 0.0561, "step": 1350 }, { "epoch": 0.23458445040214476, "grad_norm": 0.34116727113723755, "learning_rate": 1.954720778131813e-05, "loss": 0.0541, "step": 1400 }, { "epoch": 0.24296246648793565, "grad_norm": 0.2549584209918976, "learning_rate": 1.9530437699144725e-05, "loss": 0.0512, "step": 1450 }, { "epoch": 0.25134048257372654, "grad_norm": 0.2564306855201721, "learning_rate": 1.9513667616971327e-05, "loss": 0.053, "step": 1500 }, { "epoch": 0.2597184986595174, "grad_norm": 0.39897748827934265, "learning_rate": 1.9496897534797922e-05, "loss": 0.0499, "step": 1550 }, { "epoch": 0.2680965147453083, "grad_norm": 0.3399379849433899, "learning_rate": 1.948012745262452e-05, "loss": 0.0527, "step": 1600 }, { "epoch": 0.2764745308310992, "grad_norm": 0.3706755042076111, "learning_rate": 1.9463357370451116e-05, "loss": 0.0522, "step": 1650 }, { "epoch": 0.2848525469168901, "grad_norm": 0.3208563029766083, "learning_rate": 1.9446587288277715e-05, "loss": 0.0502, "step": 1700 }, { "epoch": 0.29323056300268097, "grad_norm": 0.30643364787101746, "learning_rate": 1.942981720610431e-05, "loss": 0.0517, "step": 1750 }, { "epoch": 0.30160857908847183, "grad_norm": 0.28462880849838257, "learning_rate": 1.941304712393091e-05, "loss": 0.0483, "step": 1800 }, { "epoch": 0.30998659517426275, "grad_norm": 0.46007809042930603, "learning_rate": 1.9396277041757504e-05, "loss": 0.0534, "step": 1850 }, { "epoch": 0.3183646112600536, "grad_norm": 0.27532362937927246, "learning_rate": 1.9379506959584103e-05, "loss": 0.049, "step": 1900 }, { "epoch": 0.3267426273458445, "grad_norm": 0.3934316337108612, "learning_rate": 1.93627368774107e-05, "loss": 0.0504, "step": 1950 }, { "epoch": 0.3351206434316354, "grad_norm": 0.38043123483657837, "learning_rate": 1.93459667952373e-05, "loss": 0.0474, "step": 2000 }, { "epoch": 0.34349865951742625, "grad_norm": 0.33170923590660095, "learning_rate": 1.9329196713063896e-05, "loss": 0.0474, "step": 2050 }, { "epoch": 0.35187667560321717, "grad_norm": 0.34464696049690247, "learning_rate": 1.9312426630890494e-05, "loss": 0.0494, "step": 2100 }, { "epoch": 0.36025469168900803, "grad_norm": 0.33678779006004333, "learning_rate": 1.929565654871709e-05, "loss": 0.0465, "step": 2150 }, { "epoch": 0.36863270777479895, "grad_norm": 0.34634217619895935, "learning_rate": 1.927888646654369e-05, "loss": 0.046, "step": 2200 }, { "epoch": 0.3770107238605898, "grad_norm": 0.48531678318977356, "learning_rate": 1.9262116384370284e-05, "loss": 0.0463, "step": 2250 }, { "epoch": 0.3853887399463807, "grad_norm": 0.33334800601005554, "learning_rate": 1.9245346302196882e-05, "loss": 0.047, "step": 2300 }, { "epoch": 0.3937667560321716, "grad_norm": 0.4677096903324127, "learning_rate": 1.922857622002348e-05, "loss": 0.0474, "step": 2350 }, { "epoch": 0.40214477211796246, "grad_norm": 0.35066741704940796, "learning_rate": 1.9211806137850076e-05, "loss": 0.0449, "step": 2400 }, { "epoch": 0.4105227882037534, "grad_norm": 0.3857254087924957, "learning_rate": 1.9195036055676675e-05, "loss": 0.0453, "step": 2450 }, { "epoch": 0.41890080428954424, "grad_norm": 0.36052629351615906, "learning_rate": 1.917826597350327e-05, "loss": 0.0469, "step": 2500 }, { "epoch": 0.4272788203753351, "grad_norm": 0.3650895059108734, "learning_rate": 1.916149589132987e-05, "loss": 0.0483, "step": 2550 }, { "epoch": 0.435656836461126, "grad_norm": 0.34670376777648926, "learning_rate": 1.9144725809156465e-05, "loss": 0.0449, "step": 2600 }, { "epoch": 0.4440348525469169, "grad_norm": 0.36593642830848694, "learning_rate": 1.9127955726983063e-05, "loss": 0.0449, "step": 2650 }, { "epoch": 0.4524128686327078, "grad_norm": 0.31553247570991516, "learning_rate": 1.9111185644809662e-05, "loss": 0.0448, "step": 2700 }, { "epoch": 0.46079088471849866, "grad_norm": 0.30997416377067566, "learning_rate": 1.909441556263626e-05, "loss": 0.0488, "step": 2750 }, { "epoch": 0.4691689008042895, "grad_norm": 0.4204448461532593, "learning_rate": 1.9077645480462856e-05, "loss": 0.0443, "step": 2800 }, { "epoch": 0.47754691689008044, "grad_norm": 0.36868560314178467, "learning_rate": 1.9060875398289455e-05, "loss": 0.0428, "step": 2850 }, { "epoch": 0.4859249329758713, "grad_norm": 0.37285274267196655, "learning_rate": 1.904410531611605e-05, "loss": 0.0439, "step": 2900 }, { "epoch": 0.4943029490616622, "grad_norm": 0.4258297085762024, "learning_rate": 1.902733523394265e-05, "loss": 0.0446, "step": 2950 }, { "epoch": 0.5026809651474531, "grad_norm": 0.34184491634368896, "learning_rate": 1.9010565151769244e-05, "loss": 0.0399, "step": 3000 }, { "epoch": 0.511058981233244, "grad_norm": 0.404744029045105, "learning_rate": 1.8993795069595843e-05, "loss": 0.0434, "step": 3050 }, { "epoch": 0.5194369973190348, "grad_norm": 0.31526079773902893, "learning_rate": 1.8977024987422438e-05, "loss": 0.042, "step": 3100 }, { "epoch": 0.5278150134048257, "grad_norm": 0.39627355337142944, "learning_rate": 1.8960254905249037e-05, "loss": 0.0418, "step": 3150 }, { "epoch": 0.5361930294906166, "grad_norm": 0.39220544695854187, "learning_rate": 1.8943484823075635e-05, "loss": 0.0434, "step": 3200 }, { "epoch": 0.5445710455764075, "grad_norm": 0.4202696979045868, "learning_rate": 1.892671474090223e-05, "loss": 0.0453, "step": 3250 }, { "epoch": 0.5529490616621984, "grad_norm": 0.31564274430274963, "learning_rate": 1.890994465872883e-05, "loss": 0.0423, "step": 3300 }, { "epoch": 0.5613270777479893, "grad_norm": 0.43861642479896545, "learning_rate": 1.8893174576555425e-05, "loss": 0.0432, "step": 3350 }, { "epoch": 0.5697050938337802, "grad_norm": 0.41774672269821167, "learning_rate": 1.8876404494382024e-05, "loss": 0.0424, "step": 3400 }, { "epoch": 0.578083109919571, "grad_norm": 0.44408470392227173, "learning_rate": 1.8859634412208622e-05, "loss": 0.0393, "step": 3450 }, { "epoch": 0.5864611260053619, "grad_norm": 0.5111362338066101, "learning_rate": 1.8842864330035218e-05, "loss": 0.0402, "step": 3500 }, { "epoch": 0.5948391420911529, "grad_norm": 0.48010021448135376, "learning_rate": 1.8826094247861816e-05, "loss": 0.0403, "step": 3550 }, { "epoch": 0.6032171581769437, "grad_norm": 0.41536250710487366, "learning_rate": 1.8809324165688415e-05, "loss": 0.0441, "step": 3600 }, { "epoch": 0.6115951742627346, "grad_norm": 0.40686219930648804, "learning_rate": 1.879255408351501e-05, "loss": 0.0408, "step": 3650 }, { "epoch": 0.6199731903485255, "grad_norm": 0.4435434937477112, "learning_rate": 1.877578400134161e-05, "loss": 0.0434, "step": 3700 }, { "epoch": 0.6283512064343163, "grad_norm": 0.4401046633720398, "learning_rate": 1.8759013919168204e-05, "loss": 0.042, "step": 3750 }, { "epoch": 0.6367292225201072, "grad_norm": 0.40911954641342163, "learning_rate": 1.8742243836994803e-05, "loss": 0.0413, "step": 3800 }, { "epoch": 0.6451072386058981, "grad_norm": 0.490383118391037, "learning_rate": 1.87254737548214e-05, "loss": 0.0423, "step": 3850 }, { "epoch": 0.653485254691689, "grad_norm": 0.4375227093696594, "learning_rate": 1.8708703672647997e-05, "loss": 0.0386, "step": 3900 }, { "epoch": 0.6618632707774799, "grad_norm": 0.336227685213089, "learning_rate": 1.8691933590474596e-05, "loss": 0.042, "step": 3950 }, { "epoch": 0.6702412868632708, "grad_norm": 0.5190924406051636, "learning_rate": 1.8675163508301194e-05, "loss": 0.0405, "step": 4000 }, { "epoch": 0.6786193029490617, "grad_norm": 0.3751809298992157, "learning_rate": 1.865839342612779e-05, "loss": 0.0383, "step": 4050 }, { "epoch": 0.6869973190348525, "grad_norm": 0.34148427844047546, "learning_rate": 1.864162334395439e-05, "loss": 0.0388, "step": 4100 }, { "epoch": 0.6953753351206434, "grad_norm": 0.5154247879981995, "learning_rate": 1.8624853261780984e-05, "loss": 0.0392, "step": 4150 }, { "epoch": 0.7037533512064343, "grad_norm": 0.3212796151638031, "learning_rate": 1.8608083179607583e-05, "loss": 0.0397, "step": 4200 }, { "epoch": 0.7121313672922251, "grad_norm": 0.3693840503692627, "learning_rate": 1.8591313097434178e-05, "loss": 0.0399, "step": 4250 }, { "epoch": 0.7205093833780161, "grad_norm": 0.384682297706604, "learning_rate": 1.8574543015260777e-05, "loss": 0.0394, "step": 4300 }, { "epoch": 0.728887399463807, "grad_norm": 0.5106825828552246, "learning_rate": 1.8557772933087375e-05, "loss": 0.0376, "step": 4350 }, { "epoch": 0.7372654155495979, "grad_norm": 0.5798951983451843, "learning_rate": 1.854100285091397e-05, "loss": 0.0384, "step": 4400 }, { "epoch": 0.7456434316353887, "grad_norm": 0.4215037226676941, "learning_rate": 1.852423276874057e-05, "loss": 0.0371, "step": 4450 }, { "epoch": 0.7540214477211796, "grad_norm": 0.41392162442207336, "learning_rate": 1.8507462686567165e-05, "loss": 0.0395, "step": 4500 }, { "epoch": 0.7623994638069705, "grad_norm": 0.38111090660095215, "learning_rate": 1.8490692604393763e-05, "loss": 0.0397, "step": 4550 }, { "epoch": 0.7707774798927614, "grad_norm": 0.5323607325553894, "learning_rate": 1.847392252222036e-05, "loss": 0.0389, "step": 4600 }, { "epoch": 0.7791554959785523, "grad_norm": 0.3730742335319519, "learning_rate": 1.8457152440046957e-05, "loss": 0.037, "step": 4650 }, { "epoch": 0.7875335120643432, "grad_norm": 0.5167490243911743, "learning_rate": 1.8440382357873553e-05, "loss": 0.0398, "step": 4700 }, { "epoch": 0.795911528150134, "grad_norm": 0.3720487356185913, "learning_rate": 1.8423612275700155e-05, "loss": 0.0353, "step": 4750 }, { "epoch": 0.8042895442359249, "grad_norm": 0.49233752489089966, "learning_rate": 1.840684219352675e-05, "loss": 0.0375, "step": 4800 }, { "epoch": 0.8126675603217158, "grad_norm": 0.35151785612106323, "learning_rate": 1.839007211135335e-05, "loss": 0.0367, "step": 4850 }, { "epoch": 0.8210455764075067, "grad_norm": 0.4015248119831085, "learning_rate": 1.8373302029179944e-05, "loss": 0.0402, "step": 4900 }, { "epoch": 0.8294235924932976, "grad_norm": 0.3868032395839691, "learning_rate": 1.8356531947006543e-05, "loss": 0.0364, "step": 4950 }, { "epoch": 0.8378016085790885, "grad_norm": 0.3618241250514984, "learning_rate": 1.8339761864833138e-05, "loss": 0.0365, "step": 5000 }, { "epoch": 0.8461796246648794, "grad_norm": 0.4246107041835785, "learning_rate": 1.8322991782659737e-05, "loss": 0.0383, "step": 5050 }, { "epoch": 0.8545576407506702, "grad_norm": 0.4502660632133484, "learning_rate": 1.8306221700486332e-05, "loss": 0.0384, "step": 5100 }, { "epoch": 0.8629356568364611, "grad_norm": 0.38021931052207947, "learning_rate": 1.828945161831293e-05, "loss": 0.036, "step": 5150 }, { "epoch": 0.871313672922252, "grad_norm": 0.49084368348121643, "learning_rate": 1.827268153613953e-05, "loss": 0.0403, "step": 5200 }, { "epoch": 0.8796916890080428, "grad_norm": 0.4013173282146454, "learning_rate": 1.8255911453966125e-05, "loss": 0.038, "step": 5250 }, { "epoch": 0.8880697050938338, "grad_norm": 0.4591931700706482, "learning_rate": 1.8239141371792724e-05, "loss": 0.0345, "step": 5300 }, { "epoch": 0.8964477211796247, "grad_norm": 0.3261602520942688, "learning_rate": 1.822237128961932e-05, "loss": 0.0371, "step": 5350 }, { "epoch": 0.9048257372654156, "grad_norm": 0.5109397172927856, "learning_rate": 1.8205601207445918e-05, "loss": 0.0352, "step": 5400 }, { "epoch": 0.9132037533512064, "grad_norm": 0.4951651990413666, "learning_rate": 1.8188831125272516e-05, "loss": 0.0384, "step": 5450 }, { "epoch": 0.9215817694369973, "grad_norm": 0.2933291494846344, "learning_rate": 1.817206104309911e-05, "loss": 0.0367, "step": 5500 }, { "epoch": 0.9299597855227882, "grad_norm": 0.518692135810852, "learning_rate": 1.815529096092571e-05, "loss": 0.0362, "step": 5550 }, { "epoch": 0.938337801608579, "grad_norm": 0.4825911223888397, "learning_rate": 1.813852087875231e-05, "loss": 0.0363, "step": 5600 }, { "epoch": 0.94671581769437, "grad_norm": 0.45922228693962097, "learning_rate": 1.8121750796578904e-05, "loss": 0.0381, "step": 5650 }, { "epoch": 0.9550938337801609, "grad_norm": 0.3230240046977997, "learning_rate": 1.8104980714405503e-05, "loss": 0.0342, "step": 5700 }, { "epoch": 0.9634718498659517, "grad_norm": 0.3606482744216919, "learning_rate": 1.80882106322321e-05, "loss": 0.0355, "step": 5750 }, { "epoch": 0.9718498659517426, "grad_norm": 0.4341330826282501, "learning_rate": 1.8071440550058697e-05, "loss": 0.037, "step": 5800 }, { "epoch": 0.9802278820375335, "grad_norm": 0.42356178164482117, "learning_rate": 1.8054670467885292e-05, "loss": 0.0353, "step": 5850 }, { "epoch": 0.9886058981233244, "grad_norm": 0.39021754264831543, "learning_rate": 1.803790038571189e-05, "loss": 0.0352, "step": 5900 }, { "epoch": 0.9969839142091153, "grad_norm": 0.29827752709388733, "learning_rate": 1.802113030353849e-05, "loss": 0.0355, "step": 5950 }, { "epoch": 1.0053619302949062, "grad_norm": 0.38858547806739807, "learning_rate": 1.800436022136509e-05, "loss": 0.0278, "step": 6000 }, { "epoch": 1.013739946380697, "grad_norm": 0.2972586452960968, "learning_rate": 1.7987590139191684e-05, "loss": 0.0236, "step": 6050 }, { "epoch": 1.022117962466488, "grad_norm": 0.36482104659080505, "learning_rate": 1.7970820057018283e-05, "loss": 0.0251, "step": 6100 }, { "epoch": 1.030495978552279, "grad_norm": 0.37719279527664185, "learning_rate": 1.7954049974844878e-05, "loss": 0.023, "step": 6150 }, { "epoch": 1.0388739946380696, "grad_norm": 0.4385906457901001, "learning_rate": 1.7937279892671477e-05, "loss": 0.0248, "step": 6200 }, { "epoch": 1.0472520107238605, "grad_norm": 0.508695662021637, "learning_rate": 1.7920509810498072e-05, "loss": 0.0254, "step": 6250 }, { "epoch": 1.0556300268096515, "grad_norm": 0.36647507548332214, "learning_rate": 1.790373972832467e-05, "loss": 0.0254, "step": 6300 }, { "epoch": 1.0640080428954424, "grad_norm": 0.4308232069015503, "learning_rate": 1.7886969646151266e-05, "loss": 0.0226, "step": 6350 }, { "epoch": 1.0723860589812333, "grad_norm": 0.3477235734462738, "learning_rate": 1.7870199563977865e-05, "loss": 0.0235, "step": 6400 }, { "epoch": 1.0807640750670242, "grad_norm": 0.45611080527305603, "learning_rate": 1.7853429481804463e-05, "loss": 0.0255, "step": 6450 }, { "epoch": 1.089142091152815, "grad_norm": 0.41645970940589905, "learning_rate": 1.783665939963106e-05, "loss": 0.0236, "step": 6500 }, { "epoch": 1.0975201072386058, "grad_norm": 0.5107206702232361, "learning_rate": 1.7819889317457657e-05, "loss": 0.0243, "step": 6550 }, { "epoch": 1.1058981233243967, "grad_norm": 0.37085390090942383, "learning_rate": 1.7803119235284253e-05, "loss": 0.0231, "step": 6600 }, { "epoch": 1.1142761394101877, "grad_norm": 0.3882488012313843, "learning_rate": 1.778634915311085e-05, "loss": 0.0226, "step": 6650 }, { "epoch": 1.1226541554959786, "grad_norm": 0.3104082942008972, "learning_rate": 1.7769579070937447e-05, "loss": 0.0243, "step": 6700 }, { "epoch": 1.1310321715817695, "grad_norm": 0.500109076499939, "learning_rate": 1.7752808988764045e-05, "loss": 0.0245, "step": 6750 }, { "epoch": 1.1394101876675604, "grad_norm": 0.6070294380187988, "learning_rate": 1.7736038906590644e-05, "loss": 0.024, "step": 6800 }, { "epoch": 1.147788203753351, "grad_norm": 0.4429844915866852, "learning_rate": 1.7719268824417243e-05, "loss": 0.024, "step": 6850 }, { "epoch": 1.156166219839142, "grad_norm": 0.532455563545227, "learning_rate": 1.7702498742243838e-05, "loss": 0.0231, "step": 6900 }, { "epoch": 1.164544235924933, "grad_norm": 0.4723723828792572, "learning_rate": 1.7685728660070437e-05, "loss": 0.0227, "step": 6950 }, { "epoch": 1.1729222520107239, "grad_norm": 0.40511298179626465, "learning_rate": 1.7668958577897032e-05, "loss": 0.0228, "step": 7000 }, { "epoch": 1.1813002680965148, "grad_norm": 0.4623141884803772, "learning_rate": 1.765218849572363e-05, "loss": 0.0213, "step": 7050 }, { "epoch": 1.1896782841823057, "grad_norm": 0.5076983571052551, "learning_rate": 1.7635418413550226e-05, "loss": 0.0247, "step": 7100 }, { "epoch": 1.1980563002680964, "grad_norm": 0.438363641500473, "learning_rate": 1.7618648331376825e-05, "loss": 0.0247, "step": 7150 }, { "epoch": 1.2064343163538873, "grad_norm": 0.4427433907985687, "learning_rate": 1.7601878249203424e-05, "loss": 0.0266, "step": 7200 }, { "epoch": 1.2148123324396782, "grad_norm": 0.4235341548919678, "learning_rate": 1.758510816703002e-05, "loss": 0.0249, "step": 7250 }, { "epoch": 1.2231903485254692, "grad_norm": 0.3872547149658203, "learning_rate": 1.7568338084856618e-05, "loss": 0.0241, "step": 7300 }, { "epoch": 1.23156836461126, "grad_norm": 0.4646087884902954, "learning_rate": 1.7551568002683216e-05, "loss": 0.0239, "step": 7350 }, { "epoch": 1.239946380697051, "grad_norm": 0.3509279489517212, "learning_rate": 1.753479792050981e-05, "loss": 0.0247, "step": 7400 }, { "epoch": 1.248324396782842, "grad_norm": 0.5066854357719421, "learning_rate": 1.751802783833641e-05, "loss": 0.0223, "step": 7450 }, { "epoch": 1.2567024128686328, "grad_norm": 0.5363894104957581, "learning_rate": 1.7501257756163006e-05, "loss": 0.0231, "step": 7500 }, { "epoch": 1.2650804289544235, "grad_norm": 0.6059328317642212, "learning_rate": 1.7484487673989604e-05, "loss": 0.0247, "step": 7550 }, { "epoch": 1.2734584450402144, "grad_norm": 0.458574503660202, "learning_rate": 1.7467717591816203e-05, "loss": 0.0252, "step": 7600 }, { "epoch": 1.2818364611260054, "grad_norm": 0.4931676983833313, "learning_rate": 1.74509475096428e-05, "loss": 0.0242, "step": 7650 }, { "epoch": 1.2902144772117963, "grad_norm": 0.4417158365249634, "learning_rate": 1.7434177427469397e-05, "loss": 0.0245, "step": 7700 }, { "epoch": 1.2985924932975872, "grad_norm": 0.3258965015411377, "learning_rate": 1.7417407345295992e-05, "loss": 0.0254, "step": 7750 }, { "epoch": 1.3069705093833779, "grad_norm": 0.4492965638637543, "learning_rate": 1.740063726312259e-05, "loss": 0.0247, "step": 7800 }, { "epoch": 1.3153485254691688, "grad_norm": 0.4415794312953949, "learning_rate": 1.7383867180949186e-05, "loss": 0.0243, "step": 7850 }, { "epoch": 1.3237265415549597, "grad_norm": 0.5353983044624329, "learning_rate": 1.7367097098775785e-05, "loss": 0.0221, "step": 7900 }, { "epoch": 1.3321045576407506, "grad_norm": 0.5296221375465393, "learning_rate": 1.735032701660238e-05, "loss": 0.0241, "step": 7950 }, { "epoch": 1.3404825737265416, "grad_norm": 0.5558563470840454, "learning_rate": 1.7333556934428983e-05, "loss": 0.0278, "step": 8000 }, { "epoch": 1.3488605898123325, "grad_norm": 0.5353667736053467, "learning_rate": 1.7316786852255578e-05, "loss": 0.0221, "step": 8050 }, { "epoch": 1.3572386058981234, "grad_norm": 0.47603583335876465, "learning_rate": 1.7300016770082177e-05, "loss": 0.0241, "step": 8100 }, { "epoch": 1.3656166219839143, "grad_norm": 0.5160461068153381, "learning_rate": 1.7283246687908772e-05, "loss": 0.0237, "step": 8150 }, { "epoch": 1.3739946380697052, "grad_norm": 0.5242166519165039, "learning_rate": 1.726647660573537e-05, "loss": 0.0241, "step": 8200 }, { "epoch": 1.382372654155496, "grad_norm": 0.4098646342754364, "learning_rate": 1.7249706523561966e-05, "loss": 0.0237, "step": 8250 }, { "epoch": 1.3907506702412868, "grad_norm": 0.488899290561676, "learning_rate": 1.7232936441388565e-05, "loss": 0.024, "step": 8300 }, { "epoch": 1.3991286863270778, "grad_norm": 0.4955669641494751, "learning_rate": 1.721616635921516e-05, "loss": 0.0233, "step": 8350 }, { "epoch": 1.4075067024128687, "grad_norm": 0.4925636053085327, "learning_rate": 1.719939627704176e-05, "loss": 0.0241, "step": 8400 }, { "epoch": 1.4158847184986596, "grad_norm": 0.4332300126552582, "learning_rate": 1.7182626194868357e-05, "loss": 0.023, "step": 8450 }, { "epoch": 1.4242627345844503, "grad_norm": 0.5092645883560181, "learning_rate": 1.7165856112694953e-05, "loss": 0.0247, "step": 8500 }, { "epoch": 1.4326407506702412, "grad_norm": 0.4245849847793579, "learning_rate": 1.714908603052155e-05, "loss": 0.0229, "step": 8550 }, { "epoch": 1.4410187667560321, "grad_norm": 0.5392746329307556, "learning_rate": 1.7132315948348147e-05, "loss": 0.0243, "step": 8600 }, { "epoch": 1.449396782841823, "grad_norm": 0.4076955020427704, "learning_rate": 1.7115545866174745e-05, "loss": 0.023, "step": 8650 }, { "epoch": 1.457774798927614, "grad_norm": 0.39265647530555725, "learning_rate": 1.709877578400134e-05, "loss": 0.023, "step": 8700 }, { "epoch": 1.4661528150134049, "grad_norm": 0.49499258399009705, "learning_rate": 1.708200570182794e-05, "loss": 0.025, "step": 8750 }, { "epoch": 1.4745308310991958, "grad_norm": 0.38666218519210815, "learning_rate": 1.7065235619654538e-05, "loss": 0.0222, "step": 8800 }, { "epoch": 1.4829088471849867, "grad_norm": 0.4817696809768677, "learning_rate": 1.7048465537481137e-05, "loss": 0.0248, "step": 8850 }, { "epoch": 1.4912868632707774, "grad_norm": 0.5351291298866272, "learning_rate": 1.7031695455307732e-05, "loss": 0.0249, "step": 8900 }, { "epoch": 1.4996648793565683, "grad_norm": 0.37309539318084717, "learning_rate": 1.701492537313433e-05, "loss": 0.0236, "step": 8950 }, { "epoch": 1.5080428954423593, "grad_norm": 0.3458901345729828, "learning_rate": 1.6998155290960926e-05, "loss": 0.0234, "step": 9000 }, { "epoch": 1.5164209115281502, "grad_norm": 0.4059881269931793, "learning_rate": 1.6981385208787525e-05, "loss": 0.0241, "step": 9050 }, { "epoch": 1.5247989276139409, "grad_norm": 0.3939747214317322, "learning_rate": 1.696461512661412e-05, "loss": 0.0232, "step": 9100 }, { "epoch": 1.5331769436997318, "grad_norm": 0.43895846605300903, "learning_rate": 1.694784504444072e-05, "loss": 0.0233, "step": 9150 }, { "epoch": 1.5415549597855227, "grad_norm": 0.48546019196510315, "learning_rate": 1.6931074962267314e-05, "loss": 0.0231, "step": 9200 }, { "epoch": 1.5499329758713136, "grad_norm": 0.7542991638183594, "learning_rate": 1.6914304880093916e-05, "loss": 0.0227, "step": 9250 }, { "epoch": 1.5583109919571045, "grad_norm": 0.5190153121948242, "learning_rate": 1.6897534797920512e-05, "loss": 0.0257, "step": 9300 }, { "epoch": 1.5666890080428955, "grad_norm": 0.36860191822052, "learning_rate": 1.688076471574711e-05, "loss": 0.0237, "step": 9350 }, { "epoch": 1.5750670241286864, "grad_norm": 0.4412299394607544, "learning_rate": 1.6863994633573706e-05, "loss": 0.023, "step": 9400 }, { "epoch": 1.5834450402144773, "grad_norm": 0.44900405406951904, "learning_rate": 1.6847224551400304e-05, "loss": 0.0219, "step": 9450 }, { "epoch": 1.5918230563002682, "grad_norm": 0.4734587073326111, "learning_rate": 1.68304544692269e-05, "loss": 0.0244, "step": 9500 }, { "epoch": 1.6002010723860591, "grad_norm": 0.4086250364780426, "learning_rate": 1.68136843870535e-05, "loss": 0.0233, "step": 9550 }, { "epoch": 1.6085790884718498, "grad_norm": 0.33544018864631653, "learning_rate": 1.6796914304880094e-05, "loss": 0.0228, "step": 9600 }, { "epoch": 1.6169571045576407, "grad_norm": 0.5728262066841125, "learning_rate": 1.6780144222706692e-05, "loss": 0.0247, "step": 9650 }, { "epoch": 1.6253351206434317, "grad_norm": 0.40683993697166443, "learning_rate": 1.676337414053329e-05, "loss": 0.0236, "step": 9700 }, { "epoch": 1.6337131367292224, "grad_norm": 0.5062201619148254, "learning_rate": 1.6746604058359887e-05, "loss": 0.0227, "step": 9750 }, { "epoch": 1.6420911528150133, "grad_norm": 0.6048519015312195, "learning_rate": 1.6729833976186485e-05, "loss": 0.024, "step": 9800 }, { "epoch": 1.6504691689008042, "grad_norm": 0.42808210849761963, "learning_rate": 1.671306389401308e-05, "loss": 0.0214, "step": 9850 }, { "epoch": 1.6588471849865951, "grad_norm": 0.452362060546875, "learning_rate": 1.669629381183968e-05, "loss": 0.025, "step": 9900 }, { "epoch": 1.667225201072386, "grad_norm": 0.45486265420913696, "learning_rate": 1.6679523729666275e-05, "loss": 0.0241, "step": 9950 }, { "epoch": 1.675603217158177, "grad_norm": 0.39436477422714233, "learning_rate": 1.6662753647492873e-05, "loss": 0.0238, "step": 10000 }, { "epoch": 1.675603217158177, "eval_loss": 0.042236872017383575, "eval_runtime": 0.3154, "eval_samples_per_second": 63.415, "eval_steps_per_second": 3.171, "step": 10000 }, { "epoch": 1.6839812332439679, "grad_norm": 0.3792615234851837, "learning_rate": 1.6645983565319472e-05, "loss": 0.0236, "step": 10050 }, { "epoch": 1.6923592493297588, "grad_norm": 0.4568031132221222, "learning_rate": 1.662921348314607e-05, "loss": 0.0245, "step": 10100 }, { "epoch": 1.7007372654155497, "grad_norm": 0.30801689624786377, "learning_rate": 1.6612443400972666e-05, "loss": 0.0242, "step": 10150 }, { "epoch": 1.7091152815013406, "grad_norm": 0.404593825340271, "learning_rate": 1.6595673318799265e-05, "loss": 0.0244, "step": 10200 }, { "epoch": 1.7174932975871313, "grad_norm": 0.41899484395980835, "learning_rate": 1.657890323662586e-05, "loss": 0.0237, "step": 10250 }, { "epoch": 1.7258713136729222, "grad_norm": 0.7052549719810486, "learning_rate": 1.656213315445246e-05, "loss": 0.0219, "step": 10300 }, { "epoch": 1.7342493297587132, "grad_norm": 0.6043505072593689, "learning_rate": 1.6545363072279054e-05, "loss": 0.023, "step": 10350 }, { "epoch": 1.742627345844504, "grad_norm": 0.47521620988845825, "learning_rate": 1.6528592990105653e-05, "loss": 0.0213, "step": 10400 }, { "epoch": 1.7510053619302948, "grad_norm": 0.43603143095970154, "learning_rate": 1.651182290793225e-05, "loss": 0.0222, "step": 10450 }, { "epoch": 1.7593833780160857, "grad_norm": 0.5093141794204712, "learning_rate": 1.6495052825758847e-05, "loss": 0.0241, "step": 10500 }, { "epoch": 1.7677613941018766, "grad_norm": 0.4269144535064697, "learning_rate": 1.6478282743585445e-05, "loss": 0.0225, "step": 10550 }, { "epoch": 1.7761394101876675, "grad_norm": 0.3798427879810333, "learning_rate": 1.646151266141204e-05, "loss": 0.0246, "step": 10600 }, { "epoch": 1.7845174262734584, "grad_norm": 0.35155215859413147, "learning_rate": 1.644474257923864e-05, "loss": 0.0223, "step": 10650 }, { "epoch": 1.7928954423592494, "grad_norm": 0.3362865447998047, "learning_rate": 1.6427972497065238e-05, "loss": 0.0222, "step": 10700 }, { "epoch": 1.8012734584450403, "grad_norm": 0.4176539480686188, "learning_rate": 1.6411202414891834e-05, "loss": 0.0221, "step": 10750 }, { "epoch": 1.8096514745308312, "grad_norm": 0.38857483863830566, "learning_rate": 1.6394432332718432e-05, "loss": 0.0231, "step": 10800 }, { "epoch": 1.8180294906166221, "grad_norm": 0.5508946776390076, "learning_rate": 1.637766225054503e-05, "loss": 0.0244, "step": 10850 }, { "epoch": 1.826407506702413, "grad_norm": 0.23041021823883057, "learning_rate": 1.6360892168371626e-05, "loss": 0.0214, "step": 10900 }, { "epoch": 1.8347855227882037, "grad_norm": 0.4554728865623474, "learning_rate": 1.6344122086198225e-05, "loss": 0.0216, "step": 10950 }, { "epoch": 1.8431635388739946, "grad_norm": 0.3926387131214142, "learning_rate": 1.632735200402482e-05, "loss": 0.0231, "step": 11000 }, { "epoch": 1.8515415549597856, "grad_norm": 0.4310173988342285, "learning_rate": 1.631058192185142e-05, "loss": 0.0217, "step": 11050 }, { "epoch": 1.8599195710455763, "grad_norm": 0.5301809310913086, "learning_rate": 1.6293811839678014e-05, "loss": 0.0233, "step": 11100 }, { "epoch": 1.8682975871313672, "grad_norm": 0.5201212167739868, "learning_rate": 1.6277041757504613e-05, "loss": 0.0238, "step": 11150 }, { "epoch": 1.876675603217158, "grad_norm": 0.5420696139335632, "learning_rate": 1.626027167533121e-05, "loss": 0.0228, "step": 11200 }, { "epoch": 1.885053619302949, "grad_norm": 0.449569433927536, "learning_rate": 1.624350159315781e-05, "loss": 0.0229, "step": 11250 }, { "epoch": 1.89343163538874, "grad_norm": 0.41790249943733215, "learning_rate": 1.6226731510984406e-05, "loss": 0.0211, "step": 11300 }, { "epoch": 1.9018096514745308, "grad_norm": 0.49417269229888916, "learning_rate": 1.6209961428811004e-05, "loss": 0.0238, "step": 11350 }, { "epoch": 1.9101876675603218, "grad_norm": 0.7904441952705383, "learning_rate": 1.61931913466376e-05, "loss": 0.0238, "step": 11400 }, { "epoch": 1.9185656836461127, "grad_norm": 0.5102431178092957, "learning_rate": 1.61764212644642e-05, "loss": 0.0234, "step": 11450 }, { "epoch": 1.9269436997319036, "grad_norm": 0.5872859954833984, "learning_rate": 1.6159651182290794e-05, "loss": 0.023, "step": 11500 }, { "epoch": 1.9353217158176945, "grad_norm": 0.4397691488265991, "learning_rate": 1.6142881100117393e-05, "loss": 0.0225, "step": 11550 }, { "epoch": 1.9436997319034852, "grad_norm": 0.5159376263618469, "learning_rate": 1.6126111017943988e-05, "loss": 0.0236, "step": 11600 }, { "epoch": 1.9520777479892761, "grad_norm": 0.5699421763420105, "learning_rate": 1.6109340935770587e-05, "loss": 0.0221, "step": 11650 }, { "epoch": 1.960455764075067, "grad_norm": 0.5751481056213379, "learning_rate": 1.6092570853597185e-05, "loss": 0.0238, "step": 11700 }, { "epoch": 1.9688337801608577, "grad_norm": 0.4952080249786377, "learning_rate": 1.607580077142378e-05, "loss": 0.0245, "step": 11750 }, { "epoch": 1.9772117962466487, "grad_norm": 0.3852183520793915, "learning_rate": 1.605903068925038e-05, "loss": 0.0237, "step": 11800 }, { "epoch": 1.9855898123324396, "grad_norm": 0.5378175973892212, "learning_rate": 1.6042260607076975e-05, "loss": 0.0217, "step": 11850 }, { "epoch": 1.9939678284182305, "grad_norm": 0.48786741495132446, "learning_rate": 1.6025490524903573e-05, "loss": 0.0212, "step": 11900 }, { "epoch": 2.0023458445040214, "grad_norm": 0.2508140206336975, "learning_rate": 1.600872044273017e-05, "loss": 0.0221, "step": 11950 }, { "epoch": 2.0107238605898123, "grad_norm": 0.32956379652023315, "learning_rate": 1.5991950360556767e-05, "loss": 0.0115, "step": 12000 }, { "epoch": 2.0191018766756033, "grad_norm": 0.22912301123142242, "learning_rate": 1.5975180278383366e-05, "loss": 0.0127, "step": 12050 }, { "epoch": 2.027479892761394, "grad_norm": 0.29201629757881165, "learning_rate": 1.5958410196209965e-05, "loss": 0.0103, "step": 12100 }, { "epoch": 2.035857908847185, "grad_norm": 0.3595946431159973, "learning_rate": 1.594164011403656e-05, "loss": 0.0105, "step": 12150 }, { "epoch": 2.044235924932976, "grad_norm": 0.22679433226585388, "learning_rate": 1.592487003186316e-05, "loss": 0.0107, "step": 12200 }, { "epoch": 2.052613941018767, "grad_norm": 0.40025532245635986, "learning_rate": 1.5908099949689754e-05, "loss": 0.0117, "step": 12250 }, { "epoch": 2.060991957104558, "grad_norm": 0.32900357246398926, "learning_rate": 1.5891329867516353e-05, "loss": 0.0115, "step": 12300 }, { "epoch": 2.0693699731903483, "grad_norm": 0.2236577868461609, "learning_rate": 1.5874559785342948e-05, "loss": 0.0123, "step": 12350 }, { "epoch": 2.0777479892761392, "grad_norm": 0.3712753355503082, "learning_rate": 1.5857789703169547e-05, "loss": 0.0114, "step": 12400 }, { "epoch": 2.08612600536193, "grad_norm": 0.4136362373828888, "learning_rate": 1.5841019620996142e-05, "loss": 0.011, "step": 12450 }, { "epoch": 2.094504021447721, "grad_norm": 0.3658868074417114, "learning_rate": 1.582424953882274e-05, "loss": 0.0124, "step": 12500 }, { "epoch": 2.102882037533512, "grad_norm": 0.44573381543159485, "learning_rate": 1.580747945664934e-05, "loss": 0.0112, "step": 12550 }, { "epoch": 2.111260053619303, "grad_norm": 0.4188709557056427, "learning_rate": 1.5790709374475935e-05, "loss": 0.0115, "step": 12600 }, { "epoch": 2.119638069705094, "grad_norm": 0.3570314645767212, "learning_rate": 1.5773939292302534e-05, "loss": 0.0112, "step": 12650 }, { "epoch": 2.1280160857908847, "grad_norm": 0.3598877191543579, "learning_rate": 1.5757169210129132e-05, "loss": 0.0118, "step": 12700 }, { "epoch": 2.1363941018766757, "grad_norm": 0.3769216239452362, "learning_rate": 1.5740399127955728e-05, "loss": 0.0107, "step": 12750 }, { "epoch": 2.1447721179624666, "grad_norm": 0.2821277678012848, "learning_rate": 1.5723629045782326e-05, "loss": 0.0118, "step": 12800 }, { "epoch": 2.1531501340482575, "grad_norm": 0.26597416400909424, "learning_rate": 1.570685896360892e-05, "loss": 0.0113, "step": 12850 }, { "epoch": 2.1615281501340484, "grad_norm": 0.26788029074668884, "learning_rate": 1.569008888143552e-05, "loss": 0.0119, "step": 12900 }, { "epoch": 2.1699061662198393, "grad_norm": 0.4225537180900574, "learning_rate": 1.567331879926212e-05, "loss": 0.0111, "step": 12950 }, { "epoch": 2.17828418230563, "grad_norm": 0.2967151403427124, "learning_rate": 1.5656548717088714e-05, "loss": 0.0116, "step": 13000 }, { "epoch": 2.1866621983914207, "grad_norm": 0.37873271107673645, "learning_rate": 1.5639778634915313e-05, "loss": 0.0122, "step": 13050 }, { "epoch": 2.1950402144772116, "grad_norm": 0.3496306836605072, "learning_rate": 1.562300855274191e-05, "loss": 0.0115, "step": 13100 }, { "epoch": 2.2034182305630026, "grad_norm": 0.2340189516544342, "learning_rate": 1.5606238470568507e-05, "loss": 0.0114, "step": 13150 }, { "epoch": 2.2117962466487935, "grad_norm": 0.34111320972442627, "learning_rate": 1.5589468388395102e-05, "loss": 0.0106, "step": 13200 }, { "epoch": 2.2201742627345844, "grad_norm": 0.4557114839553833, "learning_rate": 1.55726983062217e-05, "loss": 0.0124, "step": 13250 }, { "epoch": 2.2285522788203753, "grad_norm": 0.3776351511478424, "learning_rate": 1.55559282240483e-05, "loss": 0.0109, "step": 13300 }, { "epoch": 2.2369302949061662, "grad_norm": 0.49314960837364197, "learning_rate": 1.55391581418749e-05, "loss": 0.0127, "step": 13350 }, { "epoch": 2.245308310991957, "grad_norm": 0.2994402348995209, "learning_rate": 1.5522388059701494e-05, "loss": 0.0123, "step": 13400 }, { "epoch": 2.253686327077748, "grad_norm": 0.6113381385803223, "learning_rate": 1.5505617977528093e-05, "loss": 0.0122, "step": 13450 }, { "epoch": 2.262064343163539, "grad_norm": 0.43357163667678833, "learning_rate": 1.5488847895354688e-05, "loss": 0.0109, "step": 13500 }, { "epoch": 2.27044235924933, "grad_norm": 0.5021244287490845, "learning_rate": 1.5472077813181287e-05, "loss": 0.013, "step": 13550 }, { "epoch": 2.278820375335121, "grad_norm": 0.4794227182865143, "learning_rate": 1.5455307731007882e-05, "loss": 0.0125, "step": 13600 }, { "epoch": 2.2871983914209117, "grad_norm": 0.2409118264913559, "learning_rate": 1.543853764883448e-05, "loss": 0.012, "step": 13650 }, { "epoch": 2.295576407506702, "grad_norm": 0.36879080533981323, "learning_rate": 1.542176756666108e-05, "loss": 0.0101, "step": 13700 }, { "epoch": 2.303954423592493, "grad_norm": 0.2825350761413574, "learning_rate": 1.5404997484487675e-05, "loss": 0.0113, "step": 13750 }, { "epoch": 2.312332439678284, "grad_norm": 0.5339875221252441, "learning_rate": 1.5388227402314273e-05, "loss": 0.0115, "step": 13800 }, { "epoch": 2.320710455764075, "grad_norm": 0.5463636517524719, "learning_rate": 1.537145732014087e-05, "loss": 0.0127, "step": 13850 }, { "epoch": 2.329088471849866, "grad_norm": 0.37746766209602356, "learning_rate": 1.5354687237967467e-05, "loss": 0.0116, "step": 13900 }, { "epoch": 2.337466487935657, "grad_norm": 0.6131693124771118, "learning_rate": 1.5337917155794063e-05, "loss": 0.0115, "step": 13950 }, { "epoch": 2.3458445040214477, "grad_norm": 0.3330284059047699, "learning_rate": 1.532114707362066e-05, "loss": 0.0111, "step": 14000 }, { "epoch": 2.3542225201072386, "grad_norm": 0.47551050782203674, "learning_rate": 1.530437699144726e-05, "loss": 0.0117, "step": 14050 }, { "epoch": 2.3626005361930296, "grad_norm": 0.5559821128845215, "learning_rate": 1.528760690927386e-05, "loss": 0.0134, "step": 14100 }, { "epoch": 2.3709785522788205, "grad_norm": 0.32303518056869507, "learning_rate": 1.5270836827100454e-05, "loss": 0.012, "step": 14150 }, { "epoch": 2.3793565683646114, "grad_norm": 0.4595315754413605, "learning_rate": 1.5254066744927051e-05, "loss": 0.012, "step": 14200 }, { "epoch": 2.3877345844504023, "grad_norm": 0.5437060594558716, "learning_rate": 1.5237296662753648e-05, "loss": 0.0124, "step": 14250 }, { "epoch": 2.396112600536193, "grad_norm": 0.3886863589286804, "learning_rate": 1.5220526580580245e-05, "loss": 0.0129, "step": 14300 }, { "epoch": 2.4044906166219837, "grad_norm": 0.5083261132240295, "learning_rate": 1.5203756498406844e-05, "loss": 0.0134, "step": 14350 }, { "epoch": 2.4128686327077746, "grad_norm": 0.35092031955718994, "learning_rate": 1.5186986416233441e-05, "loss": 0.0116, "step": 14400 }, { "epoch": 2.4212466487935655, "grad_norm": 0.4511415660381317, "learning_rate": 1.5170216334060038e-05, "loss": 0.013, "step": 14450 }, { "epoch": 2.4296246648793565, "grad_norm": 0.5314837694168091, "learning_rate": 1.5153446251886637e-05, "loss": 0.0128, "step": 14500 }, { "epoch": 2.4380026809651474, "grad_norm": 0.3129260540008545, "learning_rate": 1.5136676169713234e-05, "loss": 0.0121, "step": 14550 }, { "epoch": 2.4463806970509383, "grad_norm": 0.3153856694698334, "learning_rate": 1.511990608753983e-05, "loss": 0.0138, "step": 14600 }, { "epoch": 2.454758713136729, "grad_norm": 0.8036394715309143, "learning_rate": 1.5103136005366428e-05, "loss": 0.0121, "step": 14650 }, { "epoch": 2.46313672922252, "grad_norm": 0.50925213098526, "learning_rate": 1.5086365923193025e-05, "loss": 0.0124, "step": 14700 }, { "epoch": 2.471514745308311, "grad_norm": 0.5606102347373962, "learning_rate": 1.5069595841019622e-05, "loss": 0.0124, "step": 14750 }, { "epoch": 2.479892761394102, "grad_norm": 0.5037418603897095, "learning_rate": 1.5052825758846219e-05, "loss": 0.0136, "step": 14800 }, { "epoch": 2.488270777479893, "grad_norm": 0.3871222138404846, "learning_rate": 1.5036055676672816e-05, "loss": 0.0129, "step": 14850 }, { "epoch": 2.496648793565684, "grad_norm": 0.5839509963989258, "learning_rate": 1.5019285594499416e-05, "loss": 0.0123, "step": 14900 }, { "epoch": 2.5050268096514747, "grad_norm": 0.7268586754798889, "learning_rate": 1.5002515512326013e-05, "loss": 0.012, "step": 14950 }, { "epoch": 2.5134048257372656, "grad_norm": 0.3473876118659973, "learning_rate": 1.498574543015261e-05, "loss": 0.0126, "step": 15000 }, { "epoch": 2.5217828418230566, "grad_norm": 0.49601665139198303, "learning_rate": 1.4968975347979207e-05, "loss": 0.0121, "step": 15050 }, { "epoch": 2.530160857908847, "grad_norm": 0.23973305523395538, "learning_rate": 1.4952205265805804e-05, "loss": 0.0111, "step": 15100 }, { "epoch": 2.538538873994638, "grad_norm": 0.5663930177688599, "learning_rate": 1.4935435183632401e-05, "loss": 0.0106, "step": 15150 }, { "epoch": 2.546916890080429, "grad_norm": 0.24828468263149261, "learning_rate": 1.4918665101458998e-05, "loss": 0.0132, "step": 15200 }, { "epoch": 2.55529490616622, "grad_norm": 0.17071287333965302, "learning_rate": 1.4901895019285595e-05, "loss": 0.0132, "step": 15250 }, { "epoch": 2.5636729222520107, "grad_norm": 0.5064595937728882, "learning_rate": 1.4885124937112192e-05, "loss": 0.013, "step": 15300 }, { "epoch": 2.5720509383378016, "grad_norm": 0.48466721177101135, "learning_rate": 1.4868354854938791e-05, "loss": 0.0112, "step": 15350 }, { "epoch": 2.5804289544235925, "grad_norm": 0.6913251876831055, "learning_rate": 1.4851584772765388e-05, "loss": 0.0129, "step": 15400 }, { "epoch": 2.5888069705093835, "grad_norm": 0.4608655869960785, "learning_rate": 1.4834814690591985e-05, "loss": 0.0125, "step": 15450 }, { "epoch": 2.5971849865951744, "grad_norm": 0.5575762391090393, "learning_rate": 1.4818044608418582e-05, "loss": 0.0122, "step": 15500 }, { "epoch": 2.6055630026809653, "grad_norm": 0.3975880444049835, "learning_rate": 1.4801274526245179e-05, "loss": 0.0115, "step": 15550 }, { "epoch": 2.6139410187667558, "grad_norm": 0.4161764681339264, "learning_rate": 1.4784504444071776e-05, "loss": 0.0143, "step": 15600 }, { "epoch": 2.6223190348525467, "grad_norm": 0.5338849425315857, "learning_rate": 1.4767734361898373e-05, "loss": 0.0127, "step": 15650 }, { "epoch": 2.6306970509383376, "grad_norm": 0.3128230571746826, "learning_rate": 1.475096427972497e-05, "loss": 0.0135, "step": 15700 }, { "epoch": 2.6390750670241285, "grad_norm": 0.5028887987136841, "learning_rate": 1.473419419755157e-05, "loss": 0.0117, "step": 15750 }, { "epoch": 2.6474530831099194, "grad_norm": 0.3744266927242279, "learning_rate": 1.4717424115378167e-05, "loss": 0.0115, "step": 15800 }, { "epoch": 2.6558310991957104, "grad_norm": 0.4230741560459137, "learning_rate": 1.4700654033204764e-05, "loss": 0.013, "step": 15850 }, { "epoch": 2.6642091152815013, "grad_norm": 0.49401816725730896, "learning_rate": 1.4683883951031361e-05, "loss": 0.0115, "step": 15900 }, { "epoch": 2.672587131367292, "grad_norm": 0.4584721028804779, "learning_rate": 1.4667113868857958e-05, "loss": 0.0133, "step": 15950 }, { "epoch": 2.680965147453083, "grad_norm": 0.760981559753418, "learning_rate": 1.4650343786684555e-05, "loss": 0.0127, "step": 16000 }, { "epoch": 2.689343163538874, "grad_norm": 0.37186485528945923, "learning_rate": 1.4633573704511152e-05, "loss": 0.0134, "step": 16050 }, { "epoch": 2.697721179624665, "grad_norm": 0.62066251039505, "learning_rate": 1.461680362233775e-05, "loss": 0.0141, "step": 16100 }, { "epoch": 2.706099195710456, "grad_norm": 0.3157498240470886, "learning_rate": 1.4600033540164348e-05, "loss": 0.0118, "step": 16150 }, { "epoch": 2.714477211796247, "grad_norm": 0.4527428448200226, "learning_rate": 1.4583263457990945e-05, "loss": 0.0134, "step": 16200 }, { "epoch": 2.7228552278820377, "grad_norm": 0.31555086374282837, "learning_rate": 1.4566493375817544e-05, "loss": 0.0132, "step": 16250 }, { "epoch": 2.7312332439678286, "grad_norm": 0.44448813796043396, "learning_rate": 1.4549723293644141e-05, "loss": 0.0124, "step": 16300 }, { "epoch": 2.7396112600536195, "grad_norm": 0.4281978905200958, "learning_rate": 1.4532953211470738e-05, "loss": 0.0122, "step": 16350 }, { "epoch": 2.7479892761394105, "grad_norm": 0.45892074704170227, "learning_rate": 1.4516183129297335e-05, "loss": 0.012, "step": 16400 }, { "epoch": 2.756367292225201, "grad_norm": 0.30029842257499695, "learning_rate": 1.4499413047123932e-05, "loss": 0.0119, "step": 16450 }, { "epoch": 2.764745308310992, "grad_norm": 0.3950155973434448, "learning_rate": 1.4482642964950529e-05, "loss": 0.0136, "step": 16500 }, { "epoch": 2.7731233243967828, "grad_norm": 0.4550629258155823, "learning_rate": 1.4465872882777128e-05, "loss": 0.0122, "step": 16550 }, { "epoch": 2.7815013404825737, "grad_norm": 0.5514039397239685, "learning_rate": 1.4449102800603725e-05, "loss": 0.0135, "step": 16600 }, { "epoch": 2.7898793565683646, "grad_norm": 0.5131493806838989, "learning_rate": 1.4432332718430322e-05, "loss": 0.0132, "step": 16650 }, { "epoch": 2.7982573726541555, "grad_norm": 0.39987483620643616, "learning_rate": 1.4415562636256919e-05, "loss": 0.0126, "step": 16700 }, { "epoch": 2.8066353887399464, "grad_norm": 0.5557750463485718, "learning_rate": 1.4398792554083516e-05, "loss": 0.0121, "step": 16750 }, { "epoch": 2.8150134048257374, "grad_norm": 0.34864020347595215, "learning_rate": 1.4382022471910113e-05, "loss": 0.0129, "step": 16800 }, { "epoch": 2.8233914209115283, "grad_norm": 0.4396969974040985, "learning_rate": 1.436525238973671e-05, "loss": 0.0137, "step": 16850 }, { "epoch": 2.831769436997319, "grad_norm": 0.4104606509208679, "learning_rate": 1.4348482307563307e-05, "loss": 0.0146, "step": 16900 }, { "epoch": 2.8401474530831097, "grad_norm": 0.6937008500099182, "learning_rate": 1.4331712225389907e-05, "loss": 0.0123, "step": 16950 }, { "epoch": 2.8485254691689006, "grad_norm": 0.5880556106567383, "learning_rate": 1.4314942143216504e-05, "loss": 0.0131, "step": 17000 }, { "epoch": 2.8569034852546915, "grad_norm": 0.4264618158340454, "learning_rate": 1.4298172061043101e-05, "loss": 0.0133, "step": 17050 }, { "epoch": 2.8652815013404824, "grad_norm": 0.5207853317260742, "learning_rate": 1.4281401978869698e-05, "loss": 0.0137, "step": 17100 }, { "epoch": 2.8736595174262733, "grad_norm": 0.4656062424182892, "learning_rate": 1.4264631896696295e-05, "loss": 0.0145, "step": 17150 }, { "epoch": 2.8820375335120643, "grad_norm": 0.38702937960624695, "learning_rate": 1.4247861814522892e-05, "loss": 0.0122, "step": 17200 }, { "epoch": 2.890415549597855, "grad_norm": 0.246555358171463, "learning_rate": 1.423109173234949e-05, "loss": 0.0135, "step": 17250 }, { "epoch": 2.898793565683646, "grad_norm": 0.2863421142101288, "learning_rate": 1.4214321650176086e-05, "loss": 0.0132, "step": 17300 }, { "epoch": 2.907171581769437, "grad_norm": 0.31063777208328247, "learning_rate": 1.4197551568002685e-05, "loss": 0.0122, "step": 17350 }, { "epoch": 2.915549597855228, "grad_norm": 0.5885173082351685, "learning_rate": 1.4180781485829282e-05, "loss": 0.0117, "step": 17400 }, { "epoch": 2.923927613941019, "grad_norm": 0.41046226024627686, "learning_rate": 1.4164011403655879e-05, "loss": 0.0136, "step": 17450 }, { "epoch": 2.9323056300268098, "grad_norm": 0.45641854405403137, "learning_rate": 1.4147241321482476e-05, "loss": 0.0136, "step": 17500 }, { "epoch": 2.9406836461126007, "grad_norm": 0.3291575610637665, "learning_rate": 1.4130471239309073e-05, "loss": 0.0128, "step": 17550 }, { "epoch": 2.9490616621983916, "grad_norm": 0.4031969904899597, "learning_rate": 1.411370115713567e-05, "loss": 0.0144, "step": 17600 }, { "epoch": 2.9574396782841825, "grad_norm": 0.4048541486263275, "learning_rate": 1.4096931074962267e-05, "loss": 0.0131, "step": 17650 }, { "epoch": 2.9658176943699734, "grad_norm": 0.42356961965560913, "learning_rate": 1.4080160992788866e-05, "loss": 0.0136, "step": 17700 }, { "epoch": 2.974195710455764, "grad_norm": 0.499991774559021, "learning_rate": 1.4063390910615464e-05, "loss": 0.0129, "step": 17750 }, { "epoch": 2.982573726541555, "grad_norm": 0.4582955837249756, "learning_rate": 1.4046620828442061e-05, "loss": 0.0128, "step": 17800 }, { "epoch": 2.9909517426273458, "grad_norm": 0.40763500332832336, "learning_rate": 1.4029850746268658e-05, "loss": 0.0125, "step": 17850 }, { "epoch": 2.9993297587131367, "grad_norm": 0.2882692515850067, "learning_rate": 1.4013080664095256e-05, "loss": 0.0131, "step": 17900 }, { "epoch": 3.0077077747989276, "grad_norm": 0.25621238350868225, "learning_rate": 1.3996310581921853e-05, "loss": 0.007, "step": 17950 }, { "epoch": 3.0160857908847185, "grad_norm": 0.2496500015258789, "learning_rate": 1.397954049974845e-05, "loss": 0.0058, "step": 18000 }, { "epoch": 3.0244638069705094, "grad_norm": 0.5392020344734192, "learning_rate": 1.3962770417575047e-05, "loss": 0.0064, "step": 18050 }, { "epoch": 3.0328418230563003, "grad_norm": 0.142150416970253, "learning_rate": 1.3946000335401644e-05, "loss": 0.0049, "step": 18100 }, { "epoch": 3.0412198391420913, "grad_norm": 0.2814841866493225, "learning_rate": 1.392923025322824e-05, "loss": 0.0048, "step": 18150 }, { "epoch": 3.049597855227882, "grad_norm": 0.9062692523002625, "learning_rate": 1.3912460171054841e-05, "loss": 0.0054, "step": 18200 }, { "epoch": 3.057975871313673, "grad_norm": 0.17520900070667267, "learning_rate": 1.3895690088881438e-05, "loss": 0.0047, "step": 18250 }, { "epoch": 3.066353887399464, "grad_norm": 0.2684191167354584, "learning_rate": 1.3878920006708035e-05, "loss": 0.0055, "step": 18300 }, { "epoch": 3.0747319034852545, "grad_norm": 0.2762264013290405, "learning_rate": 1.3862149924534632e-05, "loss": 0.006, "step": 18350 }, { "epoch": 3.0831099195710454, "grad_norm": 0.16580019891262054, "learning_rate": 1.3845379842361229e-05, "loss": 0.0053, "step": 18400 }, { "epoch": 3.0914879356568363, "grad_norm": 0.30021271109580994, "learning_rate": 1.3828609760187826e-05, "loss": 0.0052, "step": 18450 }, { "epoch": 3.0998659517426272, "grad_norm": 0.3511424958705902, "learning_rate": 1.3811839678014423e-05, "loss": 0.0053, "step": 18500 }, { "epoch": 3.108243967828418, "grad_norm": 0.2431810349225998, "learning_rate": 1.379506959584102e-05, "loss": 0.0054, "step": 18550 }, { "epoch": 3.116621983914209, "grad_norm": 0.2419600486755371, "learning_rate": 1.3778299513667619e-05, "loss": 0.0053, "step": 18600 }, { "epoch": 3.125, "grad_norm": 0.3268046975135803, "learning_rate": 1.3761529431494216e-05, "loss": 0.0055, "step": 18650 }, { "epoch": 3.133378016085791, "grad_norm": 0.38957932591438293, "learning_rate": 1.3744759349320813e-05, "loss": 0.0051, "step": 18700 }, { "epoch": 3.141756032171582, "grad_norm": 0.31418824195861816, "learning_rate": 1.372798926714741e-05, "loss": 0.0052, "step": 18750 }, { "epoch": 3.1501340482573728, "grad_norm": 0.3322865068912506, "learning_rate": 1.3711219184974007e-05, "loss": 0.0051, "step": 18800 }, { "epoch": 3.1585120643431637, "grad_norm": 0.22010941803455353, "learning_rate": 1.3694449102800604e-05, "loss": 0.0053, "step": 18850 }, { "epoch": 3.1668900804289546, "grad_norm": 0.23425912857055664, "learning_rate": 1.3677679020627201e-05, "loss": 0.0055, "step": 18900 }, { "epoch": 3.1752680965147455, "grad_norm": 0.30269861221313477, "learning_rate": 1.3660908938453798e-05, "loss": 0.0051, "step": 18950 }, { "epoch": 3.1836461126005364, "grad_norm": 0.28305545449256897, "learning_rate": 1.3644138856280398e-05, "loss": 0.0057, "step": 19000 }, { "epoch": 3.192024128686327, "grad_norm": 0.313149631023407, "learning_rate": 1.3627368774106995e-05, "loss": 0.0048, "step": 19050 }, { "epoch": 3.200402144772118, "grad_norm": 0.30681276321411133, "learning_rate": 1.3610598691933592e-05, "loss": 0.0056, "step": 19100 }, { "epoch": 3.2087801608579087, "grad_norm": 0.17815206944942474, "learning_rate": 1.359382860976019e-05, "loss": 0.0055, "step": 19150 }, { "epoch": 3.2171581769436997, "grad_norm": 0.29173994064331055, "learning_rate": 1.3577058527586786e-05, "loss": 0.0057, "step": 19200 }, { "epoch": 3.2255361930294906, "grad_norm": 0.3214263916015625, "learning_rate": 1.3560288445413383e-05, "loss": 0.0063, "step": 19250 }, { "epoch": 3.2339142091152815, "grad_norm": 0.2251535803079605, "learning_rate": 1.354351836323998e-05, "loss": 0.006, "step": 19300 }, { "epoch": 3.2422922252010724, "grad_norm": 0.4358842372894287, "learning_rate": 1.3526748281066577e-05, "loss": 0.0047, "step": 19350 }, { "epoch": 3.2506702412868633, "grad_norm": 0.23471078276634216, "learning_rate": 1.3509978198893176e-05, "loss": 0.0058, "step": 19400 }, { "epoch": 3.2590482573726542, "grad_norm": 0.28291311860084534, "learning_rate": 1.3493208116719773e-05, "loss": 0.005, "step": 19450 }, { "epoch": 3.267426273458445, "grad_norm": 0.23490838706493378, "learning_rate": 1.347643803454637e-05, "loss": 0.0055, "step": 19500 }, { "epoch": 3.275804289544236, "grad_norm": 0.3439931571483612, "learning_rate": 1.3459667952372967e-05, "loss": 0.0053, "step": 19550 }, { "epoch": 3.284182305630027, "grad_norm": 0.19748039543628693, "learning_rate": 1.3442897870199564e-05, "loss": 0.0054, "step": 19600 }, { "epoch": 3.2925603217158175, "grad_norm": 0.3718995749950409, "learning_rate": 1.3426127788026163e-05, "loss": 0.0051, "step": 19650 }, { "epoch": 3.3009383378016084, "grad_norm": 0.49980103969573975, "learning_rate": 1.340935770585276e-05, "loss": 0.0056, "step": 19700 }, { "epoch": 3.3093163538873993, "grad_norm": 0.5253378748893738, "learning_rate": 1.3392587623679357e-05, "loss": 0.0052, "step": 19750 }, { "epoch": 3.3176943699731902, "grad_norm": 0.14330442249774933, "learning_rate": 1.3375817541505956e-05, "loss": 0.0058, "step": 19800 }, { "epoch": 3.326072386058981, "grad_norm": 0.2218172401189804, "learning_rate": 1.3359047459332553e-05, "loss": 0.006, "step": 19850 }, { "epoch": 3.334450402144772, "grad_norm": 0.4102313816547394, "learning_rate": 1.334227737715915e-05, "loss": 0.0062, "step": 19900 }, { "epoch": 3.342828418230563, "grad_norm": 0.37395352125167847, "learning_rate": 1.3325507294985747e-05, "loss": 0.0061, "step": 19950 }, { "epoch": 3.351206434316354, "grad_norm": 0.2626063823699951, "learning_rate": 1.3308737212812344e-05, "loss": 0.0053, "step": 20000 }, { "epoch": 3.351206434316354, "eval_loss": 0.03919154778122902, "eval_runtime": 0.3143, "eval_samples_per_second": 63.636, "eval_steps_per_second": 3.182, "step": 20000 }, { "epoch": 3.359584450402145, "grad_norm": 0.1779392808675766, "learning_rate": 1.329196713063894e-05, "loss": 0.0059, "step": 20050 }, { "epoch": 3.3679624664879357, "grad_norm": 0.37775570154190063, "learning_rate": 1.3275197048465538e-05, "loss": 0.0064, "step": 20100 }, { "epoch": 3.3763404825737267, "grad_norm": 0.26373809576034546, "learning_rate": 1.3258426966292135e-05, "loss": 0.0064, "step": 20150 }, { "epoch": 3.3847184986595176, "grad_norm": 0.330445259809494, "learning_rate": 1.3241656884118735e-05, "loss": 0.0063, "step": 20200 }, { "epoch": 3.3930965147453085, "grad_norm": 0.294837087392807, "learning_rate": 1.3224886801945332e-05, "loss": 0.0054, "step": 20250 }, { "epoch": 3.4014745308310994, "grad_norm": 0.16401290893554688, "learning_rate": 1.3208116719771929e-05, "loss": 0.0057, "step": 20300 }, { "epoch": 3.4098525469168903, "grad_norm": 0.5002830624580383, "learning_rate": 1.3191346637598526e-05, "loss": 0.0058, "step": 20350 }, { "epoch": 3.418230563002681, "grad_norm": 0.321429580450058, "learning_rate": 1.3174576555425123e-05, "loss": 0.0067, "step": 20400 }, { "epoch": 3.4266085790884717, "grad_norm": 0.28548842668533325, "learning_rate": 1.315780647325172e-05, "loss": 0.0055, "step": 20450 }, { "epoch": 3.4349865951742626, "grad_norm": 0.20685793459415436, "learning_rate": 1.3141036391078317e-05, "loss": 0.0057, "step": 20500 }, { "epoch": 3.4433646112600536, "grad_norm": 0.30995652079582214, "learning_rate": 1.3124266308904914e-05, "loss": 0.0055, "step": 20550 }, { "epoch": 3.4517426273458445, "grad_norm": 0.41536813974380493, "learning_rate": 1.3107496226731513e-05, "loss": 0.0062, "step": 20600 }, { "epoch": 3.4601206434316354, "grad_norm": 0.29047590494155884, "learning_rate": 1.309072614455811e-05, "loss": 0.0063, "step": 20650 }, { "epoch": 3.4684986595174263, "grad_norm": 0.23248636722564697, "learning_rate": 1.3073956062384707e-05, "loss": 0.006, "step": 20700 }, { "epoch": 3.4768766756032172, "grad_norm": 0.2018858790397644, "learning_rate": 1.3057185980211304e-05, "loss": 0.0055, "step": 20750 }, { "epoch": 3.485254691689008, "grad_norm": 0.3976786732673645, "learning_rate": 1.3040415898037901e-05, "loss": 0.0053, "step": 20800 }, { "epoch": 3.493632707774799, "grad_norm": 0.34822383522987366, "learning_rate": 1.3023645815864498e-05, "loss": 0.0065, "step": 20850 }, { "epoch": 3.5020107238605895, "grad_norm": 0.23607690632343292, "learning_rate": 1.3006875733691095e-05, "loss": 0.0059, "step": 20900 }, { "epoch": 3.5103887399463805, "grad_norm": 0.40768417716026306, "learning_rate": 1.2990105651517692e-05, "loss": 0.0057, "step": 20950 }, { "epoch": 3.5187667560321714, "grad_norm": 0.42595741152763367, "learning_rate": 1.2973335569344292e-05, "loss": 0.0066, "step": 21000 }, { "epoch": 3.5271447721179623, "grad_norm": 0.4516412615776062, "learning_rate": 1.295656548717089e-05, "loss": 0.0062, "step": 21050 }, { "epoch": 3.535522788203753, "grad_norm": 0.42684000730514526, "learning_rate": 1.2939795404997486e-05, "loss": 0.0059, "step": 21100 }, { "epoch": 3.543900804289544, "grad_norm": 0.5775489211082458, "learning_rate": 1.2923025322824083e-05, "loss": 0.0054, "step": 21150 }, { "epoch": 3.552278820375335, "grad_norm": 0.7901192307472229, "learning_rate": 1.290625524065068e-05, "loss": 0.0052, "step": 21200 }, { "epoch": 3.560656836461126, "grad_norm": 0.2339819371700287, "learning_rate": 1.2889485158477277e-05, "loss": 0.006, "step": 21250 }, { "epoch": 3.569034852546917, "grad_norm": 0.34695181250572205, "learning_rate": 1.2872715076303874e-05, "loss": 0.005, "step": 21300 }, { "epoch": 3.577412868632708, "grad_norm": 0.3339728116989136, "learning_rate": 1.2855944994130471e-05, "loss": 0.0056, "step": 21350 }, { "epoch": 3.5857908847184987, "grad_norm": 0.22279733419418335, "learning_rate": 1.2839174911957068e-05, "loss": 0.0062, "step": 21400 }, { "epoch": 3.5941689008042896, "grad_norm": 0.2896275222301483, "learning_rate": 1.2822404829783667e-05, "loss": 0.0062, "step": 21450 }, { "epoch": 3.6025469168900806, "grad_norm": 0.4125616252422333, "learning_rate": 1.2805634747610264e-05, "loss": 0.0064, "step": 21500 }, { "epoch": 3.6109249329758715, "grad_norm": 0.3267725110054016, "learning_rate": 1.2788864665436861e-05, "loss": 0.0065, "step": 21550 }, { "epoch": 3.6193029490616624, "grad_norm": 0.4519464373588562, "learning_rate": 1.277209458326346e-05, "loss": 0.0062, "step": 21600 }, { "epoch": 3.6276809651474533, "grad_norm": 0.4503564238548279, "learning_rate": 1.2755324501090057e-05, "loss": 0.0068, "step": 21650 }, { "epoch": 3.6360589812332442, "grad_norm": 0.14587004482746124, "learning_rate": 1.2738554418916654e-05, "loss": 0.0056, "step": 21700 }, { "epoch": 3.6444369973190347, "grad_norm": 0.3932003378868103, "learning_rate": 1.2721784336743251e-05, "loss": 0.0055, "step": 21750 }, { "epoch": 3.6528150134048256, "grad_norm": 0.41266146302223206, "learning_rate": 1.2705014254569848e-05, "loss": 0.0058, "step": 21800 }, { "epoch": 3.6611930294906165, "grad_norm": 0.40585076808929443, "learning_rate": 1.2688244172396447e-05, "loss": 0.0057, "step": 21850 }, { "epoch": 3.6695710455764075, "grad_norm": 0.4181327819824219, "learning_rate": 1.2671474090223044e-05, "loss": 0.0069, "step": 21900 }, { "epoch": 3.6779490616621984, "grad_norm": 0.20495828986167908, "learning_rate": 1.265470400804964e-05, "loss": 0.0066, "step": 21950 }, { "epoch": 3.6863270777479893, "grad_norm": 0.22110895812511444, "learning_rate": 1.2637933925876238e-05, "loss": 0.006, "step": 22000 }, { "epoch": 3.69470509383378, "grad_norm": 0.43901216983795166, "learning_rate": 1.2621163843702835e-05, "loss": 0.0064, "step": 22050 }, { "epoch": 3.703083109919571, "grad_norm": 0.34933629631996155, "learning_rate": 1.2604393761529432e-05, "loss": 0.0062, "step": 22100 }, { "epoch": 3.711461126005362, "grad_norm": 0.3028928339481354, "learning_rate": 1.2587623679356029e-05, "loss": 0.0065, "step": 22150 }, { "epoch": 3.719839142091153, "grad_norm": 0.25583240389823914, "learning_rate": 1.2570853597182626e-05, "loss": 0.0067, "step": 22200 }, { "epoch": 3.7282171581769434, "grad_norm": 0.15429948270320892, "learning_rate": 1.2554083515009226e-05, "loss": 0.0067, "step": 22250 }, { "epoch": 3.7365951742627344, "grad_norm": 0.35330894589424133, "learning_rate": 1.2537313432835823e-05, "loss": 0.0073, "step": 22300 }, { "epoch": 3.7449731903485253, "grad_norm": 0.5058137774467468, "learning_rate": 1.252054335066242e-05, "loss": 0.0061, "step": 22350 }, { "epoch": 3.753351206434316, "grad_norm": 0.27442070841789246, "learning_rate": 1.2503773268489017e-05, "loss": 0.0072, "step": 22400 }, { "epoch": 3.761729222520107, "grad_norm": 0.34210237860679626, "learning_rate": 1.2487003186315614e-05, "loss": 0.0059, "step": 22450 }, { "epoch": 3.770107238605898, "grad_norm": 0.36563077569007874, "learning_rate": 1.2470233104142211e-05, "loss": 0.0064, "step": 22500 }, { "epoch": 3.778485254691689, "grad_norm": 0.3370627760887146, "learning_rate": 1.2453463021968808e-05, "loss": 0.0054, "step": 22550 }, { "epoch": 3.78686327077748, "grad_norm": 0.17612957954406738, "learning_rate": 1.2436692939795405e-05, "loss": 0.0071, "step": 22600 }, { "epoch": 3.795241286863271, "grad_norm": 0.33844587206840515, "learning_rate": 1.2419922857622004e-05, "loss": 0.0059, "step": 22650 }, { "epoch": 3.8036193029490617, "grad_norm": 0.22707916796207428, "learning_rate": 1.2403152775448601e-05, "loss": 0.0062, "step": 22700 }, { "epoch": 3.8119973190348526, "grad_norm": 0.2754456102848053, "learning_rate": 1.2386382693275198e-05, "loss": 0.0067, "step": 22750 }, { "epoch": 3.8203753351206435, "grad_norm": 0.411072313785553, "learning_rate": 1.2369612611101795e-05, "loss": 0.0066, "step": 22800 }, { "epoch": 3.8287533512064345, "grad_norm": 0.3319416344165802, "learning_rate": 1.2352842528928392e-05, "loss": 0.0061, "step": 22850 }, { "epoch": 3.8371313672922254, "grad_norm": 0.20878171920776367, "learning_rate": 1.2336072446754989e-05, "loss": 0.0059, "step": 22900 }, { "epoch": 3.8455093833780163, "grad_norm": 0.1912664771080017, "learning_rate": 1.2319302364581586e-05, "loss": 0.0059, "step": 22950 }, { "epoch": 3.853887399463807, "grad_norm": 0.3744626045227051, "learning_rate": 1.2302532282408185e-05, "loss": 0.006, "step": 23000 }, { "epoch": 3.862265415549598, "grad_norm": 0.37646523118019104, "learning_rate": 1.2285762200234783e-05, "loss": 0.0056, "step": 23050 }, { "epoch": 3.8706434316353886, "grad_norm": 0.17005406320095062, "learning_rate": 1.226899211806138e-05, "loss": 0.0062, "step": 23100 }, { "epoch": 3.8790214477211795, "grad_norm": 0.35684868693351746, "learning_rate": 1.2252222035887977e-05, "loss": 0.0069, "step": 23150 }, { "epoch": 3.8873994638069704, "grad_norm": 0.37645256519317627, "learning_rate": 1.2235451953714574e-05, "loss": 0.0071, "step": 23200 }, { "epoch": 3.8957774798927614, "grad_norm": 0.5175814032554626, "learning_rate": 1.2218681871541171e-05, "loss": 0.0059, "step": 23250 }, { "epoch": 3.9041554959785523, "grad_norm": 0.3317829966545105, "learning_rate": 1.2201911789367768e-05, "loss": 0.0064, "step": 23300 }, { "epoch": 3.912533512064343, "grad_norm": 0.2565181255340576, "learning_rate": 1.2185141707194365e-05, "loss": 0.0069, "step": 23350 }, { "epoch": 3.920911528150134, "grad_norm": 0.38244709372520447, "learning_rate": 1.2168371625020963e-05, "loss": 0.0068, "step": 23400 }, { "epoch": 3.929289544235925, "grad_norm": 0.4136451184749603, "learning_rate": 1.2151601542847561e-05, "loss": 0.0067, "step": 23450 }, { "epoch": 3.937667560321716, "grad_norm": 0.2662147581577301, "learning_rate": 1.213483146067416e-05, "loss": 0.0061, "step": 23500 }, { "epoch": 3.946045576407507, "grad_norm": 0.291955828666687, "learning_rate": 1.2118061378500757e-05, "loss": 0.0058, "step": 23550 }, { "epoch": 3.9544235924932973, "grad_norm": 0.30662792921066284, "learning_rate": 1.2101291296327354e-05, "loss": 0.0071, "step": 23600 }, { "epoch": 3.9628016085790883, "grad_norm": 0.5435032844543457, "learning_rate": 1.2084521214153951e-05, "loss": 0.0071, "step": 23650 }, { "epoch": 3.971179624664879, "grad_norm": 0.2924433648586273, "learning_rate": 1.2067751131980548e-05, "loss": 0.0074, "step": 23700 }, { "epoch": 3.97955764075067, "grad_norm": 0.47101885080337524, "learning_rate": 1.2050981049807145e-05, "loss": 0.0065, "step": 23750 }, { "epoch": 3.987935656836461, "grad_norm": 0.39184531569480896, "learning_rate": 1.2034210967633742e-05, "loss": 0.0065, "step": 23800 }, { "epoch": 3.996313672922252, "grad_norm": 0.27226710319519043, "learning_rate": 1.201744088546034e-05, "loss": 0.0058, "step": 23850 }, { "epoch": 4.004691689008043, "grad_norm": 0.37524715065956116, "learning_rate": 1.2000670803286938e-05, "loss": 0.0045, "step": 23900 }, { "epoch": 4.013069705093834, "grad_norm": 0.0983668640255928, "learning_rate": 1.1983900721113535e-05, "loss": 0.0023, "step": 23950 }, { "epoch": 4.021447721179625, "grad_norm": 0.32168978452682495, "learning_rate": 1.1967130638940132e-05, "loss": 0.0024, "step": 24000 }, { "epoch": 4.029825737265416, "grad_norm": 0.2205764651298523, "learning_rate": 1.1950360556766729e-05, "loss": 0.0022, "step": 24050 }, { "epoch": 4.0382037533512065, "grad_norm": 0.21505975723266602, "learning_rate": 1.1933590474593326e-05, "loss": 0.0023, "step": 24100 }, { "epoch": 4.046581769436997, "grad_norm": 0.0701180100440979, "learning_rate": 1.1916820392419923e-05, "loss": 0.0019, "step": 24150 }, { "epoch": 4.054959785522788, "grad_norm": 0.3256973624229431, "learning_rate": 1.190005031024652e-05, "loss": 0.0026, "step": 24200 }, { "epoch": 4.063337801608579, "grad_norm": 0.308699369430542, "learning_rate": 1.1883280228073117e-05, "loss": 0.0024, "step": 24250 }, { "epoch": 4.07171581769437, "grad_norm": 0.29565149545669556, "learning_rate": 1.1866510145899717e-05, "loss": 0.0026, "step": 24300 }, { "epoch": 4.080093833780161, "grad_norm": 0.23600581288337708, "learning_rate": 1.1849740063726314e-05, "loss": 0.0021, "step": 24350 }, { "epoch": 4.088471849865952, "grad_norm": 0.188632994890213, "learning_rate": 1.1832969981552911e-05, "loss": 0.0025, "step": 24400 }, { "epoch": 4.096849865951743, "grad_norm": 0.35330700874328613, "learning_rate": 1.1816199899379508e-05, "loss": 0.0025, "step": 24450 }, { "epoch": 4.105227882037534, "grad_norm": 0.14944002032279968, "learning_rate": 1.1799429817206105e-05, "loss": 0.0026, "step": 24500 }, { "epoch": 4.113605898123325, "grad_norm": 0.23015423119068146, "learning_rate": 1.1782659735032702e-05, "loss": 0.0023, "step": 24550 }, { "epoch": 4.121983914209116, "grad_norm": 0.43203842639923096, "learning_rate": 1.17658896528593e-05, "loss": 0.0024, "step": 24600 }, { "epoch": 4.130361930294907, "grad_norm": 0.16286316514015198, "learning_rate": 1.1749119570685896e-05, "loss": 0.0024, "step": 24650 }, { "epoch": 4.138739946380697, "grad_norm": 0.44359683990478516, "learning_rate": 1.1732349488512495e-05, "loss": 0.0024, "step": 24700 }, { "epoch": 4.147117962466488, "grad_norm": 0.07397326827049255, "learning_rate": 1.1715579406339092e-05, "loss": 0.0027, "step": 24750 }, { "epoch": 4.1554959785522785, "grad_norm": 0.07963547110557556, "learning_rate": 1.1698809324165689e-05, "loss": 0.0026, "step": 24800 }, { "epoch": 4.163873994638069, "grad_norm": 0.27886438369750977, "learning_rate": 1.1682039241992286e-05, "loss": 0.0026, "step": 24850 }, { "epoch": 4.17225201072386, "grad_norm": 0.11975943297147751, "learning_rate": 1.1665269159818883e-05, "loss": 0.0025, "step": 24900 }, { "epoch": 4.180630026809651, "grad_norm": 0.1815500408411026, "learning_rate": 1.1648499077645482e-05, "loss": 0.0024, "step": 24950 }, { "epoch": 4.189008042895442, "grad_norm": 0.7938502430915833, "learning_rate": 1.1631728995472079e-05, "loss": 0.0024, "step": 25000 }, { "epoch": 4.197386058981233, "grad_norm": 0.32321181893348694, "learning_rate": 1.1614958913298676e-05, "loss": 0.0026, "step": 25050 }, { "epoch": 4.205764075067024, "grad_norm": 0.09507790207862854, "learning_rate": 1.1598188831125274e-05, "loss": 0.0023, "step": 25100 }, { "epoch": 4.214142091152815, "grad_norm": 0.31748858094215393, "learning_rate": 1.1581418748951872e-05, "loss": 0.0025, "step": 25150 }, { "epoch": 4.222520107238606, "grad_norm": 0.39395052194595337, "learning_rate": 1.1564648666778469e-05, "loss": 0.0027, "step": 25200 }, { "epoch": 4.230898123324397, "grad_norm": 0.12810911238193512, "learning_rate": 1.1547878584605066e-05, "loss": 0.0023, "step": 25250 }, { "epoch": 4.239276139410188, "grad_norm": 0.18891964852809906, "learning_rate": 1.1531108502431663e-05, "loss": 0.0024, "step": 25300 }, { "epoch": 4.247654155495979, "grad_norm": 0.16266460716724396, "learning_rate": 1.151433842025826e-05, "loss": 0.0032, "step": 25350 }, { "epoch": 4.2560321715817695, "grad_norm": 0.3381274342536926, "learning_rate": 1.1497568338084857e-05, "loss": 0.0027, "step": 25400 }, { "epoch": 4.26441018766756, "grad_norm": 0.7718698978424072, "learning_rate": 1.1480798255911454e-05, "loss": 0.0026, "step": 25450 }, { "epoch": 4.272788203753351, "grad_norm": 0.7734161615371704, "learning_rate": 1.1464028173738054e-05, "loss": 0.0029, "step": 25500 }, { "epoch": 4.281166219839142, "grad_norm": 0.24889783561229706, "learning_rate": 1.1447258091564651e-05, "loss": 0.0027, "step": 25550 }, { "epoch": 4.289544235924933, "grad_norm": 0.1416139155626297, "learning_rate": 1.1430488009391248e-05, "loss": 0.0023, "step": 25600 }, { "epoch": 4.297922252010724, "grad_norm": 0.2628386318683624, "learning_rate": 1.1413717927217845e-05, "loss": 0.0026, "step": 25650 }, { "epoch": 4.306300268096515, "grad_norm": 0.1891651153564453, "learning_rate": 1.1396947845044442e-05, "loss": 0.0021, "step": 25700 }, { "epoch": 4.314678284182306, "grad_norm": 0.33927446603775024, "learning_rate": 1.1380177762871039e-05, "loss": 0.0023, "step": 25750 }, { "epoch": 4.323056300268097, "grad_norm": 0.2871659994125366, "learning_rate": 1.1363407680697636e-05, "loss": 0.0025, "step": 25800 }, { "epoch": 4.331434316353888, "grad_norm": 0.16000057756900787, "learning_rate": 1.1346637598524233e-05, "loss": 0.0028, "step": 25850 }, { "epoch": 4.339812332439679, "grad_norm": 0.2464749813079834, "learning_rate": 1.1329867516350832e-05, "loss": 0.0027, "step": 25900 }, { "epoch": 4.348190348525469, "grad_norm": 0.22312916815280914, "learning_rate": 1.1313097434177429e-05, "loss": 0.0021, "step": 25950 }, { "epoch": 4.35656836461126, "grad_norm": 0.04916452243924141, "learning_rate": 1.1296327352004026e-05, "loss": 0.0028, "step": 26000 }, { "epoch": 4.3649463806970505, "grad_norm": 0.09145969897508621, "learning_rate": 1.1279557269830623e-05, "loss": 0.0024, "step": 26050 }, { "epoch": 4.3733243967828415, "grad_norm": 0.15269909799098969, "learning_rate": 1.126278718765722e-05, "loss": 0.0027, "step": 26100 }, { "epoch": 4.381702412868632, "grad_norm": 0.14302955567836761, "learning_rate": 1.1246017105483817e-05, "loss": 0.0026, "step": 26150 }, { "epoch": 4.390080428954423, "grad_norm": 0.38677042722702026, "learning_rate": 1.1229247023310414e-05, "loss": 0.0025, "step": 26200 }, { "epoch": 4.398458445040214, "grad_norm": 0.18998374044895172, "learning_rate": 1.1212476941137011e-05, "loss": 0.0026, "step": 26250 }, { "epoch": 4.406836461126005, "grad_norm": 0.07754815369844437, "learning_rate": 1.1195706858963611e-05, "loss": 0.0031, "step": 26300 }, { "epoch": 4.415214477211796, "grad_norm": 0.31846073269844055, "learning_rate": 1.1178936776790208e-05, "loss": 0.0024, "step": 26350 }, { "epoch": 4.423592493297587, "grad_norm": 0.3100847601890564, "learning_rate": 1.1162166694616805e-05, "loss": 0.0029, "step": 26400 }, { "epoch": 4.431970509383378, "grad_norm": 0.08093760907649994, "learning_rate": 1.1145396612443402e-05, "loss": 0.0023, "step": 26450 }, { "epoch": 4.440348525469169, "grad_norm": 0.12065700441598892, "learning_rate": 1.112862653027e-05, "loss": 0.0024, "step": 26500 }, { "epoch": 4.44872654155496, "grad_norm": 0.18668776750564575, "learning_rate": 1.1111856448096596e-05, "loss": 0.0021, "step": 26550 }, { "epoch": 4.457104557640751, "grad_norm": 0.06442166119813919, "learning_rate": 1.1095086365923193e-05, "loss": 0.0022, "step": 26600 }, { "epoch": 4.465482573726542, "grad_norm": 0.3723543584346771, "learning_rate": 1.107831628374979e-05, "loss": 0.0026, "step": 26650 }, { "epoch": 4.4738605898123325, "grad_norm": 0.17430204153060913, "learning_rate": 1.1061546201576389e-05, "loss": 0.0025, "step": 26700 }, { "epoch": 4.482238605898123, "grad_norm": 0.24499832093715668, "learning_rate": 1.1044776119402986e-05, "loss": 0.0026, "step": 26750 }, { "epoch": 4.490616621983914, "grad_norm": 0.12160493433475494, "learning_rate": 1.1028006037229583e-05, "loss": 0.0027, "step": 26800 }, { "epoch": 4.498994638069705, "grad_norm": 0.13980576395988464, "learning_rate": 1.101123595505618e-05, "loss": 0.0025, "step": 26850 }, { "epoch": 4.507372654155496, "grad_norm": 0.9759100079536438, "learning_rate": 1.0994465872882779e-05, "loss": 0.0027, "step": 26900 }, { "epoch": 4.515750670241287, "grad_norm": 0.26894333958625793, "learning_rate": 1.0977695790709376e-05, "loss": 0.0025, "step": 26950 }, { "epoch": 4.524128686327078, "grad_norm": 0.17025631666183472, "learning_rate": 1.0960925708535973e-05, "loss": 0.0026, "step": 27000 }, { "epoch": 4.532506702412869, "grad_norm": 0.38619130849838257, "learning_rate": 1.094415562636257e-05, "loss": 0.0027, "step": 27050 }, { "epoch": 4.54088471849866, "grad_norm": 0.2916272282600403, "learning_rate": 1.0927385544189169e-05, "loss": 0.003, "step": 27100 }, { "epoch": 4.549262734584451, "grad_norm": 0.26764917373657227, "learning_rate": 1.0910615462015766e-05, "loss": 0.0024, "step": 27150 }, { "epoch": 4.557640750670242, "grad_norm": 0.1611230969429016, "learning_rate": 1.0893845379842363e-05, "loss": 0.003, "step": 27200 }, { "epoch": 4.566018766756033, "grad_norm": 0.23360604047775269, "learning_rate": 1.087707529766896e-05, "loss": 0.003, "step": 27250 }, { "epoch": 4.5743967828418235, "grad_norm": 0.19292519986629486, "learning_rate": 1.0860305215495557e-05, "loss": 0.0026, "step": 27300 }, { "epoch": 4.582774798927614, "grad_norm": 0.4826861023902893, "learning_rate": 1.0843535133322154e-05, "loss": 0.0029, "step": 27350 }, { "epoch": 4.591152815013404, "grad_norm": 0.11598275601863861, "learning_rate": 1.082676505114875e-05, "loss": 0.003, "step": 27400 }, { "epoch": 4.599530831099195, "grad_norm": 0.2721264958381653, "learning_rate": 1.0809994968975348e-05, "loss": 0.0032, "step": 27450 }, { "epoch": 4.607908847184986, "grad_norm": 0.1644926518201828, "learning_rate": 1.0793224886801945e-05, "loss": 0.003, "step": 27500 }, { "epoch": 4.616286863270777, "grad_norm": 0.17666374146938324, "learning_rate": 1.0776454804628545e-05, "loss": 0.0028, "step": 27550 }, { "epoch": 4.624664879356568, "grad_norm": 0.21726448833942413, "learning_rate": 1.0759684722455142e-05, "loss": 0.0025, "step": 27600 }, { "epoch": 4.633042895442359, "grad_norm": 0.21972903609275818, "learning_rate": 1.0742914640281739e-05, "loss": 0.0028, "step": 27650 }, { "epoch": 4.64142091152815, "grad_norm": 0.2247893065214157, "learning_rate": 1.0726144558108336e-05, "loss": 0.0026, "step": 27700 }, { "epoch": 4.649798927613941, "grad_norm": 0.1289321780204773, "learning_rate": 1.0709374475934933e-05, "loss": 0.0032, "step": 27750 }, { "epoch": 4.658176943699732, "grad_norm": 0.4954499304294586, "learning_rate": 1.069260439376153e-05, "loss": 0.0028, "step": 27800 }, { "epoch": 4.666554959785523, "grad_norm": 0.18240614235401154, "learning_rate": 1.0675834311588127e-05, "loss": 0.0027, "step": 27850 }, { "epoch": 4.674932975871314, "grad_norm": 0.1933482438325882, "learning_rate": 1.0659064229414724e-05, "loss": 0.0031, "step": 27900 }, { "epoch": 4.6833109919571045, "grad_norm": 0.11678989976644516, "learning_rate": 1.0642294147241323e-05, "loss": 0.0032, "step": 27950 }, { "epoch": 4.6916890080428955, "grad_norm": 0.18577493727207184, "learning_rate": 1.062552406506792e-05, "loss": 0.0026, "step": 28000 }, { "epoch": 4.700067024128686, "grad_norm": 0.07589305937290192, "learning_rate": 1.0608753982894517e-05, "loss": 0.003, "step": 28050 }, { "epoch": 4.708445040214477, "grad_norm": 0.20295588672161102, "learning_rate": 1.0591983900721114e-05, "loss": 0.0028, "step": 28100 }, { "epoch": 4.716823056300268, "grad_norm": 0.27186664938926697, "learning_rate": 1.0575213818547711e-05, "loss": 0.0033, "step": 28150 }, { "epoch": 4.725201072386059, "grad_norm": 0.3505285382270813, "learning_rate": 1.0558443736374308e-05, "loss": 0.0029, "step": 28200 }, { "epoch": 4.73357908847185, "grad_norm": 0.4643058776855469, "learning_rate": 1.0541673654200905e-05, "loss": 0.0033, "step": 28250 }, { "epoch": 4.741957104557641, "grad_norm": 0.29970914125442505, "learning_rate": 1.0524903572027502e-05, "loss": 0.0029, "step": 28300 }, { "epoch": 4.750335120643432, "grad_norm": 0.3563650846481323, "learning_rate": 1.0508133489854102e-05, "loss": 0.0026, "step": 28350 }, { "epoch": 4.758713136729223, "grad_norm": 0.2816406190395355, "learning_rate": 1.04913634076807e-05, "loss": 0.0033, "step": 28400 }, { "epoch": 4.767091152815014, "grad_norm": 0.3998458981513977, "learning_rate": 1.0474593325507296e-05, "loss": 0.0033, "step": 28450 }, { "epoch": 4.775469168900805, "grad_norm": 0.29152771830558777, "learning_rate": 1.0457823243333893e-05, "loss": 0.0031, "step": 28500 }, { "epoch": 4.783847184986596, "grad_norm": 0.3733079731464386, "learning_rate": 1.044105316116049e-05, "loss": 0.0031, "step": 28550 }, { "epoch": 4.792225201072386, "grad_norm": 0.2442307472229004, "learning_rate": 1.0424283078987087e-05, "loss": 0.0032, "step": 28600 }, { "epoch": 4.8006032171581765, "grad_norm": 0.6178602576255798, "learning_rate": 1.0407512996813684e-05, "loss": 0.0029, "step": 28650 }, { "epoch": 4.808981233243967, "grad_norm": 0.3169240951538086, "learning_rate": 1.0390742914640281e-05, "loss": 0.0029, "step": 28700 }, { "epoch": 4.817359249329758, "grad_norm": 0.21497473120689392, "learning_rate": 1.037397283246688e-05, "loss": 0.0032, "step": 28750 }, { "epoch": 4.825737265415549, "grad_norm": 0.4647163450717926, "learning_rate": 1.0357202750293477e-05, "loss": 0.0027, "step": 28800 }, { "epoch": 4.83411528150134, "grad_norm": 0.18522508442401886, "learning_rate": 1.0340432668120076e-05, "loss": 0.0026, "step": 28850 }, { "epoch": 4.842493297587131, "grad_norm": 0.201819509267807, "learning_rate": 1.0323662585946673e-05, "loss": 0.0025, "step": 28900 }, { "epoch": 4.850871313672922, "grad_norm": 0.2343200445175171, "learning_rate": 1.030689250377327e-05, "loss": 0.0027, "step": 28950 }, { "epoch": 4.859249329758713, "grad_norm": 0.164067804813385, "learning_rate": 1.0290122421599867e-05, "loss": 0.0032, "step": 29000 }, { "epoch": 4.867627345844504, "grad_norm": 1.0820327997207642, "learning_rate": 1.0273352339426464e-05, "loss": 0.0033, "step": 29050 }, { "epoch": 4.876005361930295, "grad_norm": 0.18911249935626984, "learning_rate": 1.0256582257253061e-05, "loss": 0.0024, "step": 29100 }, { "epoch": 4.884383378016086, "grad_norm": 0.1403694599866867, "learning_rate": 1.023981217507966e-05, "loss": 0.0028, "step": 29150 }, { "epoch": 4.892761394101877, "grad_norm": 0.20968593657016754, "learning_rate": 1.0223042092906257e-05, "loss": 0.0034, "step": 29200 }, { "epoch": 4.9011394101876675, "grad_norm": 0.26832762360572815, "learning_rate": 1.0206272010732854e-05, "loss": 0.0035, "step": 29250 }, { "epoch": 4.909517426273458, "grad_norm": 0.26747608184814453, "learning_rate": 1.018950192855945e-05, "loss": 0.0034, "step": 29300 }, { "epoch": 4.917895442359249, "grad_norm": 0.999813437461853, "learning_rate": 1.0172731846386048e-05, "loss": 0.0029, "step": 29350 }, { "epoch": 4.92627345844504, "grad_norm": 0.33220162987709045, "learning_rate": 1.0155961764212645e-05, "loss": 0.0033, "step": 29400 }, { "epoch": 4.934651474530831, "grad_norm": 0.13821391761302948, "learning_rate": 1.0139191682039242e-05, "loss": 0.003, "step": 29450 }, { "epoch": 4.943029490616622, "grad_norm": 0.33970770239830017, "learning_rate": 1.0122421599865839e-05, "loss": 0.0028, "step": 29500 }, { "epoch": 4.951407506702413, "grad_norm": 0.09418370574712753, "learning_rate": 1.0105651517692439e-05, "loss": 0.0029, "step": 29550 }, { "epoch": 4.959785522788204, "grad_norm": 0.10416509956121445, "learning_rate": 1.0088881435519036e-05, "loss": 0.0031, "step": 29600 }, { "epoch": 4.968163538873995, "grad_norm": 0.7082052230834961, "learning_rate": 1.0072111353345633e-05, "loss": 0.0025, "step": 29650 }, { "epoch": 4.976541554959786, "grad_norm": 0.3095639944076538, "learning_rate": 1.005534127117223e-05, "loss": 0.0035, "step": 29700 }, { "epoch": 4.984919571045577, "grad_norm": 0.119889035820961, "learning_rate": 1.0038571188998827e-05, "loss": 0.0033, "step": 29750 }, { "epoch": 4.993297587131368, "grad_norm": 0.29492849111557007, "learning_rate": 1.0021801106825424e-05, "loss": 0.0029, "step": 29800 }, { "epoch": 5.0016756032171585, "grad_norm": 0.0865137055516243, "learning_rate": 1.0005031024652021e-05, "loss": 0.0028, "step": 29850 }, { "epoch": 5.0100536193029495, "grad_norm": 0.22691671550273895, "learning_rate": 9.98826094247862e-06, "loss": 0.0011, "step": 29900 }, { "epoch": 5.01843163538874, "grad_norm": 0.0516495518386364, "learning_rate": 9.971490860305217e-06, "loss": 0.0012, "step": 29950 }, { "epoch": 5.02680965147453, "grad_norm": 0.08190739154815674, "learning_rate": 9.954720778131814e-06, "loss": 0.0011, "step": 30000 }, { "epoch": 5.02680965147453, "eval_loss": 0.05592558532953262, "eval_runtime": 0.3145, "eval_samples_per_second": 63.586, "eval_steps_per_second": 3.179, "step": 30000 }, { "epoch": 5.035187667560321, "grad_norm": 0.025882409885525703, "learning_rate": 9.937950695958411e-06, "loss": 0.0008, "step": 30050 }, { "epoch": 5.043565683646112, "grad_norm": 0.12556754052639008, "learning_rate": 9.921180613785008e-06, "loss": 0.001, "step": 30100 }, { "epoch": 5.051943699731903, "grad_norm": 0.09527916461229324, "learning_rate": 9.904410531611605e-06, "loss": 0.0011, "step": 30150 }, { "epoch": 5.060321715817694, "grad_norm": 0.30216673016548157, "learning_rate": 9.887640449438202e-06, "loss": 0.0013, "step": 30200 }, { "epoch": 5.068699731903485, "grad_norm": 0.030431820079684258, "learning_rate": 9.8708703672648e-06, "loss": 0.0011, "step": 30250 }, { "epoch": 5.077077747989276, "grad_norm": 0.08387458324432373, "learning_rate": 9.854100285091398e-06, "loss": 0.0008, "step": 30300 }, { "epoch": 5.085455764075067, "grad_norm": 0.12422385066747665, "learning_rate": 9.837330202917995e-06, "loss": 0.001, "step": 30350 }, { "epoch": 5.093833780160858, "grad_norm": 0.05497809499502182, "learning_rate": 9.820560120744592e-06, "loss": 0.001, "step": 30400 }, { "epoch": 5.102211796246649, "grad_norm": 0.21757960319519043, "learning_rate": 9.803790038571189e-06, "loss": 0.0011, "step": 30450 }, { "epoch": 5.11058981233244, "grad_norm": 0.2624013423919678, "learning_rate": 9.787019956397787e-06, "loss": 0.0012, "step": 30500 }, { "epoch": 5.1189678284182305, "grad_norm": 0.12058177590370178, "learning_rate": 9.770249874224384e-06, "loss": 0.0011, "step": 30550 }, { "epoch": 5.127345844504021, "grad_norm": 0.09058215469121933, "learning_rate": 9.753479792050981e-06, "loss": 0.001, "step": 30600 }, { "epoch": 5.135723860589812, "grad_norm": 0.07257585972547531, "learning_rate": 9.736709709877578e-06, "loss": 0.0011, "step": 30650 }, { "epoch": 5.144101876675603, "grad_norm": 0.07394664734601974, "learning_rate": 9.719939627704177e-06, "loss": 0.0014, "step": 30700 }, { "epoch": 5.152479892761394, "grad_norm": 0.08835545182228088, "learning_rate": 9.703169545530774e-06, "loss": 0.0011, "step": 30750 }, { "epoch": 5.160857908847185, "grad_norm": 0.05140378698706627, "learning_rate": 9.686399463357371e-06, "loss": 0.0011, "step": 30800 }, { "epoch": 5.169235924932976, "grad_norm": 0.11998426169157028, "learning_rate": 9.669629381183968e-06, "loss": 0.0013, "step": 30850 }, { "epoch": 5.177613941018767, "grad_norm": 0.11740259826183319, "learning_rate": 9.652859299010567e-06, "loss": 0.0011, "step": 30900 }, { "epoch": 5.185991957104558, "grad_norm": 0.3295953869819641, "learning_rate": 9.636089216837164e-06, "loss": 0.0011, "step": 30950 }, { "epoch": 5.194369973190349, "grad_norm": 0.21252810955047607, "learning_rate": 9.619319134663761e-06, "loss": 0.0012, "step": 31000 }, { "epoch": 5.20274798927614, "grad_norm": 0.1684710681438446, "learning_rate": 9.602549052490358e-06, "loss": 0.001, "step": 31050 }, { "epoch": 5.211126005361931, "grad_norm": 0.30938273668289185, "learning_rate": 9.585778970316955e-06, "loss": 0.0013, "step": 31100 }, { "epoch": 5.2195040214477215, "grad_norm": 0.13435423374176025, "learning_rate": 9.569008888143552e-06, "loss": 0.0012, "step": 31150 }, { "epoch": 5.227882037533512, "grad_norm": 0.24395543336868286, "learning_rate": 9.552238805970149e-06, "loss": 0.001, "step": 31200 }, { "epoch": 5.236260053619303, "grad_norm": 0.07691800594329834, "learning_rate": 9.535468723796748e-06, "loss": 0.0011, "step": 31250 }, { "epoch": 5.244638069705093, "grad_norm": 0.07506980746984482, "learning_rate": 9.518698641623345e-06, "loss": 0.001, "step": 31300 }, { "epoch": 5.253016085790884, "grad_norm": 0.09802160412073135, "learning_rate": 9.501928559449942e-06, "loss": 0.0009, "step": 31350 }, { "epoch": 5.261394101876675, "grad_norm": 0.08386828005313873, "learning_rate": 9.485158477276539e-06, "loss": 0.0011, "step": 31400 }, { "epoch": 5.269772117962466, "grad_norm": 0.23838509619235992, "learning_rate": 9.468388395103136e-06, "loss": 0.0011, "step": 31450 }, { "epoch": 5.278150134048257, "grad_norm": 0.06687796860933304, "learning_rate": 9.451618312929734e-06, "loss": 0.001, "step": 31500 }, { "epoch": 5.286528150134048, "grad_norm": 0.06589027494192123, "learning_rate": 9.434848230756332e-06, "loss": 0.0011, "step": 31550 }, { "epoch": 5.294906166219839, "grad_norm": 0.21493591368198395, "learning_rate": 9.418078148582929e-06, "loss": 0.0012, "step": 31600 }, { "epoch": 5.30328418230563, "grad_norm": 0.20591777563095093, "learning_rate": 9.401308066409526e-06, "loss": 0.0012, "step": 31650 }, { "epoch": 5.311662198391421, "grad_norm": 0.16145972907543182, "learning_rate": 9.384537984236124e-06, "loss": 0.0012, "step": 31700 }, { "epoch": 5.320040214477212, "grad_norm": 0.07270830124616623, "learning_rate": 9.367767902062721e-06, "loss": 0.001, "step": 31750 }, { "epoch": 5.328418230563003, "grad_norm": 0.12921959161758423, "learning_rate": 9.350997819889318e-06, "loss": 0.0012, "step": 31800 }, { "epoch": 5.3367962466487935, "grad_norm": 0.24689610302448273, "learning_rate": 9.334227737715915e-06, "loss": 0.0011, "step": 31850 }, { "epoch": 5.345174262734584, "grad_norm": 0.14935541152954102, "learning_rate": 9.317457655542514e-06, "loss": 0.001, "step": 31900 }, { "epoch": 5.353552278820375, "grad_norm": 0.05026477575302124, "learning_rate": 9.300687573369111e-06, "loss": 0.0009, "step": 31950 }, { "epoch": 5.361930294906166, "grad_norm": 0.11298377066850662, "learning_rate": 9.283917491195708e-06, "loss": 0.0012, "step": 32000 }, { "epoch": 5.370308310991957, "grad_norm": 0.04898526519536972, "learning_rate": 9.267147409022305e-06, "loss": 0.0015, "step": 32050 }, { "epoch": 5.378686327077748, "grad_norm": 0.07678736001253128, "learning_rate": 9.250377326848902e-06, "loss": 0.0013, "step": 32100 }, { "epoch": 5.387064343163539, "grad_norm": 0.08714163303375244, "learning_rate": 9.233607244675499e-06, "loss": 0.001, "step": 32150 }, { "epoch": 5.39544235924933, "grad_norm": 0.14269877970218658, "learning_rate": 9.216837162502098e-06, "loss": 0.001, "step": 32200 }, { "epoch": 5.403820375335121, "grad_norm": 0.0840928927063942, "learning_rate": 9.200067080328695e-06, "loss": 0.0012, "step": 32250 }, { "epoch": 5.412198391420912, "grad_norm": 0.049975261092185974, "learning_rate": 9.183296998155292e-06, "loss": 0.0013, "step": 32300 }, { "epoch": 5.420576407506703, "grad_norm": 0.10942261666059494, "learning_rate": 9.166526915981889e-06, "loss": 0.0013, "step": 32350 }, { "epoch": 5.428954423592494, "grad_norm": 0.09510983526706696, "learning_rate": 9.149756833808486e-06, "loss": 0.0013, "step": 32400 }, { "epoch": 5.4373324396782845, "grad_norm": 0.07059191167354584, "learning_rate": 9.132986751635083e-06, "loss": 0.0013, "step": 32450 }, { "epoch": 5.445710455764075, "grad_norm": 0.09271156042814255, "learning_rate": 9.116216669461682e-06, "loss": 0.0014, "step": 32500 }, { "epoch": 5.454088471849866, "grad_norm": 0.5445387363433838, "learning_rate": 9.099446587288279e-06, "loss": 0.0017, "step": 32550 }, { "epoch": 5.462466487935657, "grad_norm": 0.5922443270683289, "learning_rate": 9.082676505114876e-06, "loss": 0.0012, "step": 32600 }, { "epoch": 5.470844504021448, "grad_norm": 0.08508787304162979, "learning_rate": 9.065906422941473e-06, "loss": 0.0013, "step": 32650 }, { "epoch": 5.479222520107238, "grad_norm": 0.10297244787216187, "learning_rate": 9.049136340768071e-06, "loss": 0.0015, "step": 32700 }, { "epoch": 5.487600536193029, "grad_norm": 0.20003701746463776, "learning_rate": 9.032366258594668e-06, "loss": 0.001, "step": 32750 }, { "epoch": 5.49597855227882, "grad_norm": 0.33047032356262207, "learning_rate": 9.015596176421265e-06, "loss": 0.0011, "step": 32800 }, { "epoch": 5.504356568364611, "grad_norm": 0.12265091389417648, "learning_rate": 8.998826094247862e-06, "loss": 0.0012, "step": 32850 }, { "epoch": 5.512734584450402, "grad_norm": 0.1573624163866043, "learning_rate": 8.982056012074461e-06, "loss": 0.0013, "step": 32900 }, { "epoch": 5.521112600536193, "grad_norm": 0.13570530712604523, "learning_rate": 8.965285929901058e-06, "loss": 0.0012, "step": 32950 }, { "epoch": 5.529490616621984, "grad_norm": 0.1362573504447937, "learning_rate": 8.948515847727655e-06, "loss": 0.0015, "step": 33000 }, { "epoch": 5.537868632707775, "grad_norm": 0.16211983561515808, "learning_rate": 8.931745765554252e-06, "loss": 0.0013, "step": 33050 }, { "epoch": 5.5462466487935655, "grad_norm": 0.1816491037607193, "learning_rate": 8.914975683380849e-06, "loss": 0.0013, "step": 33100 }, { "epoch": 5.5546246648793565, "grad_norm": 0.12222578376531601, "learning_rate": 8.898205601207446e-06, "loss": 0.0012, "step": 33150 }, { "epoch": 5.563002680965147, "grad_norm": 0.24587097764015198, "learning_rate": 8.881435519034045e-06, "loss": 0.0014, "step": 33200 }, { "epoch": 5.571380697050938, "grad_norm": 0.13261163234710693, "learning_rate": 8.864665436860642e-06, "loss": 0.0012, "step": 33250 }, { "epoch": 5.579758713136729, "grad_norm": 0.1050226092338562, "learning_rate": 8.847895354687239e-06, "loss": 0.0013, "step": 33300 }, { "epoch": 5.58813672922252, "grad_norm": 0.1119270995259285, "learning_rate": 8.831125272513836e-06, "loss": 0.0013, "step": 33350 }, { "epoch": 5.596514745308311, "grad_norm": 0.41666361689567566, "learning_rate": 8.814355190340433e-06, "loss": 0.0013, "step": 33400 }, { "epoch": 5.604892761394102, "grad_norm": 0.21499872207641602, "learning_rate": 8.79758510816703e-06, "loss": 0.0013, "step": 33450 }, { "epoch": 5.613270777479893, "grad_norm": 0.1437048465013504, "learning_rate": 8.780815025993627e-06, "loss": 0.0013, "step": 33500 }, { "epoch": 5.621648793565684, "grad_norm": 0.10052605718374252, "learning_rate": 8.764044943820226e-06, "loss": 0.0016, "step": 33550 }, { "epoch": 5.630026809651475, "grad_norm": 0.16671398282051086, "learning_rate": 8.747274861646823e-06, "loss": 0.0013, "step": 33600 }, { "epoch": 5.638404825737266, "grad_norm": 0.1302991360425949, "learning_rate": 8.73050477947342e-06, "loss": 0.0013, "step": 33650 }, { "epoch": 5.646782841823057, "grad_norm": 0.07595470547676086, "learning_rate": 8.713734697300017e-06, "loss": 0.0013, "step": 33700 }, { "epoch": 5.6551608579088475, "grad_norm": 0.08618602156639099, "learning_rate": 8.696964615126615e-06, "loss": 0.0011, "step": 33750 }, { "epoch": 5.663538873994638, "grad_norm": 0.18707716464996338, "learning_rate": 8.680194532953212e-06, "loss": 0.0014, "step": 33800 }, { "epoch": 5.671916890080429, "grad_norm": 0.167672261595726, "learning_rate": 8.66342445077981e-06, "loss": 0.0015, "step": 33850 }, { "epoch": 5.680294906166219, "grad_norm": 0.15763333439826965, "learning_rate": 8.646654368606406e-06, "loss": 0.0015, "step": 33900 }, { "epoch": 5.68867292225201, "grad_norm": 0.1692523956298828, "learning_rate": 8.629884286433005e-06, "loss": 0.0014, "step": 33950 }, { "epoch": 5.697050938337801, "grad_norm": 0.06355728209018707, "learning_rate": 8.613114204259602e-06, "loss": 0.0016, "step": 34000 }, { "epoch": 5.705428954423592, "grad_norm": 0.06145229935646057, "learning_rate": 8.596344122086199e-06, "loss": 0.0014, "step": 34050 }, { "epoch": 5.713806970509383, "grad_norm": 0.1229192316532135, "learning_rate": 8.579574039912796e-06, "loss": 0.0016, "step": 34100 }, { "epoch": 5.722184986595174, "grad_norm": 0.07723601907491684, "learning_rate": 8.562803957739395e-06, "loss": 0.0012, "step": 34150 }, { "epoch": 5.730563002680965, "grad_norm": 0.16651087999343872, "learning_rate": 8.546033875565992e-06, "loss": 0.0012, "step": 34200 }, { "epoch": 5.738941018766756, "grad_norm": 0.19212378561496735, "learning_rate": 8.529263793392589e-06, "loss": 0.0015, "step": 34250 }, { "epoch": 5.747319034852547, "grad_norm": 0.14563122391700745, "learning_rate": 8.512493711219186e-06, "loss": 0.0013, "step": 34300 }, { "epoch": 5.755697050938338, "grad_norm": 0.14981712400913239, "learning_rate": 8.495723629045783e-06, "loss": 0.0013, "step": 34350 }, { "epoch": 5.7640750670241285, "grad_norm": 0.3324640989303589, "learning_rate": 8.47895354687238e-06, "loss": 0.0014, "step": 34400 }, { "epoch": 5.7724530831099194, "grad_norm": 0.2827085256576538, "learning_rate": 8.462183464698977e-06, "loss": 0.0014, "step": 34450 }, { "epoch": 5.78083109919571, "grad_norm": 0.3784811198711395, "learning_rate": 8.445413382525574e-06, "loss": 0.0015, "step": 34500 }, { "epoch": 5.789209115281501, "grad_norm": 0.08754808455705643, "learning_rate": 8.428643300352173e-06, "loss": 0.0013, "step": 34550 }, { "epoch": 5.797587131367292, "grad_norm": 0.2719215750694275, "learning_rate": 8.41187321817877e-06, "loss": 0.0014, "step": 34600 }, { "epoch": 5.805965147453083, "grad_norm": 0.17088165879249573, "learning_rate": 8.395103136005367e-06, "loss": 0.0016, "step": 34650 }, { "epoch": 5.814343163538874, "grad_norm": 0.3564954400062561, "learning_rate": 8.378333053831964e-06, "loss": 0.0013, "step": 34700 }, { "epoch": 5.822721179624665, "grad_norm": 0.1912204474210739, "learning_rate": 8.361562971658562e-06, "loss": 0.0017, "step": 34750 }, { "epoch": 5.831099195710456, "grad_norm": 0.2299826443195343, "learning_rate": 8.34479288948516e-06, "loss": 0.0011, "step": 34800 }, { "epoch": 5.839477211796247, "grad_norm": 0.07874714583158493, "learning_rate": 8.328022807311756e-06, "loss": 0.0016, "step": 34850 }, { "epoch": 5.847855227882038, "grad_norm": 0.1063261479139328, "learning_rate": 8.311252725138353e-06, "loss": 0.0014, "step": 34900 }, { "epoch": 5.856233243967829, "grad_norm": 0.07933440804481506, "learning_rate": 8.294482642964952e-06, "loss": 0.0013, "step": 34950 }, { "epoch": 5.8646112600536195, "grad_norm": 0.14268645644187927, "learning_rate": 8.277712560791549e-06, "loss": 0.0014, "step": 35000 }, { "epoch": 5.8729892761394105, "grad_norm": 0.24926510453224182, "learning_rate": 8.260942478618146e-06, "loss": 0.0014, "step": 35050 }, { "epoch": 5.881367292225201, "grad_norm": 0.12582330405712128, "learning_rate": 8.244172396444743e-06, "loss": 0.0013, "step": 35100 }, { "epoch": 5.889745308310992, "grad_norm": 0.04589623957872391, "learning_rate": 8.227402314271342e-06, "loss": 0.0012, "step": 35150 }, { "epoch": 5.898123324396783, "grad_norm": 0.3229510188102722, "learning_rate": 8.210632232097939e-06, "loss": 0.0011, "step": 35200 }, { "epoch": 5.906501340482574, "grad_norm": 0.12953703105449677, "learning_rate": 8.193862149924536e-06, "loss": 0.0013, "step": 35250 }, { "epoch": 5.914879356568365, "grad_norm": 0.31099674105644226, "learning_rate": 8.177092067751133e-06, "loss": 0.0013, "step": 35300 }, { "epoch": 5.923257372654156, "grad_norm": 0.17244383692741394, "learning_rate": 8.16032198557773e-06, "loss": 0.0012, "step": 35350 }, { "epoch": 5.931635388739946, "grad_norm": 0.6794390678405762, "learning_rate": 8.143551903404327e-06, "loss": 0.0011, "step": 35400 }, { "epoch": 5.940013404825737, "grad_norm": 0.15421807765960693, "learning_rate": 8.126781821230924e-06, "loss": 0.0015, "step": 35450 }, { "epoch": 5.948391420911528, "grad_norm": 0.09070286899805069, "learning_rate": 8.110011739057521e-06, "loss": 0.0015, "step": 35500 }, { "epoch": 5.956769436997319, "grad_norm": 0.13311493396759033, "learning_rate": 8.09324165688412e-06, "loss": 0.0014, "step": 35550 }, { "epoch": 5.96514745308311, "grad_norm": 0.3226371705532074, "learning_rate": 8.076471574710717e-06, "loss": 0.0015, "step": 35600 }, { "epoch": 5.973525469168901, "grad_norm": 0.25139835476875305, "learning_rate": 8.059701492537314e-06, "loss": 0.0016, "step": 35650 }, { "epoch": 5.9819034852546915, "grad_norm": 0.14944802224636078, "learning_rate": 8.04293141036391e-06, "loss": 0.0013, "step": 35700 }, { "epoch": 5.990281501340482, "grad_norm": 0.210645392537117, "learning_rate": 8.02616132819051e-06, "loss": 0.0014, "step": 35750 }, { "epoch": 5.998659517426273, "grad_norm": 0.3287517726421356, "learning_rate": 8.009391246017106e-06, "loss": 0.0013, "step": 35800 }, { "epoch": 6.007037533512064, "grad_norm": 0.08051057904958725, "learning_rate": 7.992621163843703e-06, "loss": 0.0007, "step": 35850 }, { "epoch": 6.015415549597855, "grad_norm": 0.03923693299293518, "learning_rate": 7.9758510816703e-06, "loss": 0.0006, "step": 35900 }, { "epoch": 6.023793565683646, "grad_norm": 0.03783218562602997, "learning_rate": 7.959080999496899e-06, "loss": 0.0009, "step": 35950 }, { "epoch": 6.032171581769437, "grad_norm": 0.251902312040329, "learning_rate": 7.942310917323496e-06, "loss": 0.0007, "step": 36000 }, { "epoch": 6.040549597855228, "grad_norm": 0.04270997270941734, "learning_rate": 7.925540835150093e-06, "loss": 0.0006, "step": 36050 }, { "epoch": 6.048927613941019, "grad_norm": 0.07284736633300781, "learning_rate": 7.90877075297669e-06, "loss": 0.0007, "step": 36100 }, { "epoch": 6.05730563002681, "grad_norm": 0.06689571589231491, "learning_rate": 7.892000670803289e-06, "loss": 0.0007, "step": 36150 }, { "epoch": 6.065683646112601, "grad_norm": 0.024217478930950165, "learning_rate": 7.875230588629886e-06, "loss": 0.0006, "step": 36200 }, { "epoch": 6.074061662198392, "grad_norm": 0.09656205028295517, "learning_rate": 7.858460506456483e-06, "loss": 0.0006, "step": 36250 }, { "epoch": 6.0824396782841825, "grad_norm": 0.030422423034906387, "learning_rate": 7.84169042428308e-06, "loss": 0.0007, "step": 36300 }, { "epoch": 6.0908176943699734, "grad_norm": 0.08347397297620773, "learning_rate": 7.824920342109677e-06, "loss": 0.0006, "step": 36350 }, { "epoch": 6.099195710455764, "grad_norm": 0.027809837833046913, "learning_rate": 7.808150259936274e-06, "loss": 0.0006, "step": 36400 }, { "epoch": 6.107573726541555, "grad_norm": 0.2227599024772644, "learning_rate": 7.791380177762871e-06, "loss": 0.0007, "step": 36450 }, { "epoch": 6.115951742627346, "grad_norm": 0.1868954300880432, "learning_rate": 7.774610095589468e-06, "loss": 0.0007, "step": 36500 }, { "epoch": 6.124329758713137, "grad_norm": 0.1395631730556488, "learning_rate": 7.757840013416067e-06, "loss": 0.0005, "step": 36550 }, { "epoch": 6.132707774798928, "grad_norm": 0.029958348721265793, "learning_rate": 7.741069931242664e-06, "loss": 0.0005, "step": 36600 }, { "epoch": 6.141085790884718, "grad_norm": 0.04532192647457123, "learning_rate": 7.72429984906926e-06, "loss": 0.0007, "step": 36650 }, { "epoch": 6.149463806970509, "grad_norm": 0.0366247221827507, "learning_rate": 7.707529766895858e-06, "loss": 0.0005, "step": 36700 }, { "epoch": 6.1578418230563, "grad_norm": 0.07616298645734787, "learning_rate": 7.690759684722455e-06, "loss": 0.0007, "step": 36750 }, { "epoch": 6.166219839142091, "grad_norm": 0.05013656988739967, "learning_rate": 7.673989602549053e-06, "loss": 0.0007, "step": 36800 }, { "epoch": 6.174597855227882, "grad_norm": 0.08419755846261978, "learning_rate": 7.65721952037565e-06, "loss": 0.0005, "step": 36850 }, { "epoch": 6.182975871313673, "grad_norm": 0.11614430695772171, "learning_rate": 7.640449438202247e-06, "loss": 0.0004, "step": 36900 }, { "epoch": 6.191353887399464, "grad_norm": 0.1487479954957962, "learning_rate": 7.6236793560288445e-06, "loss": 0.0007, "step": 36950 }, { "epoch": 6.1997319034852545, "grad_norm": 0.1483132392168045, "learning_rate": 7.606909273855443e-06, "loss": 0.0006, "step": 37000 }, { "epoch": 6.208109919571045, "grad_norm": 0.034125398844480515, "learning_rate": 7.59013919168204e-06, "loss": 0.0005, "step": 37050 }, { "epoch": 6.216487935656836, "grad_norm": 0.0705786794424057, "learning_rate": 7.573369109508637e-06, "loss": 0.0007, "step": 37100 }, { "epoch": 6.224865951742627, "grad_norm": 0.03942383453249931, "learning_rate": 7.556599027335234e-06, "loss": 0.0007, "step": 37150 }, { "epoch": 6.233243967828418, "grad_norm": 0.045597631484270096, "learning_rate": 7.539828945161832e-06, "loss": 0.0006, "step": 37200 }, { "epoch": 6.241621983914209, "grad_norm": 0.06973922997713089, "learning_rate": 7.523058862988429e-06, "loss": 0.0005, "step": 37250 }, { "epoch": 6.25, "grad_norm": 0.042143791913986206, "learning_rate": 7.506288780815026e-06, "loss": 0.0006, "step": 37300 }, { "epoch": 6.258378016085791, "grad_norm": 0.02553519792854786, "learning_rate": 7.489518698641623e-06, "loss": 0.0005, "step": 37350 }, { "epoch": 6.266756032171582, "grad_norm": 0.07629157602787018, "learning_rate": 7.472748616468222e-06, "loss": 0.0005, "step": 37400 }, { "epoch": 6.275134048257373, "grad_norm": 0.13083019852638245, "learning_rate": 7.455978534294819e-06, "loss": 0.0006, "step": 37450 }, { "epoch": 6.283512064343164, "grad_norm": 0.024578507989645004, "learning_rate": 7.439208452121416e-06, "loss": 0.0005, "step": 37500 }, { "epoch": 6.291890080428955, "grad_norm": 0.2308337688446045, "learning_rate": 7.422438369948013e-06, "loss": 0.0005, "step": 37550 }, { "epoch": 6.3002680965147455, "grad_norm": 0.09496274590492249, "learning_rate": 7.405668287774611e-06, "loss": 0.0005, "step": 37600 }, { "epoch": 6.308646112600536, "grad_norm": 0.06111710146069527, "learning_rate": 7.388898205601209e-06, "loss": 0.0007, "step": 37650 }, { "epoch": 6.317024128686327, "grad_norm": 0.30509406328201294, "learning_rate": 7.372128123427806e-06, "loss": 0.0006, "step": 37700 }, { "epoch": 6.325402144772118, "grad_norm": 0.041681017726659775, "learning_rate": 7.355358041254403e-06, "loss": 0.0006, "step": 37750 }, { "epoch": 6.333780160857909, "grad_norm": 0.05730760842561722, "learning_rate": 7.3385879590810005e-06, "loss": 0.0005, "step": 37800 }, { "epoch": 6.3421581769437, "grad_norm": 0.03064553625881672, "learning_rate": 7.3218178769075975e-06, "loss": 0.0006, "step": 37850 }, { "epoch": 6.350536193029491, "grad_norm": 0.0326654389500618, "learning_rate": 7.3050477947341945e-06, "loss": 0.0006, "step": 37900 }, { "epoch": 6.358914209115282, "grad_norm": 0.058062855154275894, "learning_rate": 7.2882777125607915e-06, "loss": 0.0007, "step": 37950 }, { "epoch": 6.367292225201073, "grad_norm": 0.1029849499464035, "learning_rate": 7.27150763038739e-06, "loss": 0.0005, "step": 38000 }, { "epoch": 6.375670241286863, "grad_norm": 0.05858965218067169, "learning_rate": 7.254737548213987e-06, "loss": 0.0006, "step": 38050 }, { "epoch": 6.384048257372654, "grad_norm": 0.0990440845489502, "learning_rate": 7.237967466040584e-06, "loss": 0.0005, "step": 38100 }, { "epoch": 6.392426273458445, "grad_norm": 0.18513937294483185, "learning_rate": 7.221197383867181e-06, "loss": 0.0005, "step": 38150 }, { "epoch": 6.400804289544236, "grad_norm": 0.045414622873067856, "learning_rate": 7.204427301693779e-06, "loss": 0.0006, "step": 38200 }, { "epoch": 6.4091823056300266, "grad_norm": 0.14755046367645264, "learning_rate": 7.187657219520376e-06, "loss": 0.0006, "step": 38250 }, { "epoch": 6.4175603217158175, "grad_norm": 0.03956648334860802, "learning_rate": 7.170887137346973e-06, "loss": 0.0005, "step": 38300 }, { "epoch": 6.425938337801608, "grad_norm": 0.03931158035993576, "learning_rate": 7.154117055173571e-06, "loss": 0.0006, "step": 38350 }, { "epoch": 6.434316353887399, "grad_norm": 0.030736852437257767, "learning_rate": 7.137346973000169e-06, "loss": 0.0006, "step": 38400 }, { "epoch": 6.44269436997319, "grad_norm": 0.04715625196695328, "learning_rate": 7.120576890826766e-06, "loss": 0.0006, "step": 38450 }, { "epoch": 6.451072386058981, "grad_norm": 0.03468763083219528, "learning_rate": 7.103806808653363e-06, "loss": 0.0006, "step": 38500 }, { "epoch": 6.459450402144772, "grad_norm": 0.04370501637458801, "learning_rate": 7.08703672647996e-06, "loss": 0.0008, "step": 38550 }, { "epoch": 6.467828418230563, "grad_norm": 0.08410083502531052, "learning_rate": 7.070266644306559e-06, "loss": 0.0007, "step": 38600 }, { "epoch": 6.476206434316354, "grad_norm": 0.07396062463521957, "learning_rate": 7.053496562133156e-06, "loss": 0.001, "step": 38650 }, { "epoch": 6.484584450402145, "grad_norm": 0.09718171507120132, "learning_rate": 7.036726479959753e-06, "loss": 0.0006, "step": 38700 }, { "epoch": 6.492962466487936, "grad_norm": 0.20317842066287994, "learning_rate": 7.01995639778635e-06, "loss": 0.0007, "step": 38750 }, { "epoch": 6.501340482573727, "grad_norm": 0.1610729843378067, "learning_rate": 7.0031863156129475e-06, "loss": 0.0007, "step": 38800 }, { "epoch": 6.509718498659518, "grad_norm": 0.2847572863101959, "learning_rate": 6.9864162334395445e-06, "loss": 0.0006, "step": 38850 }, { "epoch": 6.5180965147453085, "grad_norm": 0.05428579822182655, "learning_rate": 6.9696461512661415e-06, "loss": 0.0007, "step": 38900 }, { "epoch": 6.526474530831099, "grad_norm": 0.03034658171236515, "learning_rate": 6.9528760690927385e-06, "loss": 0.0006, "step": 38950 }, { "epoch": 6.53485254691689, "grad_norm": 0.08986043930053711, "learning_rate": 6.936105986919337e-06, "loss": 0.0005, "step": 39000 }, { "epoch": 6.543230563002681, "grad_norm": 0.05553920567035675, "learning_rate": 6.919335904745934e-06, "loss": 0.0009, "step": 39050 }, { "epoch": 6.551608579088472, "grad_norm": 0.018183773383498192, "learning_rate": 6.902565822572531e-06, "loss": 0.0008, "step": 39100 }, { "epoch": 6.559986595174263, "grad_norm": 0.06645216047763824, "learning_rate": 6.885795740399128e-06, "loss": 0.0008, "step": 39150 }, { "epoch": 6.568364611260054, "grad_norm": 0.04686279594898224, "learning_rate": 6.869025658225726e-06, "loss": 0.0006, "step": 39200 }, { "epoch": 6.576742627345844, "grad_norm": 0.14523954689502716, "learning_rate": 6.852255576052323e-06, "loss": 0.0007, "step": 39250 }, { "epoch": 6.585120643431635, "grad_norm": 0.07832646369934082, "learning_rate": 6.83548549387892e-06, "loss": 0.0007, "step": 39300 }, { "epoch": 6.593498659517426, "grad_norm": 0.07805398851633072, "learning_rate": 6.818715411705518e-06, "loss": 0.0006, "step": 39350 }, { "epoch": 6.601876675603217, "grad_norm": 0.07783017307519913, "learning_rate": 6.801945329532115e-06, "loss": 0.0006, "step": 39400 }, { "epoch": 6.610254691689008, "grad_norm": 0.044575657695531845, "learning_rate": 6.785175247358713e-06, "loss": 0.0006, "step": 39450 }, { "epoch": 6.618632707774799, "grad_norm": 0.31225234270095825, "learning_rate": 6.76840516518531e-06, "loss": 0.0006, "step": 39500 }, { "epoch": 6.6270107238605895, "grad_norm": 0.08033174276351929, "learning_rate": 6.751635083011907e-06, "loss": 0.0008, "step": 39550 }, { "epoch": 6.6353887399463805, "grad_norm": 0.06084591895341873, "learning_rate": 6.734865000838504e-06, "loss": 0.0007, "step": 39600 }, { "epoch": 6.643766756032171, "grad_norm": 0.05018865689635277, "learning_rate": 6.718094918665103e-06, "loss": 0.0008, "step": 39650 }, { "epoch": 6.652144772117962, "grad_norm": 0.05032634735107422, "learning_rate": 6.7013248364917e-06, "loss": 0.0006, "step": 39700 }, { "epoch": 6.660522788203753, "grad_norm": 0.08206313848495483, "learning_rate": 6.684554754318297e-06, "loss": 0.0009, "step": 39750 }, { "epoch": 6.668900804289544, "grad_norm": 0.1278487741947174, "learning_rate": 6.667784672144894e-06, "loss": 0.0006, "step": 39800 }, { "epoch": 6.677278820375335, "grad_norm": 0.05226191505789757, "learning_rate": 6.6510145899714915e-06, "loss": 0.0008, "step": 39850 }, { "epoch": 6.685656836461126, "grad_norm": 0.04356776922941208, "learning_rate": 6.6342445077980886e-06, "loss": 0.0006, "step": 39900 }, { "epoch": 6.694034852546917, "grad_norm": 0.06333254277706146, "learning_rate": 6.6174744256246856e-06, "loss": 0.0007, "step": 39950 }, { "epoch": 6.702412868632708, "grad_norm": 0.08492754399776459, "learning_rate": 6.600704343451283e-06, "loss": 0.0007, "step": 40000 }, { "epoch": 6.702412868632708, "eval_loss": 0.061279989778995514, "eval_runtime": 0.3146, "eval_samples_per_second": 63.58, "eval_steps_per_second": 3.179, "step": 40000 }, { "epoch": 6.710790884718499, "grad_norm": 0.3515622317790985, "learning_rate": 6.583934261277881e-06, "loss": 0.0009, "step": 40050 }, { "epoch": 6.71916890080429, "grad_norm": 0.10627135634422302, "learning_rate": 6.567164179104478e-06, "loss": 0.0007, "step": 40100 }, { "epoch": 6.7275469168900806, "grad_norm": 0.17090724408626556, "learning_rate": 6.550394096931075e-06, "loss": 0.0008, "step": 40150 }, { "epoch": 6.7359249329758715, "grad_norm": 0.03962019085884094, "learning_rate": 6.533624014757672e-06, "loss": 0.0005, "step": 40200 }, { "epoch": 6.744302949061662, "grad_norm": 0.5243228077888489, "learning_rate": 6.51685393258427e-06, "loss": 0.0007, "step": 40250 }, { "epoch": 6.752680965147453, "grad_norm": 0.04279276728630066, "learning_rate": 6.500083850410868e-06, "loss": 0.0007, "step": 40300 }, { "epoch": 6.761058981233244, "grad_norm": 0.05610975995659828, "learning_rate": 6.483313768237465e-06, "loss": 0.0006, "step": 40350 }, { "epoch": 6.769436997319035, "grad_norm": 0.05965403839945793, "learning_rate": 6.466543686064062e-06, "loss": 0.0007, "step": 40400 }, { "epoch": 6.777815013404826, "grad_norm": 0.1390516757965088, "learning_rate": 6.44977360389066e-06, "loss": 0.0008, "step": 40450 }, { "epoch": 6.786193029490617, "grad_norm": 0.06495050340890884, "learning_rate": 6.433003521717257e-06, "loss": 0.0007, "step": 40500 }, { "epoch": 6.794571045576408, "grad_norm": 0.031072689220309258, "learning_rate": 6.416233439543854e-06, "loss": 0.0007, "step": 40550 }, { "epoch": 6.802949061662199, "grad_norm": 0.07859810441732407, "learning_rate": 6.399463357370451e-06, "loss": 0.0006, "step": 40600 }, { "epoch": 6.81132707774799, "grad_norm": 0.032027170062065125, "learning_rate": 6.38269327519705e-06, "loss": 0.0006, "step": 40650 }, { "epoch": 6.819705093833781, "grad_norm": 0.04607260972261429, "learning_rate": 6.365923193023647e-06, "loss": 0.0006, "step": 40700 }, { "epoch": 6.828083109919571, "grad_norm": 0.3754972815513611, "learning_rate": 6.349153110850244e-06, "loss": 0.0008, "step": 40750 }, { "epoch": 6.836461126005362, "grad_norm": 0.2285059541463852, "learning_rate": 6.332383028676841e-06, "loss": 0.0008, "step": 40800 }, { "epoch": 6.8448391420911525, "grad_norm": 0.0579165481030941, "learning_rate": 6.3156129465034386e-06, "loss": 0.0007, "step": 40850 }, { "epoch": 6.853217158176943, "grad_norm": 0.045123569667339325, "learning_rate": 6.298842864330036e-06, "loss": 0.0009, "step": 40900 }, { "epoch": 6.861595174262734, "grad_norm": 0.05418705940246582, "learning_rate": 6.282072782156633e-06, "loss": 0.0007, "step": 40950 }, { "epoch": 6.869973190348525, "grad_norm": 0.09083729982376099, "learning_rate": 6.2653026999832305e-06, "loss": 0.0007, "step": 41000 }, { "epoch": 6.878351206434316, "grad_norm": 0.04620116204023361, "learning_rate": 6.248532617809828e-06, "loss": 0.0008, "step": 41050 }, { "epoch": 6.886729222520107, "grad_norm": 0.18256771564483643, "learning_rate": 6.231762535636425e-06, "loss": 0.0006, "step": 41100 }, { "epoch": 6.895107238605898, "grad_norm": 0.09917750209569931, "learning_rate": 6.214992453463022e-06, "loss": 0.0006, "step": 41150 }, { "epoch": 6.903485254691689, "grad_norm": 0.07544329017400742, "learning_rate": 6.198222371289619e-06, "loss": 0.0008, "step": 41200 }, { "epoch": 6.91186327077748, "grad_norm": 0.23120667040348053, "learning_rate": 6.181452289116217e-06, "loss": 0.0006, "step": 41250 }, { "epoch": 6.920241286863271, "grad_norm": 0.030208513140678406, "learning_rate": 6.164682206942815e-06, "loss": 0.0009, "step": 41300 }, { "epoch": 6.928619302949062, "grad_norm": 0.06649070233106613, "learning_rate": 6.147912124769412e-06, "loss": 0.0007, "step": 41350 }, { "epoch": 6.936997319034853, "grad_norm": 0.0900665819644928, "learning_rate": 6.131142042596009e-06, "loss": 0.0006, "step": 41400 }, { "epoch": 6.9453753351206435, "grad_norm": 0.046752411872148514, "learning_rate": 6.114371960422607e-06, "loss": 0.0006, "step": 41450 }, { "epoch": 6.9537533512064345, "grad_norm": 0.1204705610871315, "learning_rate": 6.097601878249204e-06, "loss": 0.0007, "step": 41500 }, { "epoch": 6.962131367292225, "grad_norm": 0.0734005719423294, "learning_rate": 6.080831796075801e-06, "loss": 0.0007, "step": 41550 }, { "epoch": 6.970509383378016, "grad_norm": 0.0475836880505085, "learning_rate": 6.064061713902398e-06, "loss": 0.0008, "step": 41600 }, { "epoch": 6.978887399463807, "grad_norm": 0.08627843111753464, "learning_rate": 6.047291631728997e-06, "loss": 0.0007, "step": 41650 }, { "epoch": 6.987265415549598, "grad_norm": 0.04399965703487396, "learning_rate": 6.030521549555594e-06, "loss": 0.0007, "step": 41700 }, { "epoch": 6.995643431635389, "grad_norm": 0.09367845952510834, "learning_rate": 6.013751467382191e-06, "loss": 0.0007, "step": 41750 }, { "epoch": 7.00402144772118, "grad_norm": 0.042545393109321594, "learning_rate": 5.996981385208788e-06, "loss": 0.0006, "step": 41800 }, { "epoch": 7.012399463806971, "grad_norm": 0.06873136013746262, "learning_rate": 5.980211303035386e-06, "loss": 0.0003, "step": 41850 }, { "epoch": 7.020777479892762, "grad_norm": 0.016868956387043, "learning_rate": 5.963441220861983e-06, "loss": 0.0003, "step": 41900 }, { "epoch": 7.029155495978553, "grad_norm": 0.040632057934999466, "learning_rate": 5.94667113868858e-06, "loss": 0.0004, "step": 41950 }, { "epoch": 7.037533512064343, "grad_norm": 0.02409088797867298, "learning_rate": 5.9299010565151775e-06, "loss": 0.0003, "step": 42000 }, { "epoch": 7.045911528150134, "grad_norm": 0.04467145353555679, "learning_rate": 5.913130974341775e-06, "loss": 0.0003, "step": 42050 }, { "epoch": 7.054289544235925, "grad_norm": 0.03071122244000435, "learning_rate": 5.896360892168372e-06, "loss": 0.0003, "step": 42100 }, { "epoch": 7.0626675603217155, "grad_norm": 0.028979197144508362, "learning_rate": 5.879590809994969e-06, "loss": 0.0003, "step": 42150 }, { "epoch": 7.071045576407506, "grad_norm": 0.01919564977288246, "learning_rate": 5.862820727821566e-06, "loss": 0.0004, "step": 42200 }, { "epoch": 7.079423592493297, "grad_norm": 0.027090469375252724, "learning_rate": 5.846050645648163e-06, "loss": 0.0003, "step": 42250 }, { "epoch": 7.087801608579088, "grad_norm": 0.02643194980919361, "learning_rate": 5.829280563474762e-06, "loss": 0.0003, "step": 42300 }, { "epoch": 7.096179624664879, "grad_norm": 0.021015044301748276, "learning_rate": 5.812510481301359e-06, "loss": 0.0003, "step": 42350 }, { "epoch": 7.10455764075067, "grad_norm": 0.030943244695663452, "learning_rate": 5.795740399127956e-06, "loss": 0.0003, "step": 42400 }, { "epoch": 7.112935656836461, "grad_norm": 0.027034178376197815, "learning_rate": 5.778970316954553e-06, "loss": 0.0004, "step": 42450 }, { "epoch": 7.121313672922252, "grad_norm": 0.022702839225530624, "learning_rate": 5.762200234781151e-06, "loss": 0.0003, "step": 42500 }, { "epoch": 7.129691689008043, "grad_norm": 0.030643748119473457, "learning_rate": 5.745430152607748e-06, "loss": 0.0004, "step": 42550 }, { "epoch": 7.138069705093834, "grad_norm": 0.07582689076662064, "learning_rate": 5.728660070434345e-06, "loss": 0.0003, "step": 42600 }, { "epoch": 7.146447721179625, "grad_norm": 0.024221094325184822, "learning_rate": 5.711889988260942e-06, "loss": 0.0003, "step": 42650 }, { "epoch": 7.154825737265416, "grad_norm": 0.04585973173379898, "learning_rate": 5.695119906087541e-06, "loss": 0.0003, "step": 42700 }, { "epoch": 7.1632037533512065, "grad_norm": 0.1931953877210617, "learning_rate": 5.678349823914138e-06, "loss": 0.0004, "step": 42750 }, { "epoch": 7.171581769436997, "grad_norm": 0.02779720537364483, "learning_rate": 5.661579741740735e-06, "loss": 0.0002, "step": 42800 }, { "epoch": 7.179959785522788, "grad_norm": 0.0313730388879776, "learning_rate": 5.644809659567332e-06, "loss": 0.0004, "step": 42850 }, { "epoch": 7.188337801608579, "grad_norm": 0.0504007451236248, "learning_rate": 5.62803957739393e-06, "loss": 0.0006, "step": 42900 }, { "epoch": 7.19671581769437, "grad_norm": 0.019938312470912933, "learning_rate": 5.6112694952205275e-06, "loss": 0.0003, "step": 42950 }, { "epoch": 7.205093833780161, "grad_norm": 0.020967137068510056, "learning_rate": 5.5944994130471245e-06, "loss": 0.0003, "step": 43000 }, { "epoch": 7.213471849865952, "grad_norm": 0.02640225552022457, "learning_rate": 5.5777293308737215e-06, "loss": 0.0003, "step": 43050 }, { "epoch": 7.221849865951743, "grad_norm": 0.02011190541088581, "learning_rate": 5.560959248700319e-06, "loss": 0.0003, "step": 43100 }, { "epoch": 7.230227882037534, "grad_norm": 0.05346396192908287, "learning_rate": 5.544189166526916e-06, "loss": 0.0003, "step": 43150 }, { "epoch": 7.238605898123325, "grad_norm": 0.038962822407484055, "learning_rate": 5.5274190843535134e-06, "loss": 0.0003, "step": 43200 }, { "epoch": 7.246983914209116, "grad_norm": 0.031206265091896057, "learning_rate": 5.5106490021801104e-06, "loss": 0.0003, "step": 43250 }, { "epoch": 7.255361930294907, "grad_norm": 0.0827702060341835, "learning_rate": 5.493878920006709e-06, "loss": 0.0004, "step": 43300 }, { "epoch": 7.263739946380697, "grad_norm": 0.023083705455064774, "learning_rate": 5.477108837833306e-06, "loss": 0.0004, "step": 43350 }, { "epoch": 7.272117962466488, "grad_norm": 0.024041956290602684, "learning_rate": 5.460338755659903e-06, "loss": 0.0003, "step": 43400 }, { "epoch": 7.2804959785522785, "grad_norm": 0.07957682758569717, "learning_rate": 5.4435686734865e-06, "loss": 0.0003, "step": 43450 }, { "epoch": 7.288873994638069, "grad_norm": 0.18736758828163147, "learning_rate": 5.426798591313098e-06, "loss": 0.0005, "step": 43500 }, { "epoch": 7.29725201072386, "grad_norm": 0.04302476719021797, "learning_rate": 5.410028509139695e-06, "loss": 0.0003, "step": 43550 }, { "epoch": 7.305630026809651, "grad_norm": 0.35610461235046387, "learning_rate": 5.393258426966292e-06, "loss": 0.0005, "step": 43600 }, { "epoch": 7.314008042895442, "grad_norm": 0.023378223180770874, "learning_rate": 5.376488344792889e-06, "loss": 0.0003, "step": 43650 }, { "epoch": 7.322386058981233, "grad_norm": 0.031664662063121796, "learning_rate": 5.359718262619488e-06, "loss": 0.0003, "step": 43700 }, { "epoch": 7.330764075067024, "grad_norm": 0.01903984695672989, "learning_rate": 5.342948180446085e-06, "loss": 0.0003, "step": 43750 }, { "epoch": 7.339142091152815, "grad_norm": 0.031017929315567017, "learning_rate": 5.326178098272682e-06, "loss": 0.0004, "step": 43800 }, { "epoch": 7.347520107238606, "grad_norm": 0.028874006122350693, "learning_rate": 5.309408016099279e-06, "loss": 0.0003, "step": 43850 }, { "epoch": 7.355898123324397, "grad_norm": 0.016122756525874138, "learning_rate": 5.292637933925877e-06, "loss": 0.0003, "step": 43900 }, { "epoch": 7.364276139410188, "grad_norm": 0.037943582981824875, "learning_rate": 5.2758678517524745e-06, "loss": 0.0003, "step": 43950 }, { "epoch": 7.372654155495979, "grad_norm": 0.02277122251689434, "learning_rate": 5.2590977695790716e-06, "loss": 0.0004, "step": 44000 }, { "epoch": 7.3810321715817695, "grad_norm": 0.049923092126846313, "learning_rate": 5.2423276874056686e-06, "loss": 0.0003, "step": 44050 }, { "epoch": 7.38941018766756, "grad_norm": 0.04292990267276764, "learning_rate": 5.2255576052322664e-06, "loss": 0.0005, "step": 44100 }, { "epoch": 7.397788203753351, "grad_norm": 0.04316338151693344, "learning_rate": 5.2087875230588634e-06, "loss": 0.0003, "step": 44150 }, { "epoch": 7.406166219839142, "grad_norm": 0.0165548212826252, "learning_rate": 5.1920174408854605e-06, "loss": 0.0003, "step": 44200 }, { "epoch": 7.414544235924933, "grad_norm": 0.041038576513528824, "learning_rate": 5.1752473587120575e-06, "loss": 0.0004, "step": 44250 }, { "epoch": 7.422922252010724, "grad_norm": 0.07028800249099731, "learning_rate": 5.158477276538656e-06, "loss": 0.0003, "step": 44300 }, { "epoch": 7.431300268096515, "grad_norm": 0.03722773492336273, "learning_rate": 5.141707194365253e-06, "loss": 0.0003, "step": 44350 }, { "epoch": 7.439678284182306, "grad_norm": 0.015645667910575867, "learning_rate": 5.12493711219185e-06, "loss": 0.0004, "step": 44400 }, { "epoch": 7.448056300268097, "grad_norm": 0.18982096016407013, "learning_rate": 5.108167030018447e-06, "loss": 0.0005, "step": 44450 }, { "epoch": 7.456434316353888, "grad_norm": 0.02524687349796295, "learning_rate": 5.091396947845045e-06, "loss": 0.0004, "step": 44500 }, { "epoch": 7.464812332439679, "grad_norm": 0.02892642468214035, "learning_rate": 5.074626865671642e-06, "loss": 0.0006, "step": 44550 }, { "epoch": 7.473190348525469, "grad_norm": 0.04107584059238434, "learning_rate": 5.057856783498239e-06, "loss": 0.0007, "step": 44600 }, { "epoch": 7.48156836461126, "grad_norm": 0.05140475928783417, "learning_rate": 5.041086701324837e-06, "loss": 0.0003, "step": 44650 }, { "epoch": 7.4899463806970505, "grad_norm": 0.054712191224098206, "learning_rate": 5.024316619151435e-06, "loss": 0.0004, "step": 44700 }, { "epoch": 7.4983243967828415, "grad_norm": 0.022500043734908104, "learning_rate": 5.007546536978032e-06, "loss": 0.0004, "step": 44750 }, { "epoch": 7.506702412868632, "grad_norm": 0.6084216833114624, "learning_rate": 4.990776454804629e-06, "loss": 0.0006, "step": 44800 }, { "epoch": 7.515080428954423, "grad_norm": 0.07291937619447708, "learning_rate": 4.974006372631227e-06, "loss": 0.0004, "step": 44850 }, { "epoch": 7.523458445040214, "grad_norm": 0.06688908487558365, "learning_rate": 4.957236290457824e-06, "loss": 0.0005, "step": 44900 }, { "epoch": 7.531836461126005, "grad_norm": 0.025029698386788368, "learning_rate": 4.940466208284422e-06, "loss": 0.0005, "step": 44950 }, { "epoch": 7.540214477211796, "grad_norm": 0.08454358577728271, "learning_rate": 4.923696126111019e-06, "loss": 0.0003, "step": 45000 }, { "epoch": 7.548592493297587, "grad_norm": 0.0553942508995533, "learning_rate": 4.906926043937616e-06, "loss": 0.0003, "step": 45050 }, { "epoch": 7.556970509383378, "grad_norm": 0.05681919679045677, "learning_rate": 4.890155961764213e-06, "loss": 0.0005, "step": 45100 }, { "epoch": 7.565348525469169, "grad_norm": 0.019724005833268166, "learning_rate": 4.8733858795908105e-06, "loss": 0.0004, "step": 45150 }, { "epoch": 7.57372654155496, "grad_norm": 0.027729319408535957, "learning_rate": 4.8566157974174075e-06, "loss": 0.0003, "step": 45200 }, { "epoch": 7.582104557640751, "grad_norm": 0.05744357779622078, "learning_rate": 4.8398457152440045e-06, "loss": 0.0004, "step": 45250 }, { "epoch": 7.590482573726542, "grad_norm": 0.061757415533065796, "learning_rate": 4.823075633070602e-06, "loss": 0.0003, "step": 45300 }, { "epoch": 7.5988605898123325, "grad_norm": 0.04031449928879738, "learning_rate": 4.806305550897199e-06, "loss": 0.0005, "step": 45350 }, { "epoch": 7.607238605898123, "grad_norm": 0.10854317247867584, "learning_rate": 4.789535468723797e-06, "loss": 0.0004, "step": 45400 }, { "epoch": 7.615616621983914, "grad_norm": 0.01701934076845646, "learning_rate": 4.772765386550394e-06, "loss": 0.0003, "step": 45450 }, { "epoch": 7.623994638069705, "grad_norm": 0.04535774141550064, "learning_rate": 4.755995304376992e-06, "loss": 0.0003, "step": 45500 }, { "epoch": 7.632372654155496, "grad_norm": 0.03864599019289017, "learning_rate": 4.739225222203589e-06, "loss": 0.0005, "step": 45550 }, { "epoch": 7.640750670241287, "grad_norm": 0.03809565305709839, "learning_rate": 4.722455140030187e-06, "loss": 0.0004, "step": 45600 }, { "epoch": 7.649128686327078, "grad_norm": 0.043824635446071625, "learning_rate": 4.705685057856784e-06, "loss": 0.0004, "step": 45650 }, { "epoch": 7.657506702412869, "grad_norm": 0.02916356548666954, "learning_rate": 4.688914975683381e-06, "loss": 0.0004, "step": 45700 }, { "epoch": 7.66588471849866, "grad_norm": 0.1915924996137619, "learning_rate": 4.672144893509978e-06, "loss": 0.0004, "step": 45750 }, { "epoch": 7.674262734584451, "grad_norm": 0.07420273870229721, "learning_rate": 4.655374811336576e-06, "loss": 0.0005, "step": 45800 }, { "epoch": 7.682640750670242, "grad_norm": 0.026272999122738838, "learning_rate": 4.638604729163173e-06, "loss": 0.0004, "step": 45850 }, { "epoch": 7.691018766756033, "grad_norm": 0.3181280493736267, "learning_rate": 4.621834646989771e-06, "loss": 0.0004, "step": 45900 }, { "epoch": 7.6993967828418235, "grad_norm": 0.035441432148218155, "learning_rate": 4.605064564816368e-06, "loss": 0.0004, "step": 45950 }, { "epoch": 7.707774798927614, "grad_norm": 0.01749766804277897, "learning_rate": 4.588294482642966e-06, "loss": 0.0004, "step": 46000 }, { "epoch": 7.716152815013404, "grad_norm": 0.07927963137626648, "learning_rate": 4.571524400469563e-06, "loss": 0.0004, "step": 46050 }, { "epoch": 7.724530831099195, "grad_norm": 0.03702886402606964, "learning_rate": 4.5547543182961605e-06, "loss": 0.0004, "step": 46100 }, { "epoch": 7.732908847184986, "grad_norm": 0.04682036116719246, "learning_rate": 4.5379842361227575e-06, "loss": 0.0003, "step": 46150 }, { "epoch": 7.741286863270777, "grad_norm": 0.013787736184895039, "learning_rate": 4.5212141539493545e-06, "loss": 0.0004, "step": 46200 }, { "epoch": 7.749664879356568, "grad_norm": 0.029385261237621307, "learning_rate": 4.5044440717759515e-06, "loss": 0.0005, "step": 46250 }, { "epoch": 7.758042895442359, "grad_norm": 0.018814504146575928, "learning_rate": 4.487673989602549e-06, "loss": 0.0004, "step": 46300 }, { "epoch": 7.76642091152815, "grad_norm": 0.017043303698301315, "learning_rate": 4.470903907429146e-06, "loss": 0.0004, "step": 46350 }, { "epoch": 7.774798927613941, "grad_norm": 0.12016864866018295, "learning_rate": 4.454133825255744e-06, "loss": 0.0004, "step": 46400 }, { "epoch": 7.783176943699732, "grad_norm": 0.051153432577848434, "learning_rate": 4.437363743082341e-06, "loss": 0.0003, "step": 46450 }, { "epoch": 7.791554959785523, "grad_norm": 0.030375001952052116, "learning_rate": 4.420593660908939e-06, "loss": 0.0004, "step": 46500 }, { "epoch": 7.799932975871314, "grad_norm": 0.060530513525009155, "learning_rate": 4.403823578735536e-06, "loss": 0.0004, "step": 46550 }, { "epoch": 7.8083109919571045, "grad_norm": 0.034854013472795486, "learning_rate": 4.387053496562134e-06, "loss": 0.0004, "step": 46600 }, { "epoch": 7.8166890080428955, "grad_norm": 0.03328954800963402, "learning_rate": 4.370283414388731e-06, "loss": 0.0004, "step": 46650 }, { "epoch": 7.825067024128686, "grad_norm": 0.04314300790429115, "learning_rate": 4.353513332215328e-06, "loss": 0.0004, "step": 46700 }, { "epoch": 7.833445040214477, "grad_norm": 0.09990786761045456, "learning_rate": 4.336743250041925e-06, "loss": 0.0004, "step": 46750 }, { "epoch": 7.841823056300268, "grad_norm": 0.035922013223171234, "learning_rate": 4.319973167868523e-06, "loss": 0.0003, "step": 46800 }, { "epoch": 7.850201072386059, "grad_norm": 0.05447731912136078, "learning_rate": 4.30320308569512e-06, "loss": 0.0004, "step": 46850 }, { "epoch": 7.85857908847185, "grad_norm": 0.01950427144765854, "learning_rate": 4.286433003521718e-06, "loss": 0.0004, "step": 46900 }, { "epoch": 7.866957104557641, "grad_norm": 0.04952532425522804, "learning_rate": 4.269662921348315e-06, "loss": 0.0004, "step": 46950 }, { "epoch": 7.875335120643432, "grad_norm": 0.04123789444565773, "learning_rate": 4.252892839174913e-06, "loss": 0.0004, "step": 47000 }, { "epoch": 7.883713136729223, "grad_norm": 0.0161293838173151, "learning_rate": 4.23612275700151e-06, "loss": 0.0003, "step": 47050 }, { "epoch": 7.892091152815014, "grad_norm": 0.039569880813360214, "learning_rate": 4.2193526748281075e-06, "loss": 0.0004, "step": 47100 }, { "epoch": 7.900469168900805, "grad_norm": 0.209671750664711, "learning_rate": 4.2025825926547045e-06, "loss": 0.0004, "step": 47150 }, { "epoch": 7.908847184986596, "grad_norm": 0.049620840698480606, "learning_rate": 4.1858125104813016e-06, "loss": 0.0005, "step": 47200 }, { "epoch": 7.917225201072386, "grad_norm": 0.03689347580075264, "learning_rate": 4.1690424283078986e-06, "loss": 0.0004, "step": 47250 }, { "epoch": 7.9256032171581765, "grad_norm": 0.05554811283946037, "learning_rate": 4.1522723461344964e-06, "loss": 0.0004, "step": 47300 }, { "epoch": 7.933981233243967, "grad_norm": 0.040197305381298065, "learning_rate": 4.1355022639610934e-06, "loss": 0.0003, "step": 47350 }, { "epoch": 7.942359249329758, "grad_norm": 0.01716030202805996, "learning_rate": 4.118732181787691e-06, "loss": 0.0003, "step": 47400 }, { "epoch": 7.950737265415549, "grad_norm": 0.0522179938852787, "learning_rate": 4.101962099614288e-06, "loss": 0.0003, "step": 47450 }, { "epoch": 7.95911528150134, "grad_norm": 0.3440731167793274, "learning_rate": 4.085192017440886e-06, "loss": 0.0004, "step": 47500 }, { "epoch": 7.967493297587131, "grad_norm": 0.03338254243135452, "learning_rate": 4.068421935267483e-06, "loss": 0.0004, "step": 47550 }, { "epoch": 7.975871313672922, "grad_norm": 0.061764348298311234, "learning_rate": 4.051651853094081e-06, "loss": 0.0004, "step": 47600 }, { "epoch": 7.984249329758713, "grad_norm": 0.046575699001550674, "learning_rate": 4.034881770920678e-06, "loss": 0.0004, "step": 47650 }, { "epoch": 7.992627345844504, "grad_norm": 0.062236297875642776, "learning_rate": 4.018111688747275e-06, "loss": 0.0004, "step": 47700 }, { "epoch": 8.001005361930295, "grad_norm": 0.028931235894560814, "learning_rate": 4.001341606573872e-06, "loss": 0.0003, "step": 47750 }, { "epoch": 8.009383378016086, "grad_norm": 0.01811792142689228, "learning_rate": 3.98457152440047e-06, "loss": 0.0002, "step": 47800 }, { "epoch": 8.017761394101877, "grad_norm": 0.01822470873594284, "learning_rate": 3.967801442227067e-06, "loss": 0.0002, "step": 47850 }, { "epoch": 8.026139410187668, "grad_norm": 0.04903008043766022, "learning_rate": 3.951031360053665e-06, "loss": 0.0002, "step": 47900 }, { "epoch": 8.034517426273458, "grad_norm": 0.022891085594892502, "learning_rate": 3.934261277880262e-06, "loss": 0.0002, "step": 47950 }, { "epoch": 8.04289544235925, "grad_norm": 0.015373194590210915, "learning_rate": 3.91749119570686e-06, "loss": 0.0002, "step": 48000 }, { "epoch": 8.05127345844504, "grad_norm": 0.01921216771006584, "learning_rate": 3.900721113533457e-06, "loss": 0.0002, "step": 48050 }, { "epoch": 8.059651474530831, "grad_norm": 0.01771024614572525, "learning_rate": 3.8839510313600546e-06, "loss": 0.0002, "step": 48100 }, { "epoch": 8.068029490616622, "grad_norm": 0.012290588580071926, "learning_rate": 3.867180949186652e-06, "loss": 0.0002, "step": 48150 }, { "epoch": 8.076407506702413, "grad_norm": 0.013452921062707901, "learning_rate": 3.850410867013249e-06, "loss": 0.0002, "step": 48200 }, { "epoch": 8.084785522788204, "grad_norm": 0.024924032390117645, "learning_rate": 3.833640784839846e-06, "loss": 0.0002, "step": 48250 }, { "epoch": 8.093163538873995, "grad_norm": 0.010622446425259113, "learning_rate": 3.8168707026664435e-06, "loss": 0.0002, "step": 48300 }, { "epoch": 8.101541554959786, "grad_norm": 0.02651936188340187, "learning_rate": 3.800100620493041e-06, "loss": 0.0003, "step": 48350 }, { "epoch": 8.109919571045577, "grad_norm": 0.011863762512803078, "learning_rate": 3.783330538319638e-06, "loss": 0.0002, "step": 48400 }, { "epoch": 8.118297587131368, "grad_norm": 0.02366674318909645, "learning_rate": 3.7665604561462354e-06, "loss": 0.0002, "step": 48450 }, { "epoch": 8.126675603217159, "grad_norm": 0.026062361896038055, "learning_rate": 3.7497903739728324e-06, "loss": 0.0002, "step": 48500 }, { "epoch": 8.13505361930295, "grad_norm": 0.016329048201441765, "learning_rate": 3.7330202917994302e-06, "loss": 0.0002, "step": 48550 }, { "epoch": 8.14343163538874, "grad_norm": 0.09319298714399338, "learning_rate": 3.7162502096260272e-06, "loss": 0.0002, "step": 48600 }, { "epoch": 8.151809651474531, "grad_norm": 0.014463214203715324, "learning_rate": 3.6994801274526247e-06, "loss": 0.0002, "step": 48650 }, { "epoch": 8.160187667560322, "grad_norm": 0.030192028731107712, "learning_rate": 3.682710045279222e-06, "loss": 0.0004, "step": 48700 }, { "epoch": 8.168565683646113, "grad_norm": 0.014410781674087048, "learning_rate": 3.6659399631058195e-06, "loss": 0.0002, "step": 48750 }, { "epoch": 8.176943699731904, "grad_norm": 0.028254050761461258, "learning_rate": 3.6491698809324166e-06, "loss": 0.0002, "step": 48800 }, { "epoch": 8.185321715817695, "grad_norm": 0.022153843194246292, "learning_rate": 3.6323997987590144e-06, "loss": 0.0002, "step": 48850 }, { "epoch": 8.193699731903486, "grad_norm": 0.01866259053349495, "learning_rate": 3.6156297165856114e-06, "loss": 0.0002, "step": 48900 }, { "epoch": 8.202077747989277, "grad_norm": 0.014017355628311634, "learning_rate": 3.598859634412209e-06, "loss": 0.0002, "step": 48950 }, { "epoch": 8.210455764075068, "grad_norm": 0.017173465341329575, "learning_rate": 3.582089552238806e-06, "loss": 0.0002, "step": 49000 }, { "epoch": 8.218833780160859, "grad_norm": 0.20775650441646576, "learning_rate": 3.5653194700654037e-06, "loss": 0.0002, "step": 49050 }, { "epoch": 8.22721179624665, "grad_norm": 0.014167393557727337, "learning_rate": 3.5485493878920008e-06, "loss": 0.0002, "step": 49100 }, { "epoch": 8.23558981233244, "grad_norm": 0.02049107290804386, "learning_rate": 3.531779305718598e-06, "loss": 0.0002, "step": 49150 }, { "epoch": 8.243967828418231, "grad_norm": 0.026173189282417297, "learning_rate": 3.5150092235451956e-06, "loss": 0.0002, "step": 49200 }, { "epoch": 8.25234584450402, "grad_norm": 0.02050282247364521, "learning_rate": 3.498239141371793e-06, "loss": 0.0002, "step": 49250 }, { "epoch": 8.260723860589813, "grad_norm": 0.015291319228708744, "learning_rate": 3.48146905919839e-06, "loss": 0.0002, "step": 49300 }, { "epoch": 8.269101876675602, "grad_norm": 0.026897624135017395, "learning_rate": 3.464698977024988e-06, "loss": 0.0002, "step": 49350 }, { "epoch": 8.277479892761393, "grad_norm": 0.016716543585062027, "learning_rate": 3.447928894851585e-06, "loss": 0.0002, "step": 49400 }, { "epoch": 8.285857908847184, "grad_norm": 0.024870146065950394, "learning_rate": 3.4311588126781824e-06, "loss": 0.0002, "step": 49450 }, { "epoch": 8.294235924932975, "grad_norm": 0.017461460083723068, "learning_rate": 3.4143887305047794e-06, "loss": 0.0002, "step": 49500 }, { "epoch": 8.302613941018766, "grad_norm": 0.014370834454894066, "learning_rate": 3.3976186483313773e-06, "loss": 0.0002, "step": 49550 }, { "epoch": 8.310991957104557, "grad_norm": 0.026379108428955078, "learning_rate": 3.3808485661579743e-06, "loss": 0.0003, "step": 49600 }, { "epoch": 8.319369973190348, "grad_norm": 0.012285185977816582, "learning_rate": 3.364078483984572e-06, "loss": 0.0002, "step": 49650 }, { "epoch": 8.327747989276139, "grad_norm": 0.02005821093916893, "learning_rate": 3.347308401811169e-06, "loss": 0.0002, "step": 49700 }, { "epoch": 8.33612600536193, "grad_norm": 0.016585618257522583, "learning_rate": 3.3305383196377666e-06, "loss": 0.0003, "step": 49750 }, { "epoch": 8.34450402144772, "grad_norm": 0.023517385125160217, "learning_rate": 3.3137682374643636e-06, "loss": 0.0002, "step": 49800 }, { "epoch": 8.352882037533512, "grad_norm": 0.013065959326922894, "learning_rate": 3.2969981552909615e-06, "loss": 0.0002, "step": 49850 }, { "epoch": 8.361260053619302, "grad_norm": 0.028112584725022316, "learning_rate": 3.2802280731175585e-06, "loss": 0.0002, "step": 49900 }, { "epoch": 8.369638069705093, "grad_norm": 0.011142577044665813, "learning_rate": 3.263457990944156e-06, "loss": 0.0002, "step": 49950 }, { "epoch": 8.378016085790884, "grad_norm": 0.02244596742093563, "learning_rate": 3.246687908770753e-06, "loss": 0.0002, "step": 50000 }, { "epoch": 8.378016085790884, "eval_loss": 0.050992656499147415, "eval_runtime": 0.3133, "eval_samples_per_second": 63.832, "eval_steps_per_second": 3.192, "step": 50000 } ], "logging_steps": 50, "max_steps": 59680, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0705940805709087e+20, "train_batch_size": 64, "trial_name": null, "trial_params": null }