{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 300, "global_step": 49697, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002012224262394044, "grad_norm": 1.5522648096084595, "learning_rate": 1.9919517102615694e-06, "loss": 1.2489, "step": 100 }, { "epoch": 0.004024448524788088, "grad_norm": 1.59534752368927, "learning_rate": 4.0040241448692155e-06, "loss": 1.1249, "step": 200 }, { "epoch": 0.006036672787182132, "grad_norm": 1.5959556102752686, "learning_rate": 6.016096579476862e-06, "loss": 1.0698, "step": 300 }, { "epoch": 0.006036672787182132, "eval_loss": 0.9532507061958313, "eval_runtime": 11.1234, "eval_samples_per_second": 33.982, "eval_steps_per_second": 1.169, "step": 300 }, { "epoch": 0.008048897049576176, "grad_norm": 1.6886017322540283, "learning_rate": 8.028169014084509e-06, "loss": 1.0365, "step": 400 }, { "epoch": 0.01006112131197022, "grad_norm": 1.421373963356018, "learning_rate": 1.0040241448692154e-05, "loss": 1.0155, "step": 500 }, { "epoch": 0.012073345574364264, "grad_norm": 1.578765630722046, "learning_rate": 1.20523138832998e-05, "loss": 1.006, "step": 600 }, { "epoch": 0.012073345574364264, "eval_loss": 0.8175720572471619, "eval_runtime": 11.5611, "eval_samples_per_second": 32.696, "eval_steps_per_second": 1.124, "step": 600 }, { "epoch": 0.014085569836758306, "grad_norm": 1.1900346279144287, "learning_rate": 1.4064386317907446e-05, "loss": 0.9972, "step": 700 }, { "epoch": 0.01609779409915235, "grad_norm": 1.4590531587600708, "learning_rate": 1.607645875251509e-05, "loss": 0.9895, "step": 800 }, { "epoch": 0.018110018361546396, "grad_norm": 1.8518555164337158, "learning_rate": 1.8088531187122737e-05, "loss": 0.9718, "step": 900 }, { "epoch": 0.018110018361546396, "eval_loss": 0.781577467918396, "eval_runtime": 11.4278, "eval_samples_per_second": 33.077, "eval_steps_per_second": 1.138, "step": 900 }, { "epoch": 0.02012224262394044, "grad_norm": 1.351710319519043, "learning_rate": 1.999999947988626e-05, "loss": 0.972, "step": 1000 }, { "epoch": 0.022134466886334483, "grad_norm": 1.2841336727142334, "learning_rate": 1.9999770630715236e-05, "loss": 0.9662, "step": 1100 }, { "epoch": 0.024146691148728527, "grad_norm": 1.2296431064605713, "learning_rate": 1.9999125701534677e-05, "loss": 0.9578, "step": 1200 }, { "epoch": 0.024146691148728527, "eval_loss": 0.9337042570114136, "eval_runtime": 11.324, "eval_samples_per_second": 33.38, "eval_steps_per_second": 1.148, "step": 1200 }, { "epoch": 0.026158915411122568, "grad_norm": 1.3778767585754395, "learning_rate": 1.9998064719179408e-05, "loss": 0.9614, "step": 1300 }, { "epoch": 0.028171139673516612, "grad_norm": 1.3921650648117065, "learning_rate": 1.9996587727795803e-05, "loss": 0.9541, "step": 1400 }, { "epoch": 0.030183363935910656, "grad_norm": 1.3527588844299316, "learning_rate": 1.9994694788839924e-05, "loss": 0.9488, "step": 1500 }, { "epoch": 0.030183363935910656, "eval_loss": 0.7029635310173035, "eval_runtime": 11.506, "eval_samples_per_second": 32.853, "eval_steps_per_second": 1.13, "step": 1500 }, { "epoch": 0.0321955881983047, "grad_norm": 0.8907983303070068, "learning_rate": 1.9992385981074994e-05, "loss": 0.9418, "step": 1600 }, { "epoch": 0.03420781246069875, "grad_norm": 1.136816382408142, "learning_rate": 1.998966140056808e-05, "loss": 0.946, "step": 1700 }, { "epoch": 0.03622003672309279, "grad_norm": 1.0668370723724365, "learning_rate": 1.9986521160686134e-05, "loss": 0.9357, "step": 1800 }, { "epoch": 0.03622003672309279, "eval_loss": 0.68252032995224, "eval_runtime": 11.3032, "eval_samples_per_second": 33.442, "eval_steps_per_second": 1.15, "step": 1800 }, { "epoch": 0.038232260985486835, "grad_norm": 0.8517168760299683, "learning_rate": 1.9982965392091262e-05, "loss": 0.936, "step": 1900 }, { "epoch": 0.04024448524788088, "grad_norm": 1.0746815204620361, "learning_rate": 1.9978994242735275e-05, "loss": 0.9384, "step": 2000 }, { "epoch": 0.04225670951027492, "grad_norm": 1.0119695663452148, "learning_rate": 1.9974607877853555e-05, "loss": 0.9252, "step": 2100 }, { "epoch": 0.04225670951027492, "eval_loss": 0.672024130821228, "eval_runtime": 11.3298, "eval_samples_per_second": 33.363, "eval_steps_per_second": 1.147, "step": 2100 }, { "epoch": 0.04426893377266897, "grad_norm": 0.7535356283187866, "learning_rate": 1.9969806479958154e-05, "loss": 0.9215, "step": 2200 }, { "epoch": 0.04628115803506301, "grad_norm": 0.837115466594696, "learning_rate": 1.996459024883023e-05, "loss": 0.9229, "step": 2300 }, { "epoch": 0.048293382297457055, "grad_norm": 0.9772033095359802, "learning_rate": 1.995895940151171e-05, "loss": 0.9155, "step": 2400 }, { "epoch": 0.048293382297457055, "eval_loss": 0.6609585285186768, "eval_runtime": 11.8546, "eval_samples_per_second": 31.886, "eval_steps_per_second": 1.097, "step": 2400 }, { "epoch": 0.0503056065598511, "grad_norm": 0.9059876799583435, "learning_rate": 1.9952914172296264e-05, "loss": 0.9104, "step": 2500 }, { "epoch": 0.052317830822245136, "grad_norm": 1.090819239616394, "learning_rate": 1.9946454812719572e-05, "loss": 0.9056, "step": 2600 }, { "epoch": 0.05433005508463918, "grad_norm": 0.8924378156661987, "learning_rate": 1.9939581591548833e-05, "loss": 0.9102, "step": 2700 }, { "epoch": 0.05433005508463918, "eval_loss": 0.6568426489830017, "eval_runtime": 11.3424, "eval_samples_per_second": 33.326, "eval_steps_per_second": 1.146, "step": 2700 }, { "epoch": 0.056342279347033224, "grad_norm": 0.9142224788665771, "learning_rate": 1.9932294794771596e-05, "loss": 0.9101, "step": 2800 }, { "epoch": 0.05835450360942727, "grad_norm": 1.060359239578247, "learning_rate": 1.992459472558387e-05, "loss": 0.9013, "step": 2900 }, { "epoch": 0.06036672787182131, "grad_norm": 0.7167413234710693, "learning_rate": 1.9916481704377487e-05, "loss": 0.9002, "step": 3000 }, { "epoch": 0.06036672787182131, "eval_loss": 0.6527668237686157, "eval_runtime": 11.4, "eval_samples_per_second": 33.158, "eval_steps_per_second": 1.14, "step": 3000 }, { "epoch": 0.062378952134215356, "grad_norm": 0.783549427986145, "learning_rate": 1.9907956068726782e-05, "loss": 0.897, "step": 3100 }, { "epoch": 0.0643911763966094, "grad_norm": 0.9683724045753479, "learning_rate": 1.9899018173374552e-05, "loss": 0.9294, "step": 3200 }, { "epoch": 0.06640340065900345, "grad_norm": 1.1547231674194336, "learning_rate": 1.9889668390217284e-05, "loss": 0.901, "step": 3300 }, { "epoch": 0.06640340065900345, "eval_loss": 0.6419159173965454, "eval_runtime": 11.429, "eval_samples_per_second": 33.074, "eval_steps_per_second": 1.137, "step": 3300 }, { "epoch": 0.0684156249213975, "grad_norm": 0.81548011302948, "learning_rate": 1.9879907108289684e-05, "loss": 0.9008, "step": 3400 }, { "epoch": 0.07042784918379154, "grad_norm": 0.7857891321182251, "learning_rate": 1.98697347337485e-05, "loss": 0.8928, "step": 3500 }, { "epoch": 0.07244007344618558, "grad_norm": 0.8332715630531311, "learning_rate": 1.985915168985561e-05, "loss": 0.8889, "step": 3600 }, { "epoch": 0.07244007344618558, "eval_loss": 0.6356409192085266, "eval_runtime": 11.2917, "eval_samples_per_second": 33.476, "eval_steps_per_second": 1.151, "step": 3600 }, { "epoch": 0.07445229770857963, "grad_norm": 0.9201735258102417, "learning_rate": 1.9848158416960414e-05, "loss": 0.8869, "step": 3700 }, { "epoch": 0.07646452197097367, "grad_norm": 0.7852803468704224, "learning_rate": 1.9836755372481512e-05, "loss": 0.8973, "step": 3800 }, { "epoch": 0.07847674623336771, "grad_norm": 0.7758309841156006, "learning_rate": 1.982494303088767e-05, "loss": 0.8925, "step": 3900 }, { "epoch": 0.07847674623336771, "eval_loss": 0.6345422863960266, "eval_runtime": 11.3533, "eval_samples_per_second": 33.294, "eval_steps_per_second": 1.145, "step": 3900 }, { "epoch": 0.08048897049576176, "grad_norm": 0.9436432123184204, "learning_rate": 1.981272188367809e-05, "loss": 0.8847, "step": 4000 }, { "epoch": 0.0825011947581558, "grad_norm": 0.8394960165023804, "learning_rate": 1.980009243936193e-05, "loss": 0.8923, "step": 4100 }, { "epoch": 0.08451341902054985, "grad_norm": 0.8079524636268616, "learning_rate": 1.9787055223437184e-05, "loss": 0.8828, "step": 4200 }, { "epoch": 0.08451341902054985, "eval_loss": 0.6277508735656738, "eval_runtime": 11.2988, "eval_samples_per_second": 33.455, "eval_steps_per_second": 1.151, "step": 4200 }, { "epoch": 0.08652564328294389, "grad_norm": 0.8562188744544983, "learning_rate": 1.977361077836878e-05, "loss": 0.8801, "step": 4300 }, { "epoch": 0.08853786754533793, "grad_norm": 0.9642734527587891, "learning_rate": 1.9759759663566032e-05, "loss": 0.896, "step": 4400 }, { "epoch": 0.09055009180773198, "grad_norm": 0.8723398447036743, "learning_rate": 1.9745502455359367e-05, "loss": 0.8879, "step": 4500 }, { "epoch": 0.09055009180773198, "eval_loss": 0.6282201409339905, "eval_runtime": 11.4757, "eval_samples_per_second": 32.939, "eval_steps_per_second": 1.133, "step": 4500 }, { "epoch": 0.09256231607012602, "grad_norm": 0.8613621592521667, "learning_rate": 1.9730839746976314e-05, "loss": 0.8854, "step": 4600 }, { "epoch": 0.09457454033252007, "grad_norm": 0.7336219549179077, "learning_rate": 1.9715772148516855e-05, "loss": 0.8806, "step": 4700 }, { "epoch": 0.09658676459491411, "grad_norm": 0.7842460870742798, "learning_rate": 1.970030028692802e-05, "loss": 0.8798, "step": 4800 }, { "epoch": 0.09658676459491411, "eval_loss": 0.6203732490539551, "eval_runtime": 11.2931, "eval_samples_per_second": 33.472, "eval_steps_per_second": 1.151, "step": 4800 }, { "epoch": 0.09859898885730815, "grad_norm": 1.042386770248413, "learning_rate": 1.968442480597781e-05, "loss": 0.8786, "step": 4900 }, { "epoch": 0.1006112131197022, "grad_norm": 0.8358279466629028, "learning_rate": 1.9668146366228398e-05, "loss": 0.8834, "step": 5000 }, { "epoch": 0.10262343738209624, "grad_norm": 0.9129268527030945, "learning_rate": 1.965146564500866e-05, "loss": 0.8763, "step": 5100 }, { "epoch": 0.10262343738209624, "eval_loss": 0.6140510439872742, "eval_runtime": 11.3122, "eval_samples_per_second": 33.415, "eval_steps_per_second": 1.149, "step": 5100 }, { "epoch": 0.10463566164449027, "grad_norm": 0.9329330325126648, "learning_rate": 1.963438333638598e-05, "loss": 0.8724, "step": 5200 }, { "epoch": 0.10664788590688432, "grad_norm": 0.9156613349914551, "learning_rate": 1.9616900151137375e-05, "loss": 0.8798, "step": 5300 }, { "epoch": 0.10866011016927836, "grad_norm": 1.0988123416900635, "learning_rate": 1.9599016816719912e-05, "loss": 0.8864, "step": 5400 }, { "epoch": 0.10866011016927836, "eval_loss": 0.613735556602478, "eval_runtime": 11.5595, "eval_samples_per_second": 32.7, "eval_steps_per_second": 1.125, "step": 5400 }, { "epoch": 0.1106723344316724, "grad_norm": 0.9962302446365356, "learning_rate": 1.9580734077240467e-05, "loss": 0.879, "step": 5500 }, { "epoch": 0.11268455869406645, "grad_norm": 0.6542097926139832, "learning_rate": 1.9562052693424724e-05, "loss": 0.8754, "step": 5600 }, { "epoch": 0.11469678295646049, "grad_norm": 0.8420646786689758, "learning_rate": 1.9542973442585542e-05, "loss": 0.8753, "step": 5700 }, { "epoch": 0.11469678295646049, "eval_loss": 0.6112973690032959, "eval_runtime": 11.3099, "eval_samples_per_second": 33.422, "eval_steps_per_second": 1.149, "step": 5700 }, { "epoch": 0.11670900721885454, "grad_norm": 1.0234030485153198, "learning_rate": 1.9523497118590625e-05, "loss": 0.869, "step": 5800 }, { "epoch": 0.11872123148124858, "grad_norm": 0.7687940001487732, "learning_rate": 1.9503624531829463e-05, "loss": 0.875, "step": 5900 }, { "epoch": 0.12073345574364262, "grad_norm": 0.858860194683075, "learning_rate": 1.9483356509179633e-05, "loss": 0.8682, "step": 6000 }, { "epoch": 0.12073345574364262, "eval_loss": 0.6082560420036316, "eval_runtime": 11.2984, "eval_samples_per_second": 33.456, "eval_steps_per_second": 1.151, "step": 6000 }, { "epoch": 0.12274568000603667, "grad_norm": 0.7500011324882507, "learning_rate": 1.946269389397239e-05, "loss": 0.8667, "step": 6100 }, { "epoch": 0.12475790426843071, "grad_norm": 0.8498502373695374, "learning_rate": 1.9441637545957558e-05, "loss": 0.8717, "step": 6200 }, { "epoch": 0.12677012853082475, "grad_norm": 0.9230628609657288, "learning_rate": 1.9420188341267783e-05, "loss": 0.8689, "step": 6300 }, { "epoch": 0.12677012853082475, "eval_loss": 0.6047795414924622, "eval_runtime": 11.3052, "eval_samples_per_second": 33.436, "eval_steps_per_second": 1.15, "step": 6300 }, { "epoch": 0.1287823527932188, "grad_norm": 0.7312197089195251, "learning_rate": 1.939834717238207e-05, "loss": 0.8676, "step": 6400 }, { "epoch": 0.13079457705561284, "grad_norm": 0.7080931067466736, "learning_rate": 1.9376114948088634e-05, "loss": 0.8632, "step": 6500 }, { "epoch": 0.1328068013180069, "grad_norm": 0.793525755405426, "learning_rate": 1.9353492593447107e-05, "loss": 0.8682, "step": 6600 }, { "epoch": 0.1328068013180069, "eval_loss": 0.6011930704116821, "eval_runtime": 11.4543, "eval_samples_per_second": 33.001, "eval_steps_per_second": 1.135, "step": 6600 }, { "epoch": 0.13481902558040093, "grad_norm": 0.7798284292221069, "learning_rate": 1.9330481049750028e-05, "loss": 0.8636, "step": 6700 }, { "epoch": 0.136831249842795, "grad_norm": 0.9270545840263367, "learning_rate": 1.9307081274483698e-05, "loss": 0.8644, "step": 6800 }, { "epoch": 0.13884347410518902, "grad_norm": 0.7777066826820374, "learning_rate": 1.9283294241288315e-05, "loss": 0.8682, "step": 6900 }, { "epoch": 0.13884347410518902, "eval_loss": 0.6046885848045349, "eval_runtime": 11.4509, "eval_samples_per_second": 33.01, "eval_steps_per_second": 1.135, "step": 6900 }, { "epoch": 0.14085569836758308, "grad_norm": 0.7538514733314514, "learning_rate": 1.925912093991748e-05, "loss": 0.8654, "step": 7000 }, { "epoch": 0.1428679226299771, "grad_norm": 0.6866621375083923, "learning_rate": 1.9234562376197015e-05, "loss": 0.8497, "step": 7100 }, { "epoch": 0.14488014689237116, "grad_norm": 0.829768717288971, "learning_rate": 1.92096195719831e-05, "loss": 0.8575, "step": 7200 }, { "epoch": 0.14488014689237116, "eval_loss": 0.6001401543617249, "eval_runtime": 11.2516, "eval_samples_per_second": 33.595, "eval_steps_per_second": 1.155, "step": 7200 }, { "epoch": 0.1468923711547652, "grad_norm": 0.8665058016777039, "learning_rate": 1.9184293565119755e-05, "loss": 0.8612, "step": 7300 }, { "epoch": 0.14890459541715925, "grad_norm": 0.7740942239761353, "learning_rate": 1.9158585409395674e-05, "loss": 0.8596, "step": 7400 }, { "epoch": 0.15091681967955328, "grad_norm": 0.672917902469635, "learning_rate": 1.9132496174500364e-05, "loss": 0.854, "step": 7500 }, { "epoch": 0.15091681967955328, "eval_loss": 0.5939906239509583, "eval_runtime": 11.3101, "eval_samples_per_second": 33.421, "eval_steps_per_second": 1.149, "step": 7500 }, { "epoch": 0.15292904394194734, "grad_norm": 0.719465970993042, "learning_rate": 1.9106026945979627e-05, "loss": 0.8615, "step": 7600 }, { "epoch": 0.15494126820434137, "grad_norm": 0.7433097958564758, "learning_rate": 1.9079178825190416e-05, "loss": 0.8564, "step": 7700 }, { "epoch": 0.15695349246673543, "grad_norm": 0.7390840649604797, "learning_rate": 1.9051952929254983e-05, "loss": 0.8526, "step": 7800 }, { "epoch": 0.15695349246673543, "eval_loss": 0.5941105484962463, "eval_runtime": 11.2494, "eval_samples_per_second": 33.602, "eval_steps_per_second": 1.156, "step": 7800 }, { "epoch": 0.15896571672912946, "grad_norm": 0.721076488494873, "learning_rate": 1.902435039101442e-05, "loss": 0.8535, "step": 7900 }, { "epoch": 0.16097794099152352, "grad_norm": 0.7117634415626526, "learning_rate": 1.899637235898151e-05, "loss": 0.8548, "step": 8000 }, { "epoch": 0.16299016525391755, "grad_norm": 0.7325859069824219, "learning_rate": 1.8968019997292937e-05, "loss": 0.8661, "step": 8100 }, { "epoch": 0.16299016525391755, "eval_loss": 0.5943772196769714, "eval_runtime": 11.2277, "eval_samples_per_second": 33.667, "eval_steps_per_second": 1.158, "step": 8100 }, { "epoch": 0.1650023895163116, "grad_norm": 0.8927565217018127, "learning_rate": 1.893929448566085e-05, "loss": 0.8535, "step": 8200 }, { "epoch": 0.16701461377870563, "grad_norm": 0.9083840250968933, "learning_rate": 1.8910197019323782e-05, "loss": 0.8581, "step": 8300 }, { "epoch": 0.1690268380410997, "grad_norm": 0.7133694291114807, "learning_rate": 1.8880728808996906e-05, "loss": 0.8491, "step": 8400 }, { "epoch": 0.1690268380410997, "eval_loss": 0.5923792719841003, "eval_runtime": 11.2757, "eval_samples_per_second": 33.523, "eval_steps_per_second": 1.153, "step": 8400 }, { "epoch": 0.17103906230349372, "grad_norm": 0.7994174361228943, "learning_rate": 1.8850891080821673e-05, "loss": 0.8577, "step": 8500 }, { "epoch": 0.17305128656588778, "grad_norm": 1.106224775314331, "learning_rate": 1.8820685076314782e-05, "loss": 0.849, "step": 8600 }, { "epoch": 0.1750635108282818, "grad_norm": 1.0492300987243652, "learning_rate": 1.8790112052316523e-05, "loss": 0.8579, "step": 8700 }, { "epoch": 0.1750635108282818, "eval_loss": 0.6185858845710754, "eval_runtime": 11.3469, "eval_samples_per_second": 33.313, "eval_steps_per_second": 1.146, "step": 8700 }, { "epoch": 0.17707573509067587, "grad_norm": 0.7523091435432434, "learning_rate": 1.875917328093849e-05, "loss": 0.8548, "step": 8800 }, { "epoch": 0.1790879593530699, "grad_norm": 0.8177125453948975, "learning_rate": 1.8727870049510636e-05, "loss": 0.8512, "step": 8900 }, { "epoch": 0.18110018361546396, "grad_norm": 0.7863544821739197, "learning_rate": 1.869620366052772e-05, "loss": 0.8474, "step": 9000 }, { "epoch": 0.18110018361546396, "eval_loss": 0.5867164134979248, "eval_runtime": 11.2542, "eval_samples_per_second": 33.588, "eval_steps_per_second": 1.155, "step": 9000 }, { "epoch": 0.18311240787785799, "grad_norm": 0.7436131834983826, "learning_rate": 1.8664175431595106e-05, "loss": 0.8587, "step": 9100 }, { "epoch": 0.18512463214025204, "grad_norm": 0.803816020488739, "learning_rate": 1.8631786695373943e-05, "loss": 0.8455, "step": 9200 }, { "epoch": 0.18713685640264607, "grad_norm": 0.9202460050582886, "learning_rate": 1.8599038799525712e-05, "loss": 0.8513, "step": 9300 }, { "epoch": 0.18713685640264607, "eval_loss": 0.583454430103302, "eval_runtime": 11.2388, "eval_samples_per_second": 33.633, "eval_steps_per_second": 1.157, "step": 9300 }, { "epoch": 0.18914908066504013, "grad_norm": 0.8134105801582336, "learning_rate": 1.856593310665614e-05, "loss": 0.8499, "step": 9400 }, { "epoch": 0.19116130492743416, "grad_norm": 0.7113932967185974, "learning_rate": 1.8532470994258533e-05, "loss": 0.849, "step": 9500 }, { "epoch": 0.19317352918982822, "grad_norm": 0.8230564594268799, "learning_rate": 1.8498653854656424e-05, "loss": 0.8413, "step": 9600 }, { "epoch": 0.19317352918982822, "eval_loss": 0.5848163962364197, "eval_runtime": 11.2801, "eval_samples_per_second": 33.51, "eval_steps_per_second": 1.152, "step": 9600 }, { "epoch": 0.19518575345222225, "grad_norm": 0.6756404638290405, "learning_rate": 1.8464483094945667e-05, "loss": 0.8543, "step": 9700 }, { "epoch": 0.1971979777146163, "grad_norm": 0.7398785352706909, "learning_rate": 1.8429960136935878e-05, "loss": 0.8428, "step": 9800 }, { "epoch": 0.19921020197701034, "grad_norm": 0.7419747710227966, "learning_rate": 1.8395086417091272e-05, "loss": 0.8516, "step": 9900 }, { "epoch": 0.19921020197701034, "eval_loss": 0.5863896608352661, "eval_runtime": 11.3198, "eval_samples_per_second": 33.393, "eval_steps_per_second": 1.148, "step": 9900 }, { "epoch": 0.2012224262394044, "grad_norm": 0.8145945072174072, "learning_rate": 1.8359863386470904e-05, "loss": 0.8508, "step": 10000 }, { "epoch": 0.20323465050179843, "grad_norm": 0.7068437933921814, "learning_rate": 1.8324292510668278e-05, "loss": 0.8495, "step": 10100 }, { "epoch": 0.20524687476419248, "grad_norm": 0.7419267892837524, "learning_rate": 1.828837526975038e-05, "loss": 0.8461, "step": 10200 }, { "epoch": 0.20524687476419248, "eval_loss": 0.5834963917732239, "eval_runtime": 11.7842, "eval_samples_per_second": 32.077, "eval_steps_per_second": 1.103, "step": 10200 }, { "epoch": 0.2072590990265865, "grad_norm": 1.129436731338501, "learning_rate": 1.8252113158196078e-05, "loss": 0.8435, "step": 10300 }, { "epoch": 0.20927132328898054, "grad_norm": 0.6937255859375, "learning_rate": 1.821550768483396e-05, "loss": 0.8485, "step": 10400 }, { "epoch": 0.2112835475513746, "grad_norm": 0.8506975769996643, "learning_rate": 1.8178560372779525e-05, "loss": 0.8473, "step": 10500 }, { "epoch": 0.2112835475513746, "eval_loss": 0.5813661217689514, "eval_runtime": 11.832, "eval_samples_per_second": 31.947, "eval_steps_per_second": 1.099, "step": 10500 }, { "epoch": 0.21329577181376863, "grad_norm": 0.733964204788208, "learning_rate": 1.814127275937183e-05, "loss": 0.836, "step": 10600 }, { "epoch": 0.2153079960761627, "grad_norm": 0.7400948405265808, "learning_rate": 1.8103646396109523e-05, "loss": 0.8473, "step": 10700 }, { "epoch": 0.21732022033855672, "grad_norm": 0.9023438096046448, "learning_rate": 1.8065682848586266e-05, "loss": 0.8468, "step": 10800 }, { "epoch": 0.21732022033855672, "eval_loss": 0.5793610215187073, "eval_runtime": 11.234, "eval_samples_per_second": 33.648, "eval_steps_per_second": 1.157, "step": 10800 }, { "epoch": 0.21933244460095078, "grad_norm": 0.82066810131073, "learning_rate": 1.8027383696425613e-05, "loss": 0.8457, "step": 10900 }, { "epoch": 0.2213446688633448, "grad_norm": 0.6094478964805603, "learning_rate": 1.7988750533215276e-05, "loss": 0.8408, "step": 11000 }, { "epoch": 0.22335689312573886, "grad_norm": 0.7535290122032166, "learning_rate": 1.7949784966440823e-05, "loss": 0.8403, "step": 11100 }, { "epoch": 0.22335689312573886, "eval_loss": 0.578126072883606, "eval_runtime": 11.202, "eval_samples_per_second": 33.744, "eval_steps_per_second": 1.161, "step": 11100 }, { "epoch": 0.2253691173881329, "grad_norm": 0.7472143769264221, "learning_rate": 1.791048861741877e-05, "loss": 0.8434, "step": 11200 }, { "epoch": 0.22738134165052695, "grad_norm": 0.8236815333366394, "learning_rate": 1.7870863121229162e-05, "loss": 0.8273, "step": 11300 }, { "epoch": 0.22939356591292098, "grad_norm": 0.6772099137306213, "learning_rate": 1.783091012664749e-05, "loss": 0.8355, "step": 11400 }, { "epoch": 0.22939356591292098, "eval_loss": 0.5848814249038696, "eval_runtime": 11.4019, "eval_samples_per_second": 33.152, "eval_steps_per_second": 1.14, "step": 11400 }, { "epoch": 0.23140579017531504, "grad_norm": 0.7480434775352478, "learning_rate": 1.779063129607612e-05, "loss": 0.8437, "step": 11500 }, { "epoch": 0.23341801443770907, "grad_norm": 0.8341161608695984, "learning_rate": 1.7750028305475125e-05, "loss": 0.8384, "step": 11600 }, { "epoch": 0.23543023870010313, "grad_norm": 0.9399694800376892, "learning_rate": 1.7709102844292516e-05, "loss": 0.8419, "step": 11700 }, { "epoch": 0.23543023870010313, "eval_loss": 0.5769637227058411, "eval_runtime": 11.2547, "eval_samples_per_second": 33.586, "eval_steps_per_second": 1.155, "step": 11700 }, { "epoch": 0.23744246296249716, "grad_norm": 0.8473734855651855, "learning_rate": 1.7667856615393987e-05, "loss": 0.8346, "step": 11800 }, { "epoch": 0.23945468722489122, "grad_norm": 0.6887069940567017, "learning_rate": 1.7626291334992027e-05, "loss": 0.8381, "step": 11900 }, { "epoch": 0.24146691148728525, "grad_norm": 0.6946566700935364, "learning_rate": 1.758440873257454e-05, "loss": 0.8345, "step": 12000 }, { "epoch": 0.24146691148728525, "eval_loss": 0.5747541785240173, "eval_runtime": 11.4122, "eval_samples_per_second": 33.122, "eval_steps_per_second": 1.139, "step": 12000 }, { "epoch": 0.2434791357496793, "grad_norm": 0.681305468082428, "learning_rate": 1.7542210550832854e-05, "loss": 0.841, "step": 12100 }, { "epoch": 0.24549136001207333, "grad_norm": 0.8475384712219238, "learning_rate": 1.749969854558923e-05, "loss": 0.8392, "step": 12200 }, { "epoch": 0.2475035842744674, "grad_norm": 1.1652250289916992, "learning_rate": 1.745687448572379e-05, "loss": 0.8388, "step": 12300 }, { "epoch": 0.2475035842744674, "eval_loss": 0.5746700763702393, "eval_runtime": 11.4476, "eval_samples_per_second": 33.02, "eval_steps_per_second": 1.136, "step": 12300 }, { "epoch": 0.24951580853686142, "grad_norm": 0.7575956583023071, "learning_rate": 1.741374015310094e-05, "loss": 0.8362, "step": 12400 }, { "epoch": 0.25152803279925545, "grad_norm": 0.7489831447601318, "learning_rate": 1.737029734249519e-05, "loss": 0.836, "step": 12500 }, { "epoch": 0.2535402570616495, "grad_norm": 0.7467206716537476, "learning_rate": 1.732654786151651e-05, "loss": 0.8317, "step": 12600 }, { "epoch": 0.2535402570616495, "eval_loss": 0.5750060081481934, "eval_runtime": 11.2549, "eval_samples_per_second": 33.585, "eval_steps_per_second": 1.155, "step": 12600 }, { "epoch": 0.25555248132404357, "grad_norm": 0.7825116515159607, "learning_rate": 1.7282493530535095e-05, "loss": 0.8335, "step": 12700 }, { "epoch": 0.2575647055864376, "grad_norm": 0.8054665923118591, "learning_rate": 1.723813618260564e-05, "loss": 0.8332, "step": 12800 }, { "epoch": 0.25957692984883163, "grad_norm": 0.740932822227478, "learning_rate": 1.7193477663391055e-05, "loss": 0.8333, "step": 12900 }, { "epoch": 0.25957692984883163, "eval_loss": 0.574753999710083, "eval_runtime": 11.3005, "eval_samples_per_second": 33.45, "eval_steps_per_second": 1.15, "step": 12900 }, { "epoch": 0.2615891541112257, "grad_norm": 0.6655648350715637, "learning_rate": 1.714851983108567e-05, "loss": 0.8332, "step": 13000 }, { "epoch": 0.26360137837361974, "grad_norm": 0.8892366886138916, "learning_rate": 1.710326455633792e-05, "loss": 0.833, "step": 13100 }, { "epoch": 0.2656136026360138, "grad_norm": 0.7081986665725708, "learning_rate": 1.7057713722172505e-05, "loss": 0.8352, "step": 13200 }, { "epoch": 0.2656136026360138, "eval_loss": 0.569306492805481, "eval_runtime": 11.2208, "eval_samples_per_second": 33.688, "eval_steps_per_second": 1.159, "step": 13200 }, { "epoch": 0.2676258268984078, "grad_norm": 0.7726171612739563, "learning_rate": 1.701186922391206e-05, "loss": 0.8325, "step": 13300 }, { "epoch": 0.26963805116080186, "grad_norm": 0.6000068187713623, "learning_rate": 1.6965732969098262e-05, "loss": 0.8303, "step": 13400 }, { "epoch": 0.2716502754231959, "grad_norm": 0.7751488089561462, "learning_rate": 1.6919306877412474e-05, "loss": 0.8311, "step": 13500 }, { "epoch": 0.2716502754231959, "eval_loss": 0.5708428621292114, "eval_runtime": 11.2236, "eval_samples_per_second": 33.679, "eval_steps_per_second": 1.158, "step": 13500 }, { "epoch": 0.27366249968559, "grad_norm": 0.7674184441566467, "learning_rate": 1.6872592880595872e-05, "loss": 0.8391, "step": 13600 }, { "epoch": 0.275674723947984, "grad_norm": 0.999799370765686, "learning_rate": 1.6825592922369066e-05, "loss": 0.8215, "step": 13700 }, { "epoch": 0.27768694821037804, "grad_norm": 0.7192254662513733, "learning_rate": 1.6778308958351213e-05, "loss": 0.8304, "step": 13800 }, { "epoch": 0.27768694821037804, "eval_loss": 0.5696760416030884, "eval_runtime": 11.2331, "eval_samples_per_second": 33.65, "eval_steps_per_second": 1.157, "step": 13800 }, { "epoch": 0.2796991724727721, "grad_norm": 1.1758594512939453, "learning_rate": 1.673074295597867e-05, "loss": 0.8346, "step": 13900 }, { "epoch": 0.28171139673516615, "grad_norm": 0.5974677801132202, "learning_rate": 1.6682896894423094e-05, "loss": 0.824, "step": 14000 }, { "epoch": 0.28372362099756016, "grad_norm": 0.720886766910553, "learning_rate": 1.6634772764509128e-05, "loss": 0.8246, "step": 14100 }, { "epoch": 0.28372362099756016, "eval_loss": 0.5675772428512573, "eval_runtime": 11.3956, "eval_samples_per_second": 33.171, "eval_steps_per_second": 1.141, "step": 14100 }, { "epoch": 0.2857358452599542, "grad_norm": 0.6889091730117798, "learning_rate": 1.6586372568631545e-05, "loss": 0.8231, "step": 14200 }, { "epoch": 0.28774806952234827, "grad_norm": 0.6523007154464722, "learning_rate": 1.6537698320671933e-05, "loss": 0.8272, "step": 14300 }, { "epoch": 0.28976029378474233, "grad_norm": 0.7638033628463745, "learning_rate": 1.64887520459149e-05, "loss": 0.8306, "step": 14400 }, { "epoch": 0.28976029378474233, "eval_loss": 0.569464921951294, "eval_runtime": 11.248, "eval_samples_per_second": 33.606, "eval_steps_per_second": 1.156, "step": 14400 }, { "epoch": 0.29177251804713633, "grad_norm": 0.6883799433708191, "learning_rate": 1.6439535780963808e-05, "loss": 0.8327, "step": 14500 }, { "epoch": 0.2937847423095304, "grad_norm": 0.8693552017211914, "learning_rate": 1.6390051573656028e-05, "loss": 0.8299, "step": 14600 }, { "epoch": 0.29579696657192445, "grad_norm": 0.6811352372169495, "learning_rate": 1.634030148297773e-05, "loss": 0.8257, "step": 14700 }, { "epoch": 0.29579696657192445, "eval_loss": 0.5680450797080994, "eval_runtime": 11.451, "eval_samples_per_second": 33.01, "eval_steps_per_second": 1.135, "step": 14700 }, { "epoch": 0.2978091908343185, "grad_norm": 0.7108572721481323, "learning_rate": 1.629028757897821e-05, "loss": 0.826, "step": 14800 }, { "epoch": 0.2998214150967125, "grad_norm": 0.701524555683136, "learning_rate": 1.6240011942683774e-05, "loss": 0.8233, "step": 14900 }, { "epoch": 0.30183363935910656, "grad_norm": 0.6415804028511047, "learning_rate": 1.6189476666011123e-05, "loss": 0.8174, "step": 15000 }, { "epoch": 0.30183363935910656, "eval_loss": 0.5662389397621155, "eval_runtime": 11.3747, "eval_samples_per_second": 33.232, "eval_steps_per_second": 1.143, "step": 15000 }, { "epoch": 0.3038458636215006, "grad_norm": 0.593760073184967, "learning_rate": 1.6138683851680328e-05, "loss": 0.8269, "step": 15100 }, { "epoch": 0.3058580878838947, "grad_norm": 0.6708555221557617, "learning_rate": 1.608763561312733e-05, "loss": 0.8277, "step": 15200 }, { "epoch": 0.3078703121462887, "grad_norm": 0.5819365382194519, "learning_rate": 1.603633407441601e-05, "loss": 0.8237, "step": 15300 }, { "epoch": 0.3078703121462887, "eval_loss": 0.5628697872161865, "eval_runtime": 11.3199, "eval_samples_per_second": 33.393, "eval_steps_per_second": 1.148, "step": 15300 }, { "epoch": 0.30988253640868274, "grad_norm": 0.725537896156311, "learning_rate": 1.5984781370149798e-05, "loss": 0.8355, "step": 15400 }, { "epoch": 0.3118947606710768, "grad_norm": 0.642382800579071, "learning_rate": 1.5932979645382863e-05, "loss": 0.8292, "step": 15500 }, { "epoch": 0.31390698493347086, "grad_norm": 0.6141934394836426, "learning_rate": 1.588093105553086e-05, "loss": 0.8306, "step": 15600 }, { "epoch": 0.31390698493347086, "eval_loss": 0.5633600354194641, "eval_runtime": 11.3793, "eval_samples_per_second": 33.218, "eval_steps_per_second": 1.142, "step": 15600 }, { "epoch": 0.31591920919586486, "grad_norm": 0.6902384757995605, "learning_rate": 1.5828637766281238e-05, "loss": 0.8243, "step": 15700 }, { "epoch": 0.3179314334582589, "grad_norm": 0.7464603781700134, "learning_rate": 1.5776101953503134e-05, "loss": 0.8296, "step": 15800 }, { "epoch": 0.319943657720653, "grad_norm": 0.6735148429870605, "learning_rate": 1.5723325803156834e-05, "loss": 0.8168, "step": 15900 }, { "epoch": 0.319943657720653, "eval_loss": 0.5626727938652039, "eval_runtime": 11.3991, "eval_samples_per_second": 33.16, "eval_steps_per_second": 1.14, "step": 15900 }, { "epoch": 0.32195588198304703, "grad_norm": 0.7461301684379578, "learning_rate": 1.5670311511202823e-05, "loss": 0.8175, "step": 16000 }, { "epoch": 0.32396810624544103, "grad_norm": 0.6454249620437622, "learning_rate": 1.5617061283510404e-05, "loss": 0.8287, "step": 16100 }, { "epoch": 0.3259803305078351, "grad_norm": 0.723892331123352, "learning_rate": 1.5563577335765925e-05, "loss": 0.8256, "step": 16200 }, { "epoch": 0.3259803305078351, "eval_loss": 0.5635449290275574, "eval_runtime": 11.3171, "eval_samples_per_second": 33.401, "eval_steps_per_second": 1.149, "step": 16200 }, { "epoch": 0.32799255477022915, "grad_norm": 0.6277914047241211, "learning_rate": 1.5509861893380576e-05, "loss": 0.8274, "step": 16300 }, { "epoch": 0.3300047790326232, "grad_norm": 0.6103200316429138, "learning_rate": 1.5455917191397806e-05, "loss": 0.8207, "step": 16400 }, { "epoch": 0.3320170032950172, "grad_norm": 0.6216299533843994, "learning_rate": 1.5401745474400306e-05, "loss": 0.8218, "step": 16500 }, { "epoch": 0.3320170032950172, "eval_loss": 0.5613713264465332, "eval_runtime": 11.3097, "eval_samples_per_second": 33.423, "eval_steps_per_second": 1.149, "step": 16500 }, { "epoch": 0.33402922755741127, "grad_norm": 0.6130411624908447, "learning_rate": 1.5347348996416626e-05, "loss": 0.8193, "step": 16600 }, { "epoch": 0.3360414518198053, "grad_norm": 0.7175905704498291, "learning_rate": 1.5292730020827394e-05, "loss": 0.8205, "step": 16700 }, { "epoch": 0.3380536760821994, "grad_norm": 0.5804928541183472, "learning_rate": 1.5237890820271124e-05, "loss": 0.8256, "step": 16800 }, { "epoch": 0.3380536760821994, "eval_loss": 0.558940589427948, "eval_runtime": 11.507, "eval_samples_per_second": 32.849, "eval_steps_per_second": 1.13, "step": 16800 }, { "epoch": 0.3400659003445934, "grad_norm": 0.7494300007820129, "learning_rate": 1.518283367654966e-05, "loss": 0.8225, "step": 16900 }, { "epoch": 0.34207812460698744, "grad_norm": 0.5440366268157959, "learning_rate": 1.5127560880533242e-05, "loss": 0.8272, "step": 17000 }, { "epoch": 0.3440903488693815, "grad_norm": 0.5601567625999451, "learning_rate": 1.5072074732065165e-05, "loss": 0.829, "step": 17100 }, { "epoch": 0.3440903488693815, "eval_loss": 0.5592995285987854, "eval_runtime": 11.056, "eval_samples_per_second": 34.19, "eval_steps_per_second": 1.176, "step": 17100 }, { "epoch": 0.34610257313177556, "grad_norm": 0.6553789377212524, "learning_rate": 1.5016377539866106e-05, "loss": 0.824, "step": 17200 }, { "epoch": 0.34811479739416956, "grad_norm": 0.7243614792823792, "learning_rate": 1.4960471621438047e-05, "loss": 0.8206, "step": 17300 }, { "epoch": 0.3501270216565636, "grad_norm": 0.7584229111671448, "learning_rate": 1.4904359302967848e-05, "loss": 0.8264, "step": 17400 }, { "epoch": 0.3501270216565636, "eval_loss": 0.5582433342933655, "eval_runtime": 11.4613, "eval_samples_per_second": 32.98, "eval_steps_per_second": 1.134, "step": 17400 }, { "epoch": 0.3521392459189577, "grad_norm": 0.9413104057312012, "learning_rate": 1.4848042919230464e-05, "loss": 0.8082, "step": 17500 }, { "epoch": 0.35415147018135174, "grad_norm": 0.7952352166175842, "learning_rate": 1.4791524813491789e-05, "loss": 0.8138, "step": 17600 }, { "epoch": 0.35616369444374574, "grad_norm": 0.6611462235450745, "learning_rate": 1.4734807337411166e-05, "loss": 0.817, "step": 17700 }, { "epoch": 0.35616369444374574, "eval_loss": 0.5570442080497742, "eval_runtime": 11.4931, "eval_samples_per_second": 32.889, "eval_steps_per_second": 1.131, "step": 17700 }, { "epoch": 0.3581759187061398, "grad_norm": 0.8845998644828796, "learning_rate": 1.4677892850943516e-05, "loss": 0.8124, "step": 17800 }, { "epoch": 0.36018814296853385, "grad_norm": 0.6421878337860107, "learning_rate": 1.462078372224117e-05, "loss": 0.814, "step": 17900 }, { "epoch": 0.3622003672309279, "grad_norm": 0.6532554030418396, "learning_rate": 1.456348232755531e-05, "loss": 0.8081, "step": 18000 }, { "epoch": 0.3622003672309279, "eval_loss": 0.5557852983474731, "eval_runtime": 11.4159, "eval_samples_per_second": 33.112, "eval_steps_per_second": 1.139, "step": 18000 }, { "epoch": 0.3642125914933219, "grad_norm": 0.8483557105064392, "learning_rate": 1.4505991051137112e-05, "loss": 0.8137, "step": 18100 }, { "epoch": 0.36622481575571597, "grad_norm": 0.7414484620094299, "learning_rate": 1.4448312285138524e-05, "loss": 0.8095, "step": 18200 }, { "epoch": 0.36823704001811003, "grad_norm": 0.6685389280319214, "learning_rate": 1.4390448429512747e-05, "loss": 0.8108, "step": 18300 }, { "epoch": 0.36823704001811003, "eval_loss": 0.5559925436973572, "eval_runtime": 11.4267, "eval_samples_per_second": 33.081, "eval_steps_per_second": 1.138, "step": 18300 }, { "epoch": 0.3702492642805041, "grad_norm": 0.5973154306411743, "learning_rate": 1.4332401891914365e-05, "loss": 0.8144, "step": 18400 }, { "epoch": 0.3722614885428981, "grad_norm": 0.6153602004051208, "learning_rate": 1.4274175087599166e-05, "loss": 0.8234, "step": 18500 }, { "epoch": 0.37427371280529215, "grad_norm": 0.6379988789558411, "learning_rate": 1.4215770439323657e-05, "loss": 0.8137, "step": 18600 }, { "epoch": 0.37427371280529215, "eval_loss": 0.5545734763145447, "eval_runtime": 11.3444, "eval_samples_per_second": 33.32, "eval_steps_per_second": 1.146, "step": 18600 }, { "epoch": 0.3762859370676862, "grad_norm": 0.6836999654769897, "learning_rate": 1.4157190377244233e-05, "loss": 0.811, "step": 18700 }, { "epoch": 0.37829816133008026, "grad_norm": 0.5659916400909424, "learning_rate": 1.409843733881608e-05, "loss": 0.8175, "step": 18800 }, { "epoch": 0.38031038559247426, "grad_norm": 0.6270354986190796, "learning_rate": 1.4039513768691753e-05, "loss": 0.8221, "step": 18900 }, { "epoch": 0.38031038559247426, "eval_loss": 0.5561990737915039, "eval_runtime": 11.437, "eval_samples_per_second": 33.051, "eval_steps_per_second": 1.137, "step": 18900 }, { "epoch": 0.3823226098548683, "grad_norm": 0.6403433680534363, "learning_rate": 1.3980422118619447e-05, "loss": 0.8156, "step": 19000 }, { "epoch": 0.3843348341172624, "grad_norm": 0.5956655144691467, "learning_rate": 1.3921164847340996e-05, "loss": 0.8161, "step": 19100 }, { "epoch": 0.38634705837965644, "grad_norm": 1.1075905561447144, "learning_rate": 1.3861744420489547e-05, "loss": 0.8115, "step": 19200 }, { "epoch": 0.38634705837965644, "eval_loss": 0.5551438927650452, "eval_runtime": 11.6061, "eval_samples_per_second": 32.569, "eval_steps_per_second": 1.12, "step": 19200 }, { "epoch": 0.38835928264205044, "grad_norm": 0.5919958353042603, "learning_rate": 1.380216331048699e-05, "loss": 0.8042, "step": 19300 }, { "epoch": 0.3903715069044445, "grad_norm": 0.599104106426239, "learning_rate": 1.3742423996441067e-05, "loss": 0.8107, "step": 19400 }, { "epoch": 0.39238373116683856, "grad_norm": 0.6891294121742249, "learning_rate": 1.3682528964042234e-05, "loss": 0.8082, "step": 19500 }, { "epoch": 0.39238373116683856, "eval_loss": 0.5554007291793823, "eval_runtime": 11.5763, "eval_samples_per_second": 32.653, "eval_steps_per_second": 1.123, "step": 19500 }, { "epoch": 0.3943959554292326, "grad_norm": 0.6625336408615112, "learning_rate": 1.3622480705460217e-05, "loss": 0.8161, "step": 19600 }, { "epoch": 0.3964081796916266, "grad_norm": 0.6874691843986511, "learning_rate": 1.3562281719240323e-05, "loss": 0.808, "step": 19700 }, { "epoch": 0.3984204039540207, "grad_norm": 0.6335239410400391, "learning_rate": 1.3501934510199479e-05, "loss": 0.8172, "step": 19800 }, { "epoch": 0.3984204039540207, "eval_loss": 0.5533725023269653, "eval_runtime": 11.4224, "eval_samples_per_second": 33.093, "eval_steps_per_second": 1.138, "step": 19800 }, { "epoch": 0.40043262821641473, "grad_norm": 0.6799935102462769, "learning_rate": 1.3441441589322013e-05, "loss": 0.8102, "step": 19900 }, { "epoch": 0.4024448524788088, "grad_norm": 0.7125223278999329, "learning_rate": 1.338080547365517e-05, "loss": 0.8196, "step": 20000 }, { "epoch": 0.4044570767412028, "grad_norm": 0.6379702091217041, "learning_rate": 1.3320028686204378e-05, "loss": 0.7988, "step": 20100 }, { "epoch": 0.4044570767412028, "eval_loss": 0.5532128214836121, "eval_runtime": 11.5518, "eval_samples_per_second": 32.722, "eval_steps_per_second": 1.125, "step": 20100 }, { "epoch": 0.40646930100359685, "grad_norm": 0.6244897842407227, "learning_rate": 1.325911375582827e-05, "loss": 0.8078, "step": 20200 }, { "epoch": 0.4084815252659909, "grad_norm": 0.6567655801773071, "learning_rate": 1.319806321713346e-05, "loss": 0.812, "step": 20300 }, { "epoch": 0.41049374952838497, "grad_norm": 0.7605450749397278, "learning_rate": 1.3136879610369091e-05, "loss": 0.8078, "step": 20400 }, { "epoch": 0.41049374952838497, "eval_loss": 0.5506391525268555, "eval_runtime": 11.3697, "eval_samples_per_second": 33.246, "eval_steps_per_second": 1.143, "step": 20400 }, { "epoch": 0.41250597379077897, "grad_norm": 0.669282853603363, "learning_rate": 1.3075565481321122e-05, "loss": 0.8086, "step": 20500 }, { "epoch": 0.414518198053173, "grad_norm": 0.6792070269584656, "learning_rate": 1.301412338120641e-05, "loss": 0.8075, "step": 20600 }, { "epoch": 0.4165304223155671, "grad_norm": 0.5937780737876892, "learning_rate": 1.2952555866566554e-05, "loss": 0.8151, "step": 20700 }, { "epoch": 0.4165304223155671, "eval_loss": 0.5495349168777466, "eval_runtime": 11.3633, "eval_samples_per_second": 33.265, "eval_steps_per_second": 1.144, "step": 20700 }, { "epoch": 0.4185426465779611, "grad_norm": 0.6547305583953857, "learning_rate": 1.2890865499161522e-05, "loss": 0.8022, "step": 20800 }, { "epoch": 0.42055487084035514, "grad_norm": 0.5942917466163635, "learning_rate": 1.2829054845863054e-05, "loss": 0.8079, "step": 20900 }, { "epoch": 0.4225670951027492, "grad_norm": 0.5794849991798401, "learning_rate": 1.2767126478547865e-05, "loss": 0.8152, "step": 21000 }, { "epoch": 0.4225670951027492, "eval_loss": 0.5491987466812134, "eval_runtime": 11.3343, "eval_samples_per_second": 33.35, "eval_steps_per_second": 1.147, "step": 21000 }, { "epoch": 0.42457931936514326, "grad_norm": 0.6574000120162964, "learning_rate": 1.2705082973990623e-05, "loss": 0.8087, "step": 21100 }, { "epoch": 0.42659154362753726, "grad_norm": 0.6523112654685974, "learning_rate": 1.264292691375674e-05, "loss": 0.8098, "step": 21200 }, { "epoch": 0.4286037678899313, "grad_norm": 0.6403859853744507, "learning_rate": 1.2580660884094944e-05, "loss": 0.8125, "step": 21300 }, { "epoch": 0.4286037678899313, "eval_loss": 0.5487639307975769, "eval_runtime": 11.6017, "eval_samples_per_second": 32.581, "eval_steps_per_second": 1.121, "step": 21300 }, { "epoch": 0.4306159921523254, "grad_norm": 0.6883541345596313, "learning_rate": 1.2518287475829687e-05, "loss": 0.804, "step": 21400 }, { "epoch": 0.43262821641471944, "grad_norm": 0.6650357246398926, "learning_rate": 1.2455809284253329e-05, "loss": 0.8097, "step": 21500 }, { "epoch": 0.43464044067711344, "grad_norm": 0.6048406958580017, "learning_rate": 1.239322890901815e-05, "loss": 0.8059, "step": 21600 }, { "epoch": 0.43464044067711344, "eval_loss": 0.5487421751022339, "eval_runtime": 11.4779, "eval_samples_per_second": 32.933, "eval_steps_per_second": 1.133, "step": 21600 }, { "epoch": 0.4366526649395075, "grad_norm": 0.6876850724220276, "learning_rate": 1.233054895402819e-05, "loss": 0.8027, "step": 21700 }, { "epoch": 0.43866488920190155, "grad_norm": 0.656778872013092, "learning_rate": 1.2267772027330893e-05, "loss": 0.8124, "step": 21800 }, { "epoch": 0.4406771134642956, "grad_norm": 0.6603732109069824, "learning_rate": 1.22049007410086e-05, "loss": 0.8032, "step": 21900 }, { "epoch": 0.4406771134642956, "eval_loss": 0.547619104385376, "eval_runtime": 11.4392, "eval_samples_per_second": 33.044, "eval_steps_per_second": 1.136, "step": 21900 }, { "epoch": 0.4426893377266896, "grad_norm": 0.5987362861633301, "learning_rate": 1.2141937711069857e-05, "loss": 0.8075, "step": 22000 }, { "epoch": 0.44470156198908367, "grad_norm": 0.6756895780563354, "learning_rate": 1.2078885557340562e-05, "loss": 0.8092, "step": 22100 }, { "epoch": 0.44671378625147773, "grad_norm": 0.7242164015769958, "learning_rate": 1.2015746903354968e-05, "loss": 0.8156, "step": 22200 }, { "epoch": 0.44671378625147773, "eval_loss": 0.5490314364433289, "eval_runtime": 11.6139, "eval_samples_per_second": 32.547, "eval_steps_per_second": 1.119, "step": 22200 }, { "epoch": 0.4487260105138718, "grad_norm": 0.77918541431427, "learning_rate": 1.1952524376246504e-05, "loss": 0.8063, "step": 22300 }, { "epoch": 0.4507382347762658, "grad_norm": 0.6913318634033203, "learning_rate": 1.1889220606638476e-05, "loss": 0.8079, "step": 22400 }, { "epoch": 0.45275045903865985, "grad_norm": 0.747986376285553, "learning_rate": 1.1825838228534607e-05, "loss": 0.8033, "step": 22500 }, { "epoch": 0.45275045903865985, "eval_loss": 0.5468713045120239, "eval_runtime": 11.4, "eval_samples_per_second": 33.158, "eval_steps_per_second": 1.14, "step": 22500 }, { "epoch": 0.4547626833010539, "grad_norm": 0.6693961024284363, "learning_rate": 1.1762379879209442e-05, "loss": 0.8089, "step": 22600 }, { "epoch": 0.45677490756344796, "grad_norm": 0.6168875098228455, "learning_rate": 1.1698848199098596e-05, "loss": 0.7998, "step": 22700 }, { "epoch": 0.45878713182584197, "grad_norm": 0.6753715872764587, "learning_rate": 1.1635245831688913e-05, "loss": 0.8057, "step": 22800 }, { "epoch": 0.45878713182584197, "eval_loss": 0.5467536449432373, "eval_runtime": 11.3082, "eval_samples_per_second": 33.427, "eval_steps_per_second": 1.15, "step": 22800 }, { "epoch": 0.460799356088236, "grad_norm": 0.6399224996566772, "learning_rate": 1.1571575423408456e-05, "loss": 0.7965, "step": 22900 }, { "epoch": 0.4628115803506301, "grad_norm": 0.5371870994567871, "learning_rate": 1.1507839623516401e-05, "loss": 0.8014, "step": 23000 }, { "epoch": 0.46482380461302414, "grad_norm": 0.711793839931488, "learning_rate": 1.1444041083992801e-05, "loss": 0.8081, "step": 23100 }, { "epoch": 0.46482380461302414, "eval_loss": 0.5455725193023682, "eval_runtime": 11.4796, "eval_samples_per_second": 32.928, "eval_steps_per_second": 1.132, "step": 23100 }, { "epoch": 0.46683602887541814, "grad_norm": 0.566677451133728, "learning_rate": 1.1380182459428234e-05, "loss": 0.8027, "step": 23200 }, { "epoch": 0.4688482531378122, "grad_norm": 0.7086474895477295, "learning_rate": 1.1316266406913355e-05, "loss": 0.8024, "step": 23300 }, { "epoch": 0.47086047740020626, "grad_norm": 0.6261083483695984, "learning_rate": 1.1252295585928343e-05, "loss": 0.8054, "step": 23400 }, { "epoch": 0.47086047740020626, "eval_loss": 0.5444592833518982, "eval_runtime": 11.5945, "eval_samples_per_second": 32.602, "eval_steps_per_second": 1.121, "step": 23400 }, { "epoch": 0.4728727016626003, "grad_norm": 0.6763809323310852, "learning_rate": 1.1188272658232228e-05, "loss": 0.7952, "step": 23500 }, { "epoch": 0.4748849259249943, "grad_norm": 0.6690487265586853, "learning_rate": 1.1124200287752157e-05, "loss": 0.807, "step": 23600 }, { "epoch": 0.4768971501873884, "grad_norm": 0.5711999535560608, "learning_rate": 1.1060081140472519e-05, "loss": 0.8052, "step": 23700 }, { "epoch": 0.4768971501873884, "eval_loss": 0.5443876385688782, "eval_runtime": 11.4195, "eval_samples_per_second": 33.101, "eval_steps_per_second": 1.138, "step": 23700 }, { "epoch": 0.47890937444978243, "grad_norm": 0.6411765217781067, "learning_rate": 1.0995917884324056e-05, "loss": 0.7976, "step": 23800 }, { "epoch": 0.4809215987121765, "grad_norm": 0.5719566941261292, "learning_rate": 1.0931713189072827e-05, "loss": 0.7992, "step": 23900 }, { "epoch": 0.4829338229745705, "grad_norm": 0.5175074934959412, "learning_rate": 1.086746972620913e-05, "loss": 0.8009, "step": 24000 }, { "epoch": 0.4829338229745705, "eval_loss": 0.5424737334251404, "eval_runtime": 11.3763, "eval_samples_per_second": 33.227, "eval_steps_per_second": 1.143, "step": 24000 }, { "epoch": 0.48494604723696455, "grad_norm": 0.6476929783821106, "learning_rate": 1.0803190168836341e-05, "loss": 0.7984, "step": 24100 }, { "epoch": 0.4869582714993586, "grad_norm": 0.6742759943008423, "learning_rate": 1.0738877191559691e-05, "loss": 0.7989, "step": 24200 }, { "epoch": 0.48897049576175267, "grad_norm": 0.5645999908447266, "learning_rate": 1.067453347037498e-05, "loss": 0.7985, "step": 24300 }, { "epoch": 0.48897049576175267, "eval_loss": 0.5427749752998352, "eval_runtime": 11.4256, "eval_samples_per_second": 33.084, "eval_steps_per_second": 1.138, "step": 24300 }, { "epoch": 0.49098272002414667, "grad_norm": 0.5972943902015686, "learning_rate": 1.0610161682557225e-05, "loss": 0.7961, "step": 24400 }, { "epoch": 0.4929949442865407, "grad_norm": 0.6340279579162598, "learning_rate": 1.0545764506549273e-05, "loss": 0.8033, "step": 24500 }, { "epoch": 0.4950071685489348, "grad_norm": 0.6096486449241638, "learning_rate": 1.0481344621850347e-05, "loss": 0.7955, "step": 24600 }, { "epoch": 0.4950071685489348, "eval_loss": 0.5418882369995117, "eval_runtime": 11.4157, "eval_samples_per_second": 33.112, "eval_steps_per_second": 1.139, "step": 24600 }, { "epoch": 0.49701939281132884, "grad_norm": 0.5778651833534241, "learning_rate": 1.041690470890455e-05, "loss": 0.7954, "step": 24700 }, { "epoch": 0.49903161707372284, "grad_norm": 0.5838211178779602, "learning_rate": 1.0352447448989337e-05, "loss": 0.7854, "step": 24800 }, { "epoch": 0.5010438413361169, "grad_norm": 0.5919055342674255, "learning_rate": 1.0287975524103964e-05, "loss": 0.7925, "step": 24900 }, { "epoch": 0.5010438413361169, "eval_loss": 0.541851818561554, "eval_runtime": 11.2979, "eval_samples_per_second": 33.457, "eval_steps_per_second": 1.151, "step": 24900 }, { "epoch": 0.5030560655985109, "grad_norm": 0.5358749628067017, "learning_rate": 1.022349161685787e-05, "loss": 0.7986, "step": 25000 }, { "epoch": 0.505068289860905, "grad_norm": 0.6401896476745605, "learning_rate": 1.0158998410359074e-05, "loss": 0.7914, "step": 25100 }, { "epoch": 0.507080514123299, "grad_norm": 0.5817869901657104, "learning_rate": 1.0094498588102523e-05, "loss": 0.7956, "step": 25200 }, { "epoch": 0.507080514123299, "eval_loss": 0.5417122840881348, "eval_runtime": 11.503, "eval_samples_per_second": 32.861, "eval_steps_per_second": 1.13, "step": 25200 }, { "epoch": 0.5090927383856931, "grad_norm": 0.5595591068267822, "learning_rate": 1.0029994833858438e-05, "loss": 0.7943, "step": 25300 }, { "epoch": 0.5111049626480871, "grad_norm": 0.5861169099807739, "learning_rate": 9.965489831560652e-06, "loss": 0.8006, "step": 25400 }, { "epoch": 0.5131171869104811, "grad_norm": 0.5644922852516174, "learning_rate": 9.900986265194924e-06, "loss": 0.7868, "step": 25500 }, { "epoch": 0.5131171869104811, "eval_loss": 0.5409750938415527, "eval_runtime": 11.3254, "eval_samples_per_second": 33.376, "eval_steps_per_second": 1.148, "step": 25500 }, { "epoch": 0.5151294111728753, "grad_norm": 0.5210478901863098, "learning_rate": 9.836486818687262e-06, "loss": 0.7967, "step": 25600 }, { "epoch": 0.5171416354352693, "grad_norm": 0.5937855839729309, "learning_rate": 9.771994175792262e-06, "loss": 0.7839, "step": 25700 }, { "epoch": 0.5191538596976633, "grad_norm": 0.68199622631073, "learning_rate": 9.707511019981416e-06, "loss": 0.7929, "step": 25800 }, { "epoch": 0.5191538596976633, "eval_loss": 0.53957599401474, "eval_runtime": 11.2847, "eval_samples_per_second": 33.497, "eval_steps_per_second": 1.152, "step": 25800 }, { "epoch": 0.5211660839600574, "grad_norm": 0.6363146305084229, "learning_rate": 9.643040034331475e-06, "loss": 0.7893, "step": 25900 }, { "epoch": 0.5231783082224514, "grad_norm": 0.6275014877319336, "learning_rate": 9.578583901412802e-06, "loss": 0.7883, "step": 26000 }, { "epoch": 0.5251905324848455, "grad_norm": 0.5840523838996887, "learning_rate": 9.514145303177751e-06, "loss": 0.7961, "step": 26100 }, { "epoch": 0.5251905324848455, "eval_loss": 0.5387553572654724, "eval_runtime": 11.2936, "eval_samples_per_second": 33.47, "eval_steps_per_second": 1.151, "step": 26100 }, { "epoch": 0.5272027567472395, "grad_norm": 0.706901490688324, "learning_rate": 9.449726920849085e-06, "loss": 0.795, "step": 26200 }, { "epoch": 0.5292149810096335, "grad_norm": 0.5236905813217163, "learning_rate": 9.385331434808386e-06, "loss": 0.7919, "step": 26300 }, { "epoch": 0.5312272052720276, "grad_norm": 0.6014547348022461, "learning_rate": 9.320961524484565e-06, "loss": 0.7917, "step": 26400 }, { "epoch": 0.5312272052720276, "eval_loss": 0.5388390421867371, "eval_runtime": 11.3827, "eval_samples_per_second": 33.208, "eval_steps_per_second": 1.142, "step": 26400 }, { "epoch": 0.5332394295344216, "grad_norm": 0.5613085031509399, "learning_rate": 9.256619868242341e-06, "loss": 0.7957, "step": 26500 }, { "epoch": 0.5352516537968156, "grad_norm": 0.6822344064712524, "learning_rate": 9.192309143270818e-06, "loss": 0.7867, "step": 26600 }, { "epoch": 0.5372638780592097, "grad_norm": 0.6041319370269775, "learning_rate": 9.128032025472077e-06, "loss": 0.7884, "step": 26700 }, { "epoch": 0.5372638780592097, "eval_loss": 0.5368719696998596, "eval_runtime": 11.3484, "eval_samples_per_second": 33.309, "eval_steps_per_second": 1.146, "step": 26700 }, { "epoch": 0.5392761023216037, "grad_norm": 0.644088089466095, "learning_rate": 9.063791189349841e-06, "loss": 0.7867, "step": 26800 }, { "epoch": 0.5412883265839978, "grad_norm": 0.627928614616394, "learning_rate": 8.999589307898192e-06, "loss": 0.7896, "step": 26900 }, { "epoch": 0.5433005508463918, "grad_norm": 0.6207029819488525, "learning_rate": 8.935429052490347e-06, "loss": 0.7853, "step": 27000 }, { "epoch": 0.5433005508463918, "eval_loss": 0.5371023416519165, "eval_runtime": 11.3461, "eval_samples_per_second": 33.316, "eval_steps_per_second": 1.146, "step": 27000 }, { "epoch": 0.5453127751087858, "grad_norm": 0.541533887386322, "learning_rate": 8.87131309276751e-06, "loss": 0.7916, "step": 27100 }, { "epoch": 0.54732499937118, "grad_norm": 0.590813934803009, "learning_rate": 8.807244096527783e-06, "loss": 0.7948, "step": 27200 }, { "epoch": 0.549337223633574, "grad_norm": 0.584229588508606, "learning_rate": 8.743224729615168e-06, "loss": 0.7918, "step": 27300 }, { "epoch": 0.549337223633574, "eval_loss": 0.5366615653038025, "eval_runtime": 11.3157, "eval_samples_per_second": 33.405, "eval_steps_per_second": 1.149, "step": 27300 }, { "epoch": 0.551349447895968, "grad_norm": 0.6746295094490051, "learning_rate": 8.679257655808645e-06, "loss": 0.7911, "step": 27400 }, { "epoch": 0.5533616721583621, "grad_norm": 0.6765587329864502, "learning_rate": 8.615345536711331e-06, "loss": 0.7906, "step": 27500 }, { "epoch": 0.5553738964207561, "grad_norm": 0.5838325619697571, "learning_rate": 8.551491031639736e-06, "loss": 0.7937, "step": 27600 }, { "epoch": 0.5553738964207561, "eval_loss": 0.5361348390579224, "eval_runtime": 11.3123, "eval_samples_per_second": 33.415, "eval_steps_per_second": 1.149, "step": 27600 }, { "epoch": 0.5573861206831502, "grad_norm": 0.6001378893852234, "learning_rate": 8.487696797513108e-06, "loss": 0.7777, "step": 27700 }, { "epoch": 0.5593983449455442, "grad_norm": 0.5667701363563538, "learning_rate": 8.423965488742885e-06, "loss": 0.7856, "step": 27800 }, { "epoch": 0.5614105692079382, "grad_norm": 0.632291316986084, "learning_rate": 8.360299757122247e-06, "loss": 0.7792, "step": 27900 }, { "epoch": 0.5614105692079382, "eval_loss": 0.5353109240531921, "eval_runtime": 11.3749, "eval_samples_per_second": 33.231, "eval_steps_per_second": 1.143, "step": 27900 }, { "epoch": 0.5634227934703323, "grad_norm": 0.5472155213356018, "learning_rate": 8.296702251715778e-06, "loss": 0.7831, "step": 28000 }, { "epoch": 0.5654350177327263, "grad_norm": 0.590352475643158, "learning_rate": 8.233175618749243e-06, "loss": 0.7833, "step": 28100 }, { "epoch": 0.5674472419951203, "grad_norm": 0.5392365455627441, "learning_rate": 8.16972250149947e-06, "loss": 0.7846, "step": 28200 }, { "epoch": 0.5674472419951203, "eval_loss": 0.5345659852027893, "eval_runtime": 11.3797, "eval_samples_per_second": 33.217, "eval_steps_per_second": 1.142, "step": 28200 }, { "epoch": 0.5694594662575144, "grad_norm": 0.5367996692657471, "learning_rate": 8.106345540184382e-06, "loss": 0.7881, "step": 28300 }, { "epoch": 0.5714716905199084, "grad_norm": 0.7017585039138794, "learning_rate": 8.043047371853135e-06, "loss": 0.7902, "step": 28400 }, { "epoch": 0.5734839147823025, "grad_norm": 0.6775383353233337, "learning_rate": 7.979830630276384e-06, "loss": 0.795, "step": 28500 }, { "epoch": 0.5734839147823025, "eval_loss": 0.5349369645118713, "eval_runtime": 11.3477, "eval_samples_per_second": 33.311, "eval_steps_per_second": 1.146, "step": 28500 }, { "epoch": 0.5754961390446965, "grad_norm": 0.5782616138458252, "learning_rate": 7.91669794583671e-06, "loss": 0.7902, "step": 28600 }, { "epoch": 0.5775083633070905, "grad_norm": 0.5419892072677612, "learning_rate": 7.853651945419155e-06, "loss": 0.7858, "step": 28700 }, { "epoch": 0.5795205875694847, "grad_norm": 0.6611707210540771, "learning_rate": 7.790695252301938e-06, "loss": 0.7894, "step": 28800 }, { "epoch": 0.5795205875694847, "eval_loss": 0.5343945026397705, "eval_runtime": 11.4492, "eval_samples_per_second": 33.015, "eval_steps_per_second": 1.135, "step": 28800 }, { "epoch": 0.5815328118318787, "grad_norm": 0.5788918137550354, "learning_rate": 7.727830486047288e-06, "loss": 0.7868, "step": 28900 }, { "epoch": 0.5835450360942727, "grad_norm": 0.5480091571807861, "learning_rate": 7.665060262392461e-06, "loss": 0.7858, "step": 29000 }, { "epoch": 0.5855572603566668, "grad_norm": 0.730056881904602, "learning_rate": 7.602387193140887e-06, "loss": 0.7884, "step": 29100 }, { "epoch": 0.5855572603566668, "eval_loss": 0.5339014530181885, "eval_runtime": 11.3802, "eval_samples_per_second": 33.216, "eval_steps_per_second": 1.142, "step": 29100 }, { "epoch": 0.5875694846190608, "grad_norm": 0.5774337649345398, "learning_rate": 7.539813886053502e-06, "loss": 0.7893, "step": 29200 }, { "epoch": 0.5895817088814549, "grad_norm": 0.615470290184021, "learning_rate": 7.477342944740249e-06, "loss": 0.7817, "step": 29300 }, { "epoch": 0.5915939331438489, "grad_norm": 0.6776989698410034, "learning_rate": 7.414976968551735e-06, "loss": 0.7783, "step": 29400 }, { "epoch": 0.5915939331438489, "eval_loss": 0.533939003944397, "eval_runtime": 11.3711, "eval_samples_per_second": 33.242, "eval_steps_per_second": 1.143, "step": 29400 }, { "epoch": 0.5936061574062429, "grad_norm": 0.5885875821113586, "learning_rate": 7.352718552471077e-06, "loss": 0.784, "step": 29500 }, { "epoch": 0.595618381668637, "grad_norm": 0.5772850513458252, "learning_rate": 7.290570287005931e-06, "loss": 0.7819, "step": 29600 }, { "epoch": 0.597630605931031, "grad_norm": 0.6122897863388062, "learning_rate": 7.228534758080694e-06, "loss": 0.7891, "step": 29700 }, { "epoch": 0.597630605931031, "eval_loss": 0.5327485799789429, "eval_runtime": 11.3326, "eval_samples_per_second": 33.355, "eval_steps_per_second": 1.147, "step": 29700 }, { "epoch": 0.599642830193425, "grad_norm": 0.6210538148880005, "learning_rate": 7.1666145469289226e-06, "loss": 0.7832, "step": 29800 }, { "epoch": 0.6016550544558191, "grad_norm": 0.593087911605835, "learning_rate": 7.1048122299859145e-06, "loss": 0.7888, "step": 29900 }, { "epoch": 0.6036672787182131, "grad_norm": 0.5805263519287109, "learning_rate": 7.043130378781516e-06, "loss": 0.7825, "step": 30000 }, { "epoch": 0.6036672787182131, "eval_loss": 0.5322030782699585, "eval_runtime": 11.3763, "eval_samples_per_second": 33.227, "eval_steps_per_second": 1.143, "step": 30000 }, { "epoch": 0.6056795029806072, "grad_norm": 0.5463854074478149, "learning_rate": 6.981571559833122e-06, "loss": 0.7881, "step": 30100 }, { "epoch": 0.6076917272430012, "grad_norm": 0.5730445384979248, "learning_rate": 6.920138334538878e-06, "loss": 0.7858, "step": 30200 }, { "epoch": 0.6097039515053952, "grad_norm": 0.5871597528457642, "learning_rate": 6.858833259071108e-06, "loss": 0.7777, "step": 30300 }, { "epoch": 0.6097039515053952, "eval_loss": 0.5328507423400879, "eval_runtime": 11.3806, "eval_samples_per_second": 33.215, "eval_steps_per_second": 1.142, "step": 30300 }, { "epoch": 0.6117161757677894, "grad_norm": 0.6252338290214539, "learning_rate": 6.797658884269962e-06, "loss": 0.778, "step": 30400 }, { "epoch": 0.6137284000301834, "grad_norm": 0.588524580001831, "learning_rate": 6.736617755537267e-06, "loss": 0.7772, "step": 30500 }, { "epoch": 0.6157406242925774, "grad_norm": 0.621525228023529, "learning_rate": 6.675712412730625e-06, "loss": 0.7832, "step": 30600 }, { "epoch": 0.6157406242925774, "eval_loss": 0.5325730443000793, "eval_runtime": 11.3314, "eval_samples_per_second": 33.359, "eval_steps_per_second": 1.147, "step": 30600 }, { "epoch": 0.6177528485549715, "grad_norm": 0.5612871646881104, "learning_rate": 6.614945390057723e-06, "loss": 0.7831, "step": 30700 }, { "epoch": 0.6197650728173655, "grad_norm": 0.5247837901115417, "learning_rate": 6.554319215970895e-06, "loss": 0.7828, "step": 30800 }, { "epoch": 0.6217772970797596, "grad_norm": 0.5758721232414246, "learning_rate": 6.493836413061907e-06, "loss": 0.781, "step": 30900 }, { "epoch": 0.6217772970797596, "eval_loss": 0.5314515829086304, "eval_runtime": 11.3823, "eval_samples_per_second": 33.21, "eval_steps_per_second": 1.142, "step": 30900 }, { "epoch": 0.6237895213421536, "grad_norm": 0.7134236693382263, "learning_rate": 6.433499497957006e-06, "loss": 0.7852, "step": 31000 }, { "epoch": 0.6258017456045476, "grad_norm": 0.5432785153388977, "learning_rate": 6.373310981212197e-06, "loss": 0.7776, "step": 31100 }, { "epoch": 0.6278139698669417, "grad_norm": 0.6110942959785461, "learning_rate": 6.3132733672087875e-06, "loss": 0.787, "step": 31200 }, { "epoch": 0.6278139698669417, "eval_loss": 0.5303037166595459, "eval_runtime": 11.4219, "eval_samples_per_second": 33.094, "eval_steps_per_second": 1.138, "step": 31200 }, { "epoch": 0.6298261941293357, "grad_norm": 0.5783369541168213, "learning_rate": 6.253389154049177e-06, "loss": 0.7807, "step": 31300 }, { "epoch": 0.6318384183917297, "grad_norm": 0.5356603860855103, "learning_rate": 6.19366083345291e-06, "loss": 0.7801, "step": 31400 }, { "epoch": 0.6338506426541238, "grad_norm": 0.5529428124427795, "learning_rate": 6.134090890653015e-06, "loss": 0.7774, "step": 31500 }, { "epoch": 0.6338506426541238, "eval_loss": 0.5301904678344727, "eval_runtime": 11.4476, "eval_samples_per_second": 33.02, "eval_steps_per_second": 1.136, "step": 31500 }, { "epoch": 0.6358628669165178, "grad_norm": 0.5553627610206604, "learning_rate": 6.074681804292581e-06, "loss": 0.7791, "step": 31600 }, { "epoch": 0.6378750911789118, "grad_norm": 0.5281953811645508, "learning_rate": 6.0154360463216325e-06, "loss": 0.7769, "step": 31700 }, { "epoch": 0.639887315441306, "grad_norm": 0.6406475305557251, "learning_rate": 5.956356081894259e-06, "loss": 0.7799, "step": 31800 }, { "epoch": 0.639887315441306, "eval_loss": 0.5294053554534912, "eval_runtime": 11.3422, "eval_samples_per_second": 33.327, "eval_steps_per_second": 1.146, "step": 31800 }, { "epoch": 0.6418995397037, "grad_norm": 0.49855828285217285, "learning_rate": 5.897444369266066e-06, "loss": 0.7759, "step": 31900 }, { "epoch": 0.6439117639660941, "grad_norm": 0.5699638724327087, "learning_rate": 5.838703359691873e-06, "loss": 0.7673, "step": 32000 }, { "epoch": 0.6459239882284881, "grad_norm": 0.5306676030158997, "learning_rate": 5.780135497323724e-06, "loss": 0.7799, "step": 32100 }, { "epoch": 0.6459239882284881, "eval_loss": 0.5290261507034302, "eval_runtime": 11.3435, "eval_samples_per_second": 33.323, "eval_steps_per_second": 1.146, "step": 32100 }, { "epoch": 0.6479362124908821, "grad_norm": 0.5989037752151489, "learning_rate": 5.721743219109187e-06, "loss": 0.7757, "step": 32200 }, { "epoch": 0.6499484367532762, "grad_norm": 0.5595914721488953, "learning_rate": 5.663528954689958e-06, "loss": 0.7761, "step": 32300 }, { "epoch": 0.6519606610156702, "grad_norm": 0.5618345737457275, "learning_rate": 5.605495126300766e-06, "loss": 0.779, "step": 32400 }, { "epoch": 0.6519606610156702, "eval_loss": 0.529247522354126, "eval_runtime": 11.3716, "eval_samples_per_second": 33.241, "eval_steps_per_second": 1.143, "step": 32400 }, { "epoch": 0.6539728852780642, "grad_norm": 0.5271475315093994, "learning_rate": 5.547644148668585e-06, "loss": 0.7747, "step": 32500 }, { "epoch": 0.6559851095404583, "grad_norm": 0.5703973770141602, "learning_rate": 5.489978428912157e-06, "loss": 0.7801, "step": 32600 }, { "epoch": 0.6579973338028523, "grad_norm": 0.570797860622406, "learning_rate": 5.432500366441843e-06, "loss": 0.7756, "step": 32700 }, { "epoch": 0.6579973338028523, "eval_loss": 0.5275307893753052, "eval_runtime": 11.3412, "eval_samples_per_second": 33.33, "eval_steps_per_second": 1.146, "step": 32700 }, { "epoch": 0.6600095580652464, "grad_norm": 0.564414918422699, "learning_rate": 5.3752123528597746e-06, "loss": 0.7688, "step": 32800 }, { "epoch": 0.6620217823276404, "grad_norm": 0.5405446290969849, "learning_rate": 5.318116771860351e-06, "loss": 0.7777, "step": 32900 }, { "epoch": 0.6640340065900344, "grad_norm": 0.5645068883895874, "learning_rate": 5.261215999131055e-06, "loss": 0.7723, "step": 33000 }, { "epoch": 0.6640340065900344, "eval_loss": 0.5280060172080994, "eval_runtime": 11.3103, "eval_samples_per_second": 33.421, "eval_steps_per_second": 1.149, "step": 33000 }, { "epoch": 0.6660462308524285, "grad_norm": 0.5821409225463867, "learning_rate": 5.204512402253592e-06, "loss": 0.7857, "step": 33100 }, { "epoch": 0.6680584551148225, "grad_norm": 0.5534176230430603, "learning_rate": 5.148008340605393e-06, "loss": 0.7726, "step": 33200 }, { "epoch": 0.6700706793772165, "grad_norm": 0.5734113454818726, "learning_rate": 5.091706165261438e-06, "loss": 0.7806, "step": 33300 }, { "epoch": 0.6700706793772165, "eval_loss": 0.527226984500885, "eval_runtime": 11.3532, "eval_samples_per_second": 33.295, "eval_steps_per_second": 1.145, "step": 33300 }, { "epoch": 0.6720829036396107, "grad_norm": 0.5118337273597717, "learning_rate": 5.035608218896424e-06, "loss": 0.7794, "step": 33400 }, { "epoch": 0.6740951279020047, "grad_norm": 0.520524799823761, "learning_rate": 4.979716835687296e-06, "loss": 0.7833, "step": 33500 }, { "epoch": 0.6761073521643988, "grad_norm": 0.5260956883430481, "learning_rate": 4.924034341216123e-06, "loss": 0.7722, "step": 33600 }, { "epoch": 0.6761073521643988, "eval_loss": 0.5266076326370239, "eval_runtime": 11.3351, "eval_samples_per_second": 33.348, "eval_steps_per_second": 1.147, "step": 33600 }, { "epoch": 0.6781195764267928, "grad_norm": 0.5933238863945007, "learning_rate": 4.868563052373329e-06, "loss": 0.778, "step": 33700 }, { "epoch": 0.6801318006891868, "grad_norm": 0.5882487297058105, "learning_rate": 4.813305277261294e-06, "loss": 0.778, "step": 33800 }, { "epoch": 0.6821440249515809, "grad_norm": 0.5495398640632629, "learning_rate": 4.758263315098319e-06, "loss": 0.7749, "step": 33900 }, { "epoch": 0.6821440249515809, "eval_loss": 0.527021050453186, "eval_runtime": 11.3019, "eval_samples_per_second": 33.446, "eval_steps_per_second": 1.15, "step": 33900 }, { "epoch": 0.6841562492139749, "grad_norm": 0.5372888445854187, "learning_rate": 4.703439456122942e-06, "loss": 0.7726, "step": 34000 }, { "epoch": 0.6861684734763689, "grad_norm": 0.5453928709030151, "learning_rate": 4.648835981498665e-06, "loss": 0.7736, "step": 34100 }, { "epoch": 0.688180697738763, "grad_norm": 0.534249484539032, "learning_rate": 4.594455163219025e-06, "loss": 0.7669, "step": 34200 }, { "epoch": 0.688180697738763, "eval_loss": 0.5258325934410095, "eval_runtime": 11.3315, "eval_samples_per_second": 33.358, "eval_steps_per_second": 1.147, "step": 34200 }, { "epoch": 0.690192922001157, "grad_norm": 0.602557897567749, "learning_rate": 4.5402992640130615e-06, "loss": 0.7776, "step": 34300 }, { "epoch": 0.6922051462635511, "grad_norm": 0.6340908408164978, "learning_rate": 4.486370537251166e-06, "loss": 0.7724, "step": 34400 }, { "epoch": 0.6942173705259451, "grad_norm": 0.5442144870758057, "learning_rate": 4.43267122685132e-06, "loss": 0.7678, "step": 34500 }, { "epoch": 0.6942173705259451, "eval_loss": 0.52588951587677, "eval_runtime": 11.3113, "eval_samples_per_second": 33.418, "eval_steps_per_second": 1.149, "step": 34500 }, { "epoch": 0.6962295947883391, "grad_norm": 0.5438702702522278, "learning_rate": 4.379203567185733e-06, "loss": 0.7722, "step": 34600 }, { "epoch": 0.6982418190507332, "grad_norm": 0.575579822063446, "learning_rate": 4.325969782987868e-06, "loss": 0.7806, "step": 34700 }, { "epoch": 0.7002540433131272, "grad_norm": 0.53037029504776, "learning_rate": 4.2729720892598725e-06, "loss": 0.7677, "step": 34800 }, { "epoch": 0.7002540433131272, "eval_loss": 0.5252464413642883, "eval_runtime": 11.2976, "eval_samples_per_second": 33.458, "eval_steps_per_second": 1.151, "step": 34800 }, { "epoch": 0.7022662675755212, "grad_norm": 0.5570893883705139, "learning_rate": 4.220212691180422e-06, "loss": 0.7674, "step": 34900 }, { "epoch": 0.7042784918379154, "grad_norm": 0.564457893371582, "learning_rate": 4.167693784012948e-06, "loss": 0.7774, "step": 35000 }, { "epoch": 0.7062907161003094, "grad_norm": 0.6193362474441528, "learning_rate": 4.115417553014317e-06, "loss": 0.7739, "step": 35100 }, { "epoch": 0.7062907161003094, "eval_loss": 0.5251539349555969, "eval_runtime": 11.3037, "eval_samples_per_second": 33.44, "eval_steps_per_second": 1.15, "step": 35100 }, { "epoch": 0.7083029403627035, "grad_norm": 0.5650792121887207, "learning_rate": 4.063386173343888e-06, "loss": 0.775, "step": 35200 }, { "epoch": 0.7103151646250975, "grad_norm": 0.5598296523094177, "learning_rate": 4.0116018099730155e-06, "loss": 0.7736, "step": 35300 }, { "epoch": 0.7123273888874915, "grad_norm": 0.5999264717102051, "learning_rate": 3.960066617594962e-06, "loss": 0.7728, "step": 35400 }, { "epoch": 0.7123273888874915, "eval_loss": 0.5251903533935547, "eval_runtime": 11.3608, "eval_samples_per_second": 33.272, "eval_steps_per_second": 1.144, "step": 35400 }, { "epoch": 0.7143396131498856, "grad_norm": 0.5485169291496277, "learning_rate": 3.908782740535244e-06, "loss": 0.7663, "step": 35500 }, { "epoch": 0.7163518374122796, "grad_norm": 0.5973437428474426, "learning_rate": 3.857752312662413e-06, "loss": 0.7731, "step": 35600 }, { "epoch": 0.7183640616746736, "grad_norm": 0.559617280960083, "learning_rate": 3.8069774572992614e-06, "loss": 0.7623, "step": 35700 }, { "epoch": 0.7183640616746736, "eval_loss": 0.5247710347175598, "eval_runtime": 11.3529, "eval_samples_per_second": 33.296, "eval_steps_per_second": 1.145, "step": 35700 }, { "epoch": 0.7203762859370677, "grad_norm": 0.5565606355667114, "learning_rate": 3.756460287134479e-06, "loss": 0.7773, "step": 35800 }, { "epoch": 0.7223885101994617, "grad_norm": 0.5371571779251099, "learning_rate": 3.706202904134747e-06, "loss": 0.7761, "step": 35900 }, { "epoch": 0.7244007344618558, "grad_norm": 0.5425861477851868, "learning_rate": 3.6562073994572624e-06, "loss": 0.7775, "step": 36000 }, { "epoch": 0.7244007344618558, "eval_loss": 0.5243012309074402, "eval_runtime": 11.3858, "eval_samples_per_second": 33.199, "eval_steps_per_second": 1.142, "step": 36000 }, { "epoch": 0.7264129587242498, "grad_norm": 0.5546737909317017, "learning_rate": 3.6064758533627496e-06, "loss": 0.7712, "step": 36100 }, { "epoch": 0.7284251829866438, "grad_norm": 0.6678885221481323, "learning_rate": 3.55701033512889e-06, "loss": 0.769, "step": 36200 }, { "epoch": 0.7304374072490379, "grad_norm": 0.5747791528701782, "learning_rate": 3.5078129029642192e-06, "loss": 0.7671, "step": 36300 }, { "epoch": 0.7304374072490379, "eval_loss": 0.523876428604126, "eval_runtime": 11.3643, "eval_samples_per_second": 33.262, "eval_steps_per_second": 1.144, "step": 36300 }, { "epoch": 0.7324496315114319, "grad_norm": 0.6479108333587646, "learning_rate": 3.458885603922498e-06, "loss": 0.7678, "step": 36400 }, { "epoch": 0.734461855773826, "grad_norm": 0.5260623693466187, "learning_rate": 3.4102304738175264e-06, "loss": 0.7686, "step": 36500 }, { "epoch": 0.7364740800362201, "grad_norm": 0.5565561056137085, "learning_rate": 3.3618495371384384e-06, "loss": 0.7722, "step": 36600 }, { "epoch": 0.7364740800362201, "eval_loss": 0.5241602659225464, "eval_runtime": 11.2637, "eval_samples_per_second": 33.559, "eval_steps_per_second": 1.154, "step": 36600 }, { "epoch": 0.7384863042986141, "grad_norm": 0.5522435307502747, "learning_rate": 3.3137448069654687e-06, "loss": 0.7753, "step": 36700 }, { "epoch": 0.7404985285610082, "grad_norm": 0.5111953020095825, "learning_rate": 3.265918284886186e-06, "loss": 0.7739, "step": 36800 }, { "epoch": 0.7425107528234022, "grad_norm": 0.5280485153198242, "learning_rate": 3.2183719609122146e-06, "loss": 0.7626, "step": 36900 }, { "epoch": 0.7425107528234022, "eval_loss": 0.5227437615394592, "eval_runtime": 11.3194, "eval_samples_per_second": 33.394, "eval_steps_per_second": 1.148, "step": 36900 }, { "epoch": 0.7445229770857962, "grad_norm": 0.5183678865432739, "learning_rate": 3.171107813396418e-06, "loss": 0.7745, "step": 37000 }, { "epoch": 0.7465352013481903, "grad_norm": 0.5712314248085022, "learning_rate": 3.124127808950602e-06, "loss": 0.7711, "step": 37100 }, { "epoch": 0.7485474256105843, "grad_norm": 0.5488412380218506, "learning_rate": 3.0774339023636756e-06, "loss": 0.7689, "step": 37200 }, { "epoch": 0.7485474256105843, "eval_loss": 0.5230608582496643, "eval_runtime": 11.338, "eval_samples_per_second": 33.339, "eval_steps_per_second": 1.147, "step": 37200 }, { "epoch": 0.7505596498729783, "grad_norm": 0.5331023335456848, "learning_rate": 3.0310280365203102e-06, "loss": 0.7663, "step": 37300 }, { "epoch": 0.7525718741353724, "grad_norm": 0.5227448344230652, "learning_rate": 2.9849121423201054e-06, "loss": 0.7645, "step": 37400 }, { "epoch": 0.7545840983977664, "grad_norm": 0.5383438467979431, "learning_rate": 2.9390881385972445e-06, "loss": 0.7624, "step": 37500 }, { "epoch": 0.7545840983977664, "eval_loss": 0.5230525732040405, "eval_runtime": 11.3076, "eval_samples_per_second": 33.429, "eval_steps_per_second": 1.15, "step": 37500 }, { "epoch": 0.7565963226601605, "grad_norm": 0.5267183184623718, "learning_rate": 2.8935579320406504e-06, "loss": 0.7744, "step": 37600 }, { "epoch": 0.7586085469225545, "grad_norm": 0.5995730757713318, "learning_rate": 2.8483234171146544e-06, "loss": 0.77, "step": 37700 }, { "epoch": 0.7606207711849485, "grad_norm": 0.5342182517051697, "learning_rate": 2.803386475980171e-06, "loss": 0.772, "step": 37800 }, { "epoch": 0.7606207711849485, "eval_loss": 0.5222497582435608, "eval_runtime": 11.6813, "eval_samples_per_second": 32.36, "eval_steps_per_second": 1.113, "step": 37800 }, { "epoch": 0.7626329954473426, "grad_norm": 0.5149078965187073, "learning_rate": 2.758748978416369e-06, "loss": 0.7675, "step": 37900 }, { "epoch": 0.7646452197097366, "grad_norm": 0.5688450932502747, "learning_rate": 2.7144127817428965e-06, "loss": 0.7655, "step": 38000 }, { "epoch": 0.7666574439721306, "grad_norm": 0.5706648826599121, "learning_rate": 2.6703797307425792e-06, "loss": 0.7645, "step": 38100 }, { "epoch": 0.7666574439721306, "eval_loss": 0.5218858122825623, "eval_runtime": 11.6659, "eval_samples_per_second": 32.402, "eval_steps_per_second": 1.114, "step": 38100 }, { "epoch": 0.7686696682345248, "grad_norm": 0.5271847248077393, "learning_rate": 2.626651657584672e-06, "loss": 0.7699, "step": 38200 }, { "epoch": 0.7706818924969188, "grad_norm": 0.5311073064804077, "learning_rate": 2.5832303817486137e-06, "loss": 0.766, "step": 38300 }, { "epoch": 0.7726941167593129, "grad_norm": 0.5762016177177429, "learning_rate": 2.540117709948332e-06, "loss": 0.7612, "step": 38400 }, { "epoch": 0.7726941167593129, "eval_loss": 0.5214508175849915, "eval_runtime": 11.4525, "eval_samples_per_second": 33.006, "eval_steps_per_second": 1.135, "step": 38400 }, { "epoch": 0.7747063410217069, "grad_norm": 0.5659816861152649, "learning_rate": 2.497315436057064e-06, "loss": 0.7693, "step": 38500 }, { "epoch": 0.7767185652841009, "grad_norm": 0.530085563659668, "learning_rate": 2.4548253410327104e-06, "loss": 0.7598, "step": 38600 }, { "epoch": 0.778730789546495, "grad_norm": 0.624070405960083, "learning_rate": 2.412649192843739e-06, "loss": 0.7722, "step": 38700 }, { "epoch": 0.778730789546495, "eval_loss": 0.5214821100234985, "eval_runtime": 11.3194, "eval_samples_per_second": 33.394, "eval_steps_per_second": 1.148, "step": 38700 }, { "epoch": 0.780743013808889, "grad_norm": 0.5348799228668213, "learning_rate": 2.3707887463956146e-06, "loss": 0.7615, "step": 38800 }, { "epoch": 0.782755238071283, "grad_norm": 0.5490187406539917, "learning_rate": 2.3292457434577854e-06, "loss": 0.7714, "step": 38900 }, { "epoch": 0.7847674623336771, "grad_norm": 0.5568532943725586, "learning_rate": 2.2880219125912064e-06, "loss": 0.7604, "step": 39000 }, { "epoch": 0.7847674623336771, "eval_loss": 0.5214923620223999, "eval_runtime": 11.3214, "eval_samples_per_second": 33.388, "eval_steps_per_second": 1.148, "step": 39000 }, { "epoch": 0.7867796865960711, "grad_norm": 0.5511381030082703, "learning_rate": 2.2471189690764093e-06, "loss": 0.7644, "step": 39100 }, { "epoch": 0.7887919108584652, "grad_norm": 0.5425460338592529, "learning_rate": 2.2065386148421486e-06, "loss": 0.7633, "step": 39200 }, { "epoch": 0.7908041351208592, "grad_norm": 0.4867189824581146, "learning_rate": 2.1662825383945686e-06, "loss": 0.7674, "step": 39300 }, { "epoch": 0.7908041351208592, "eval_loss": 0.5209300518035889, "eval_runtime": 11.3182, "eval_samples_per_second": 33.397, "eval_steps_per_second": 1.149, "step": 39300 }, { "epoch": 0.7928163593832532, "grad_norm": 0.5154452919960022, "learning_rate": 2.1263524147469573e-06, "loss": 0.7663, "step": 39400 }, { "epoch": 0.7948285836456473, "grad_norm": 0.5264437198638916, "learning_rate": 2.0867499053500473e-06, "loss": 0.7642, "step": 39500 }, { "epoch": 0.7968408079080413, "grad_norm": 0.5303503274917603, "learning_rate": 2.047476658022881e-06, "loss": 0.7722, "step": 39600 }, { "epoch": 0.7968408079080413, "eval_loss": 0.5208966135978699, "eval_runtime": 11.3632, "eval_samples_per_second": 33.265, "eval_steps_per_second": 1.144, "step": 39600 }, { "epoch": 0.7988530321704354, "grad_norm": 0.5367266535758972, "learning_rate": 2.0085343068842546e-06, "loss": 0.753, "step": 39700 }, { "epoch": 0.8008652564328295, "grad_norm": 0.5081086754798889, "learning_rate": 1.9699244722847143e-06, "loss": 0.7571, "step": 39800 }, { "epoch": 0.8028774806952235, "grad_norm": 0.5019336938858032, "learning_rate": 1.9316487607391465e-06, "loss": 0.7723, "step": 39900 }, { "epoch": 0.8028774806952235, "eval_loss": 0.5206644535064697, "eval_runtime": 11.3602, "eval_samples_per_second": 33.274, "eval_steps_per_second": 1.144, "step": 39900 }, { "epoch": 0.8048897049576176, "grad_norm": 0.5184951424598694, "learning_rate": 1.893708764859924e-06, "loss": 0.7677, "step": 40000 }, { "epoch": 0.8069019292200116, "grad_norm": 0.5265465974807739, "learning_rate": 1.8561060632906369e-06, "loss": 0.7686, "step": 40100 }, { "epoch": 0.8089141534824056, "grad_norm": 0.5161654353141785, "learning_rate": 1.8188422206404165e-06, "loss": 0.769, "step": 40200 }, { "epoch": 0.8089141534824056, "eval_loss": 0.5201809406280518, "eval_runtime": 11.369, "eval_samples_per_second": 33.248, "eval_steps_per_second": 1.143, "step": 40200 }, { "epoch": 0.8109263777447997, "grad_norm": 0.5580165982246399, "learning_rate": 1.7819187874188293e-06, "loss": 0.7686, "step": 40300 }, { "epoch": 0.8129386020071937, "grad_norm": 0.5577532052993774, "learning_rate": 1.7453372999713557e-06, "loss": 0.7616, "step": 40400 }, { "epoch": 0.8149508262695877, "grad_norm": 0.5307947993278503, "learning_rate": 1.709099280415476e-06, "loss": 0.7705, "step": 40500 }, { "epoch": 0.8149508262695877, "eval_loss": 0.5200989842414856, "eval_runtime": 11.3357, "eval_samples_per_second": 33.346, "eval_steps_per_second": 1.147, "step": 40500 }, { "epoch": 0.8169630505319818, "grad_norm": 0.5261068940162659, "learning_rate": 1.6732062365773272e-06, "loss": 0.7674, "step": 40600 }, { "epoch": 0.8189752747943758, "grad_norm": 0.4946574568748474, "learning_rate": 1.6376596619289653e-06, "loss": 0.7654, "step": 40700 }, { "epoch": 0.8209874990567699, "grad_norm": 0.5491064786911011, "learning_rate": 1.6024610355262282e-06, "loss": 0.7695, "step": 40800 }, { "epoch": 0.8209874990567699, "eval_loss": 0.5198547840118408, "eval_runtime": 11.316, "eval_samples_per_second": 33.404, "eval_steps_per_second": 1.149, "step": 40800 }, { "epoch": 0.8229997233191639, "grad_norm": 0.5306958556175232, "learning_rate": 1.5676118219471891e-06, "loss": 0.7619, "step": 40900 }, { "epoch": 0.8250119475815579, "grad_norm": 0.5380471348762512, "learning_rate": 1.5331134712312235e-06, "loss": 0.767, "step": 41000 }, { "epoch": 0.827024171843952, "grad_norm": 0.5167573094367981, "learning_rate": 1.4989674188186598e-06, "loss": 0.7599, "step": 41100 }, { "epoch": 0.827024171843952, "eval_loss": 0.5196862816810608, "eval_runtime": 11.2973, "eval_samples_per_second": 33.459, "eval_steps_per_second": 1.151, "step": 41100 }, { "epoch": 0.829036396106346, "grad_norm": 0.5409244894981384, "learning_rate": 1.4651750854910685e-06, "loss": 0.7587, "step": 41200 }, { "epoch": 0.83104862036874, "grad_norm": 0.5431727170944214, "learning_rate": 1.4317378773121393e-06, "loss": 0.7579, "step": 41300 }, { "epoch": 0.8330608446311342, "grad_norm": 0.53000807762146, "learning_rate": 1.3986571855691744e-06, "loss": 0.7688, "step": 41400 }, { "epoch": 0.8330608446311342, "eval_loss": 0.5197826623916626, "eval_runtime": 11.3928, "eval_samples_per_second": 33.179, "eval_steps_per_second": 1.141, "step": 41400 }, { "epoch": 0.8350730688935282, "grad_norm": 0.5434339046478271, "learning_rate": 1.3659343867151975e-06, "loss": 0.7695, "step": 41500 }, { "epoch": 0.8370852931559222, "grad_norm": 0.5368450284004211, "learning_rate": 1.3335708423116856e-06, "loss": 0.7636, "step": 41600 }, { "epoch": 0.8390975174183163, "grad_norm": 0.5331200361251831, "learning_rate": 1.3015678989719116e-06, "loss": 0.7696, "step": 41700 }, { "epoch": 0.8390975174183163, "eval_loss": 0.519400954246521, "eval_runtime": 11.3064, "eval_samples_per_second": 33.432, "eval_steps_per_second": 1.15, "step": 41700 }, { "epoch": 0.8411097416807103, "grad_norm": 0.5858904123306274, "learning_rate": 1.2699268883049154e-06, "loss": 0.7648, "step": 41800 }, { "epoch": 0.8431219659431044, "grad_norm": 0.5302870273590088, "learning_rate": 1.2386491268600976e-06, "loss": 0.7553, "step": 41900 }, { "epoch": 0.8451341902054984, "grad_norm": 0.4971041679382324, "learning_rate": 1.2077359160724388e-06, "loss": 0.7655, "step": 42000 }, { "epoch": 0.8451341902054984, "eval_loss": 0.519396960735321, "eval_runtime": 11.3912, "eval_samples_per_second": 33.183, "eval_steps_per_second": 1.141, "step": 42000 }, { "epoch": 0.8471464144678924, "grad_norm": 0.5351930856704712, "learning_rate": 1.1771885422083418e-06, "loss": 0.7603, "step": 42100 }, { "epoch": 0.8491586387302865, "grad_norm": 0.4970718026161194, "learning_rate": 1.1470082763121227e-06, "loss": 0.7661, "step": 42200 }, { "epoch": 0.8511708629926805, "grad_norm": 0.5322678089141846, "learning_rate": 1.1171963741531178e-06, "loss": 0.7616, "step": 42300 }, { "epoch": 0.8511708629926805, "eval_loss": 0.5193082094192505, "eval_runtime": 11.3559, "eval_samples_per_second": 33.287, "eval_steps_per_second": 1.145, "step": 42300 }, { "epoch": 0.8531830872550745, "grad_norm": 0.5380090475082397, "learning_rate": 1.0877540761734317e-06, "loss": 0.7623, "step": 42400 }, { "epoch": 0.8551953115174686, "grad_norm": 0.5419859290122986, "learning_rate": 1.0586826074363277e-06, "loss": 0.761, "step": 42500 }, { "epoch": 0.8572075357798626, "grad_norm": 0.5447313189506531, "learning_rate": 1.0299831775752478e-06, "loss": 0.7635, "step": 42600 }, { "epoch": 0.8572075357798626, "eval_loss": 0.5189518332481384, "eval_runtime": 11.3146, "eval_samples_per_second": 33.408, "eval_steps_per_second": 1.149, "step": 42600 }, { "epoch": 0.8592197600422568, "grad_norm": 0.5054132342338562, "learning_rate": 1.0016569807434894e-06, "loss": 0.7553, "step": 42700 }, { "epoch": 0.8612319843046508, "grad_norm": 0.5626354217529297, "learning_rate": 9.737051955645104e-07, "loss": 0.76, "step": 42800 }, { "epoch": 0.8632442085670448, "grad_norm": 0.6139233112335205, "learning_rate": 9.461289850828936e-07, "loss": 0.7586, "step": 42900 }, { "epoch": 0.8632442085670448, "eval_loss": 0.5188504457473755, "eval_runtime": 11.3931, "eval_samples_per_second": 33.178, "eval_steps_per_second": 1.141, "step": 42900 }, { "epoch": 0.8652564328294389, "grad_norm": 0.5168823003768921, "learning_rate": 9.189294967159457e-07, "loss": 0.7569, "step": 43000 }, { "epoch": 0.8672686570918329, "grad_norm": 0.5103846192359924, "learning_rate": 8.921078622059643e-07, "loss": 0.7598, "step": 43100 }, { "epoch": 0.8692808813542269, "grad_norm": 0.5376741290092468, "learning_rate": 8.656651975731434e-07, "loss": 0.7687, "step": 43200 }, { "epoch": 0.8692808813542269, "eval_loss": 0.5187187790870667, "eval_runtime": 11.3132, "eval_samples_per_second": 33.412, "eval_steps_per_second": 1.149, "step": 43200 }, { "epoch": 0.871293105616621, "grad_norm": 0.5139674544334412, "learning_rate": 8.396026030691329e-07, "loss": 0.7543, "step": 43300 }, { "epoch": 0.873305329879015, "grad_norm": 0.4912608563899994, "learning_rate": 8.139211631312638e-07, "loss": 0.759, "step": 43400 }, { "epoch": 0.8753175541414091, "grad_norm": 0.5286913514137268, "learning_rate": 7.886219463374256e-07, "loss": 0.7579, "step": 43500 }, { "epoch": 0.8753175541414091, "eval_loss": 0.5185059905052185, "eval_runtime": 11.3249, "eval_samples_per_second": 33.378, "eval_steps_per_second": 1.148, "step": 43500 }, { "epoch": 0.8773297784038031, "grad_norm": 0.4960270822048187, "learning_rate": 7.637060053615963e-07, "loss": 0.7582, "step": 43600 }, { "epoch": 0.8793420026661971, "grad_norm": 0.5134163498878479, "learning_rate": 7.391743769300541e-07, "loss": 0.7624, "step": 43700 }, { "epoch": 0.8813542269285912, "grad_norm": 0.5594838857650757, "learning_rate": 7.150280817782296e-07, "loss": 0.7626, "step": 43800 }, { "epoch": 0.8813542269285912, "eval_loss": 0.5184139013290405, "eval_runtime": 11.3303, "eval_samples_per_second": 33.362, "eval_steps_per_second": 1.147, "step": 43800 }, { "epoch": 0.8833664511909852, "grad_norm": 0.523009717464447, "learning_rate": 6.912681246082409e-07, "loss": 0.7554, "step": 43900 }, { "epoch": 0.8853786754533792, "grad_norm": 0.50362229347229, "learning_rate": 6.678954940470806e-07, "loss": 0.758, "step": 44000 }, { "epoch": 0.8873908997157733, "grad_norm": 0.5441898107528687, "learning_rate": 6.449111626054927e-07, "loss": 0.7573, "step": 44100 }, { "epoch": 0.8873908997157733, "eval_loss": 0.5184325575828552, "eval_runtime": 11.3938, "eval_samples_per_second": 33.176, "eval_steps_per_second": 1.141, "step": 44100 }, { "epoch": 0.8894031239781673, "grad_norm": 0.520699679851532, "learning_rate": 6.223160866374967e-07, "loss": 0.7638, "step": 44200 }, { "epoch": 0.8914153482405615, "grad_norm": 0.4745332598686218, "learning_rate": 6.001112063005998e-07, "loss": 0.7577, "step": 44300 }, { "epoch": 0.8934275725029555, "grad_norm": 0.49645400047302246, "learning_rate": 5.782974455166767e-07, "loss": 0.7619, "step": 44400 }, { "epoch": 0.8934275725029555, "eval_loss": 0.518170952796936, "eval_runtime": 11.3133, "eval_samples_per_second": 33.412, "eval_steps_per_second": 1.149, "step": 44400 }, { "epoch": 0.8954397967653495, "grad_norm": 0.5159271955490112, "learning_rate": 5.568757119335244e-07, "loss": 0.7571, "step": 44500 }, { "epoch": 0.8974520210277436, "grad_norm": 0.5097435712814331, "learning_rate": 5.358468968871e-07, "loss": 0.7697, "step": 44600 }, { "epoch": 0.8994642452901376, "grad_norm": 0.5482389330863953, "learning_rate": 5.152118753644275e-07, "loss": 0.7682, "step": 44700 }, { "epoch": 0.8994642452901376, "eval_loss": 0.5181338787078857, "eval_runtime": 11.4656, "eval_samples_per_second": 32.968, "eval_steps_per_second": 1.134, "step": 44700 }, { "epoch": 0.9014764695525316, "grad_norm": 0.5253916382789612, "learning_rate": 4.949715059671978e-07, "loss": 0.7656, "step": 44800 }, { "epoch": 0.9034886938149257, "grad_norm": 0.4978592097759247, "learning_rate": 4.7512663087603826e-07, "loss": 0.7621, "step": 44900 }, { "epoch": 0.9055009180773197, "grad_norm": 0.5216113924980164, "learning_rate": 4.5567807581546664e-07, "loss": 0.7595, "step": 45000 }, { "epoch": 0.9055009180773197, "eval_loss": 0.5181112885475159, "eval_runtime": 11.5213, "eval_samples_per_second": 32.809, "eval_steps_per_second": 1.128, "step": 45000 }, { "epoch": 0.9075131423397138, "grad_norm": 0.5027504563331604, "learning_rate": 4.366266500195426e-07, "loss": 0.7588, "step": 45100 }, { "epoch": 0.9095253666021078, "grad_norm": 0.5365561842918396, "learning_rate": 4.1797314619819285e-07, "loss": 0.7612, "step": 45200 }, { "epoch": 0.9115375908645018, "grad_norm": 0.5316836833953857, "learning_rate": 3.997183405042238e-07, "loss": 0.7639, "step": 45300 }, { "epoch": 0.9115375908645018, "eval_loss": 0.5180224776268005, "eval_runtime": 11.5144, "eval_samples_per_second": 32.828, "eval_steps_per_second": 1.129, "step": 45300 }, { "epoch": 0.9135498151268959, "grad_norm": 0.5350984930992126, "learning_rate": 3.8186299250103085e-07, "loss": 0.7582, "step": 45400 }, { "epoch": 0.9155620393892899, "grad_norm": 0.5509154796600342, "learning_rate": 3.644078451309907e-07, "loss": 0.7686, "step": 45500 }, { "epoch": 0.9175742636516839, "grad_norm": 0.5419358611106873, "learning_rate": 3.47353624684551e-07, "loss": 0.762, "step": 45600 }, { "epoch": 0.9175742636516839, "eval_loss": 0.5179212689399719, "eval_runtime": 11.4423, "eval_samples_per_second": 33.035, "eval_steps_per_second": 1.136, "step": 45600 }, { "epoch": 0.919586487914078, "grad_norm": 0.5258903503417969, "learning_rate": 3.307010407700084e-07, "loss": 0.7598, "step": 45700 }, { "epoch": 0.921598712176472, "grad_norm": 0.519910454750061, "learning_rate": 3.1445078628398294e-07, "loss": 0.7589, "step": 45800 }, { "epoch": 0.9236109364388662, "grad_norm": 0.5140842795372009, "learning_rate": 2.986035373825902e-07, "loss": 0.762, "step": 45900 }, { "epoch": 0.9236109364388662, "eval_loss": 0.5178348422050476, "eval_runtime": 11.4694, "eval_samples_per_second": 32.957, "eval_steps_per_second": 1.133, "step": 45900 }, { "epoch": 0.9256231607012602, "grad_norm": 0.5274850726127625, "learning_rate": 2.8315995345329804e-07, "loss": 0.758, "step": 46000 }, { "epoch": 0.9276353849636542, "grad_norm": 0.5443992018699646, "learning_rate": 2.681206770875022e-07, "loss": 0.7614, "step": 46100 }, { "epoch": 0.9296476092260483, "grad_norm": 0.5250468254089355, "learning_rate": 2.5348633405378296e-07, "loss": 0.7666, "step": 46200 }, { "epoch": 0.9296476092260483, "eval_loss": 0.5178038477897644, "eval_runtime": 11.6986, "eval_samples_per_second": 32.311, "eval_steps_per_second": 1.111, "step": 46200 }, { "epoch": 0.9316598334884423, "grad_norm": 0.5096211433410645, "learning_rate": 2.392575332718627e-07, "loss": 0.7697, "step": 46300 }, { "epoch": 0.9336720577508363, "grad_norm": 0.549790620803833, "learning_rate": 2.2543486678727855e-07, "loss": 0.7676, "step": 46400 }, { "epoch": 0.9356842820132304, "grad_norm": 0.524726152420044, "learning_rate": 2.120189097467451e-07, "loss": 0.7673, "step": 46500 }, { "epoch": 0.9356842820132304, "eval_loss": 0.5176617503166199, "eval_runtime": 11.5673, "eval_samples_per_second": 32.678, "eval_steps_per_second": 1.124, "step": 46500 }, { "epoch": 0.9376965062756244, "grad_norm": 0.5029181838035583, "learning_rate": 1.9901022037421723e-07, "loss": 0.7642, "step": 46600 }, { "epoch": 0.9397087305380185, "grad_norm": 0.5207979679107666, "learning_rate": 1.8640933994767073e-07, "loss": 0.7592, "step": 46700 }, { "epoch": 0.9417209548004125, "grad_norm": 0.5468851923942566, "learning_rate": 1.74216792776577e-07, "loss": 0.7631, "step": 46800 }, { "epoch": 0.9417209548004125, "eval_loss": 0.5177092552185059, "eval_runtime": 11.4559, "eval_samples_per_second": 32.996, "eval_steps_per_second": 1.135, "step": 46800 }, { "epoch": 0.9437331790628065, "grad_norm": 0.5044853091239929, "learning_rate": 1.62433086180086e-07, "loss": 0.7644, "step": 46900 }, { "epoch": 0.9457454033252006, "grad_norm": 0.5245229005813599, "learning_rate": 1.5105871046592e-07, "loss": 0.7605, "step": 47000 }, { "epoch": 0.9477576275875946, "grad_norm": 0.49839621782302856, "learning_rate": 1.400941389099697e-07, "loss": 0.7565, "step": 47100 }, { "epoch": 0.9477576275875946, "eval_loss": 0.5176432132720947, "eval_runtime": 11.5662, "eval_samples_per_second": 32.681, "eval_steps_per_second": 1.124, "step": 47100 }, { "epoch": 0.9497698518499886, "grad_norm": 0.4973909556865692, "learning_rate": 1.2953982773660223e-07, "loss": 0.7656, "step": 47200 }, { "epoch": 0.9517820761123827, "grad_norm": 0.5007102489471436, "learning_rate": 1.1939621609968088e-07, "loss": 0.7506, "step": 47300 }, { "epoch": 0.9537943003747767, "grad_norm": 0.49358874559402466, "learning_rate": 1.0966372606428855e-07, "loss": 0.7562, "step": 47400 }, { "epoch": 0.9537943003747767, "eval_loss": 0.5176478624343872, "eval_runtime": 11.3727, "eval_samples_per_second": 33.237, "eval_steps_per_second": 1.143, "step": 47400 }, { "epoch": 0.9558065246371709, "grad_norm": 0.5771644115447998, "learning_rate": 1.0034276258916953e-07, "loss": 0.766, "step": 47500 }, { "epoch": 0.9578187488995649, "grad_norm": 0.5385919213294983, "learning_rate": 9.14337135098764e-08, "loss": 0.7605, "step": 47600 }, { "epoch": 0.9598309731619589, "grad_norm": 0.5119192004203796, "learning_rate": 8.293694952263286e-08, "loss": 0.757, "step": 47700 }, { "epoch": 0.9598309731619589, "eval_loss": 0.5176236033439636, "eval_runtime": 11.3818, "eval_samples_per_second": 33.211, "eval_steps_per_second": 1.142, "step": 47700 }, { "epoch": 0.961843197424353, "grad_norm": 0.5380053520202637, "learning_rate": 7.485282416891393e-08, "loss": 0.7574, "step": 47800 }, { "epoch": 0.963855421686747, "grad_norm": 0.5267532467842102, "learning_rate": 6.718167382072983e-08, "loss": 0.7668, "step": 47900 }, { "epoch": 0.965867645949141, "grad_norm": 0.5199303030967712, "learning_rate": 5.99238176666328e-08, "loss": 0.756, "step": 48000 }, { "epoch": 0.965867645949141, "eval_loss": 0.5175907015800476, "eval_runtime": 11.4752, "eval_samples_per_second": 32.941, "eval_steps_per_second": 1.133, "step": 48000 }, { "epoch": 0.9678798702115351, "grad_norm": 0.5405638217926025, "learning_rate": 5.307955769843443e-08, "loss": 0.7612, "step": 48100 }, { "epoch": 0.9698920944739291, "grad_norm": 0.47063717246055603, "learning_rate": 4.664917869864338e-08, "loss": 0.7667, "step": 48200 }, { "epoch": 0.9719043187363232, "grad_norm": 0.48465442657470703, "learning_rate": 4.063294822861163e-08, "loss": 0.7605, "step": 48300 }, { "epoch": 0.9719043187363232, "eval_loss": 0.5175836682319641, "eval_runtime": 11.3838, "eval_samples_per_second": 33.205, "eval_steps_per_second": 1.142, "step": 48300 }, { "epoch": 0.9739165429987172, "grad_norm": 0.48423367738723755, "learning_rate": 3.5031116617404435e-08, "loss": 0.7574, "step": 48400 }, { "epoch": 0.9759287672611112, "grad_norm": 0.5320655107498169, "learning_rate": 2.9843916951382e-08, "loss": 0.767, "step": 48500 }, { "epoch": 0.9779409915235053, "grad_norm": 0.5267395377159119, "learning_rate": 2.5071565064506143e-08, "loss": 0.7593, "step": 48600 }, { "epoch": 0.9779409915235053, "eval_loss": 0.5175591707229614, "eval_runtime": 11.3501, "eval_samples_per_second": 33.304, "eval_steps_per_second": 1.145, "step": 48600 }, { "epoch": 0.9799532157858993, "grad_norm": 0.514837920665741, "learning_rate": 2.071425952934969e-08, "loss": 0.7641, "step": 48700 }, { "epoch": 0.9819654400482933, "grad_norm": 0.5345449447631836, "learning_rate": 1.677218164884753e-08, "loss": 0.7685, "step": 48800 }, { "epoch": 0.9839776643106874, "grad_norm": 0.5339971780776978, "learning_rate": 1.3245495448739321e-08, "loss": 0.7612, "step": 48900 }, { "epoch": 0.9839776643106874, "eval_loss": 0.5175919532775879, "eval_runtime": 11.3829, "eval_samples_per_second": 33.208, "eval_steps_per_second": 1.142, "step": 48900 }, { "epoch": 0.9859898885730815, "grad_norm": 0.49889686703681946, "learning_rate": 1.013434767075605e-08, "loss": 0.7692, "step": 49000 }, { "epoch": 0.9880021128354756, "grad_norm": 0.5119482278823853, "learning_rate": 7.438867766504931e-09, "loss": 0.7578, "step": 49100 }, { "epoch": 0.9900143370978696, "grad_norm": 0.5316244959831238, "learning_rate": 5.159167892089256e-09, "loss": 0.7568, "step": 49200 }, { "epoch": 0.9900143370978696, "eval_loss": 0.5176030993461609, "eval_runtime": 11.4046, "eval_samples_per_second": 33.145, "eval_steps_per_second": 1.14, "step": 49200 }, { "epoch": 0.9920265613602636, "grad_norm": 7.261257648468018, "learning_rate": 3.2953429034399133e-09, "loss": 0.7576, "step": 49300 }, { "epoch": 0.9940387856226577, "grad_norm": 0.48430758714675903, "learning_rate": 1.847470352367431e-09, "loss": 0.7577, "step": 49400 }, { "epoch": 0.9960510098850517, "grad_norm": 0.4918181598186493, "learning_rate": 8.156104833345613e-10, "loss": 0.7649, "step": 49500 }, { "epoch": 0.9960510098850517, "eval_loss": 0.5175663232803345, "eval_runtime": 11.4598, "eval_samples_per_second": 32.985, "eval_steps_per_second": 1.134, "step": 49500 }, { "epoch": 0.9980632341474457, "grad_norm": 0.5409220457077026, "learning_rate": 1.9980623095494645e-10, "loss": 0.7531, "step": 49600 } ], "logging_steps": 100, "max_steps": 49697, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.166027593741658e+19, "train_batch_size": 10, "trial_name": null, "trial_params": null }