diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,2753 +2,1973 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7777777777777778, + "epoch": 0.7142857142857143, "eval_steps": 1000, - "global_step": 35000, + "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 2.2222222222222223e-05, - "grad_norm": 1.0980026721954346, + "epoch": 2.857142857142857e-05, + "grad_norm": 1.1264785528182983, "learning_rate": 0.0, - "loss": 1.6613, + "loss": 1.4622, "step": 1 }, { - "epoch": 0.0022222222222222222, - "grad_norm": 1.037724256515503, - "learning_rate": 1.1e-06, - "loss": 1.5146, + "epoch": 0.002857142857142857, + "grad_norm": 1.0415701866149902, + "learning_rate": 1.4142857142857144e-06, + "loss": 1.4319, "step": 100 }, { - "epoch": 0.0044444444444444444, - "grad_norm": 1.0775114297866821, - "learning_rate": 2.2111111111111113e-06, - "loss": 1.5285, + "epoch": 0.005714285714285714, + "grad_norm": 1.1746091842651367, + "learning_rate": 2.8428571428571432e-06, + "loss": 1.4189, "step": 200 }, { - "epoch": 0.006666666666666667, - "grad_norm": 1.065529227256775, - "learning_rate": 3.322222222222222e-06, - "loss": 1.5211, + "epoch": 0.008571428571428572, + "grad_norm": 1.1301525831222534, + "learning_rate": 4.271428571428572e-06, + "loss": 1.4293, "step": 300 }, { - "epoch": 0.008888888888888889, - "grad_norm": 1.0316686630249023, - "learning_rate": 4.433333333333334e-06, - "loss": 1.502, + "epoch": 0.011428571428571429, + "grad_norm": 1.0607796907424927, + "learning_rate": 4.9999753285470756e-06, + "loss": 1.4205, "step": 400 }, { - "epoch": 0.011111111111111112, - "grad_norm": 1.034487247467041, - "learning_rate": 4.999985075284212e-06, - "loss": 1.5011, + "epoch": 0.014285714285714285, + "grad_norm": 1.1491715908050537, + "learning_rate": 4.999771876927458e-06, + "loss": 1.4197, "step": 500 }, { - "epoch": 0.013333333333333334, - "grad_norm": 1.0352669954299927, - "learning_rate": 4.999861998793635e-06, - "loss": 1.5045, + "epoch": 0.017142857142857144, + "grad_norm": 1.0873078107833862, + "learning_rate": 4.999362935318198e-06, + "loss": 1.4364, "step": 600 }, { - "epoch": 0.015555555555555555, - "grad_norm": 1.096940517425537, - "learning_rate": 4.999614608725878e-06, - "loss": 1.5074, + "epoch": 0.02, + "grad_norm": 1.0659881830215454, + "learning_rate": 4.998748537335728e-06, + "loss": 1.4507, "step": 700 }, { - "epoch": 0.017777777777777778, - "grad_norm": 1.0137542486190796, - "learning_rate": 4.999242917383197e-06, - "loss": 1.525, + "epoch": 0.022857142857142857, + "grad_norm": 1.1764490604400635, + "learning_rate": 4.99792873348571e-06, + "loss": 1.4398, "step": 800 }, { - "epoch": 0.02, - "grad_norm": 1.089389443397522, - "learning_rate": 4.998746943249126e-06, - "loss": 1.5126, + "epoch": 0.025714285714285714, + "grad_norm": 1.0576765537261963, + "learning_rate": 4.996903591158886e-06, + "loss": 1.4203, "step": 900 }, { - "epoch": 0.022222222222222223, - "grad_norm": 1.0824161767959595, - "learning_rate": 4.998126710987552e-06, - "loss": 1.5018, + "epoch": 0.02857142857142857, + "grad_norm": 1.111843228340149, + "learning_rate": 4.995673194625541e-06, + "loss": 1.4203, "step": 1000 }, { - "epoch": 0.022222222222222223, - "eval_loss": 1.5183053016662598, - "eval_runtime": 103.5736, - "eval_samples_per_second": 132.225, - "eval_steps_per_second": 2.066, + "epoch": 0.02857142857142857, + "eval_loss": 1.4505008459091187, + "eval_runtime": 103.009, + "eval_samples_per_second": 132.95, + "eval_steps_per_second": 2.077, "step": 1000 }, { - "epoch": 0.024444444444444446, - "grad_norm": 0.9782128930091858, - "learning_rate": 4.997382251441495e-06, - "loss": 1.4755, + "epoch": 0.03142857142857143, + "grad_norm": 1.037828803062439, + "learning_rate": 4.994237645028573e-06, + "loss": 1.443, "step": 1100 }, { - "epoch": 0.02666666666666667, - "grad_norm": 1.07709538936615, - "learning_rate": 4.996513601631571e-06, - "loss": 1.4818, + "epoch": 0.03428571428571429, + "grad_norm": 1.1225452423095703, + "learning_rate": 4.992597060375177e-06, + "loss": 1.4519, "step": 1200 }, { - "epoch": 0.028888888888888888, - "grad_norm": 1.0353506803512573, - "learning_rate": 4.99552080475415e-06, - "loss": 1.4891, + "epoch": 0.037142857142857144, + "grad_norm": 1.032313346862793, + "learning_rate": 4.990751575527151e-06, + "loss": 1.4358, "step": 1300 }, { - "epoch": 0.03111111111111111, - "grad_norm": 1.0822919607162476, - "learning_rate": 4.994403910179209e-06, - "loss": 1.515, + "epoch": 0.04, + "grad_norm": 1.1252490282058716, + "learning_rate": 4.988701342189802e-06, + "loss": 1.4102, "step": 1400 }, { - "epoch": 0.03333333333333333, - "grad_norm": 1.0209099054336548, - "learning_rate": 4.993162973447879e-06, - "loss": 1.4816, + "epoch": 0.04285714285714286, + "grad_norm": 1.0545426607131958, + "learning_rate": 4.986446528899478e-06, + "loss": 1.4142, "step": 1500 }, { - "epoch": 0.035555555555555556, - "grad_norm": 1.089007019996643, - "learning_rate": 4.99179805626968e-06, - "loss": 1.5098, + "epoch": 0.045714285714285714, + "grad_norm": 1.08208429813385, + "learning_rate": 4.983987321009718e-06, + "loss": 1.4247, "step": 1600 }, { - "epoch": 0.03777777777777778, - "grad_norm": 1.144646167755127, - "learning_rate": 4.990309226519452e-06, - "loss": 1.5034, + "epoch": 0.04857142857142857, + "grad_norm": 1.042827844619751, + "learning_rate": 4.98132392067601e-06, + "loss": 1.4078, "step": 1700 }, { - "epoch": 0.04, - "grad_norm": 1.1199110746383667, - "learning_rate": 4.988696558233985e-06, - "loss": 1.4831, + "epoch": 0.05142857142857143, + "grad_norm": 1.029168725013733, + "learning_rate": 4.978456546839175e-06, + "loss": 1.4255, "step": 1800 }, { - "epoch": 0.042222222222222223, - "grad_norm": 1.0624563694000244, - "learning_rate": 4.986960131608329e-06, - "loss": 1.4673, + "epoch": 0.054285714285714284, + "grad_norm": 1.1674017906188965, + "learning_rate": 4.975385435207367e-06, + "loss": 1.4428, "step": 1900 }, { - "epoch": 0.044444444444444446, - "grad_norm": 1.0910651683807373, - "learning_rate": 4.985100032991814e-06, - "loss": 1.4661, + "epoch": 0.05714285714285714, + "grad_norm": 1.2838454246520996, + "learning_rate": 4.972110838236704e-06, + "loss": 1.4327, "step": 2000 }, { - "epoch": 0.044444444444444446, - "eval_loss": 1.5076568126678467, - "eval_runtime": 98.7691, - "eval_samples_per_second": 138.657, - "eval_steps_per_second": 2.167, + "epoch": 0.05714285714285714, + "eval_loss": 1.4425562620162964, + "eval_runtime": 97.037, + "eval_samples_per_second": 141.132, + "eval_steps_per_second": 2.205, "step": 2000 }, { - "epoch": 0.04666666666666667, - "grad_norm": 1.0498489141464233, - "learning_rate": 4.9831163548837506e-06, - "loss": 1.4798, + "epoch": 0.06, + "grad_norm": 1.0720206499099731, + "learning_rate": 4.968633025110507e-06, + "loss": 1.4312, "step": 2100 }, { - "epoch": 0.04888888888888889, - "grad_norm": 1.042786717414856, - "learning_rate": 4.9810091959288324e-06, - "loss": 1.5013, + "epoch": 0.06285714285714286, + "grad_norm": 1.0312304496765137, + "learning_rate": 4.964952281717177e-06, + "loss": 1.4405, "step": 2200 }, { - "epoch": 0.051111111111111114, - "grad_norm": 1.0386837720870972, - "learning_rate": 4.978778660912231e-06, - "loss": 1.4999, + "epoch": 0.06571428571428571, + "grad_norm": 1.0791317224502563, + "learning_rate": 4.961068910626692e-06, + "loss": 1.4407, "step": 2300 }, { - "epoch": 0.05333333333333334, - "grad_norm": 1.0705187320709229, - "learning_rate": 4.976424860754386e-06, - "loss": 1.4647, + "epoch": 0.06857142857142857, + "grad_norm": 1.0809016227722168, + "learning_rate": 4.956983231065733e-06, + "loss": 1.434, "step": 2400 }, { - "epoch": 0.05555555555555555, - "grad_norm": 1.0342007875442505, - "learning_rate": 4.973947912505481e-06, - "loss": 1.4925, + "epoch": 0.07142857142857142, + "grad_norm": 1.059635043144226, + "learning_rate": 4.952695578891449e-06, + "loss": 1.4114, "step": 2500 }, { - "epoch": 0.057777777777777775, - "grad_norm": 1.073187232017517, - "learning_rate": 4.971347939339638e-06, - "loss": 1.4865, + "epoch": 0.07428571428571429, + "grad_norm": 1.0659129619598389, + "learning_rate": 4.948206306563842e-06, + "loss": 1.4374, "step": 2600 }, { - "epoch": 0.06, - "grad_norm": 1.103935718536377, - "learning_rate": 4.968625070548778e-06, - "loss": 1.4802, + "epoch": 0.07714285714285714, + "grad_norm": 1.0818511247634888, + "learning_rate": 4.943515783116794e-06, + "loss": 1.4196, "step": 2700 }, { - "epoch": 0.06222222222222222, - "grad_norm": 1.07283616065979, - "learning_rate": 4.965779441536202e-06, - "loss": 1.4717, + "epoch": 0.08, + "grad_norm": 1.1003646850585938, + "learning_rate": 4.9386243941277374e-06, + "loss": 1.4508, "step": 2800 }, { - "epoch": 0.06444444444444444, - "grad_norm": 1.0456539392471313, - "learning_rate": 4.962811193809845e-06, - "loss": 1.5038, + "epoch": 0.08285714285714285, + "grad_norm": 1.086207628250122, + "learning_rate": 4.933532541685949e-06, + "loss": 1.4354, "step": 2900 }, { - "epoch": 0.06666666666666667, - "grad_norm": 1.0673285722732544, - "learning_rate": 4.959720474975257e-06, - "loss": 1.4866, + "epoch": 0.08571428571428572, + "grad_norm": 1.0702838897705078, + "learning_rate": 4.928240644359507e-06, + "loss": 1.4262, "step": 3000 }, { - "epoch": 0.06666666666666667, - "eval_loss": 1.501826524734497, - "eval_runtime": 99.5576, - "eval_samples_per_second": 137.559, - "eval_steps_per_second": 2.15, + "epoch": 0.08571428571428572, + "eval_loss": 1.438844919204712, + "eval_runtime": 96.9421, + "eval_samples_per_second": 141.27, + "eval_steps_per_second": 2.208, "step": 3000 }, { - "epoch": 0.06888888888888889, - "grad_norm": 1.043306827545166, - "learning_rate": 4.956507438728246e-06, - "loss": 1.5114, + "epoch": 0.08857142857142856, + "grad_norm": 1.1206424236297607, + "learning_rate": 4.922749137160875e-06, + "loss": 1.4445, "step": 3100 }, { - "epoch": 0.07111111111111111, - "grad_norm": 1.004623293876648, - "learning_rate": 4.953172244847243e-06, - "loss": 1.483, + "epoch": 0.09142857142857143, + "grad_norm": 1.0971518754959106, + "learning_rate": 4.917058471511149e-06, + "loss": 1.4117, "step": 3200 }, { - "epoch": 0.07333333333333333, - "grad_norm": 1.0519309043884277, - "learning_rate": 4.949715059185356e-06, - "loss": 1.4949, + "epoch": 0.09428571428571429, + "grad_norm": 1.1263982057571411, + "learning_rate": 4.9111691152029436e-06, + "loss": 1.4294, "step": 3300 }, { - "epoch": 0.07555555555555556, - "grad_norm": 1.048803687095642, - "learning_rate": 4.9461360536621225e-06, - "loss": 1.4688, + "epoch": 0.09714285714285714, + "grad_norm": 1.0150455236434937, + "learning_rate": 4.905081552361943e-06, + "loss": 1.4357, "step": 3400 }, { - "epoch": 0.07777777777777778, - "grad_norm": 1.002150058746338, - "learning_rate": 4.942435406254959e-06, - "loss": 1.4721, + "epoch": 0.1, + "grad_norm": 1.0511361360549927, + "learning_rate": 4.898796283407099e-06, + "loss": 1.438, "step": 3500 }, { - "epoch": 0.08, - "grad_norm": 1.043863296508789, - "learning_rate": 4.9386133009903086e-06, - "loss": 1.4819, + "epoch": 0.10285714285714286, + "grad_norm": 1.1033008098602295, + "learning_rate": 4.892313825009499e-06, + "loss": 1.4162, "step": 3600 }, { - "epoch": 0.08222222222222222, - "grad_norm": 1.0765719413757324, - "learning_rate": 4.934669927934496e-06, - "loss": 1.4905, + "epoch": 0.10571428571428572, + "grad_norm": 1.1107470989227295, + "learning_rate": 4.885634710049891e-06, + "loss": 1.4267, "step": 3700 }, { - "epoch": 0.08444444444444445, - "grad_norm": 1.0662930011749268, - "learning_rate": 4.93060548318427e-06, - "loss": 1.491, + "epoch": 0.10857142857142857, + "grad_norm": 1.0580041408538818, + "learning_rate": 4.878759487574882e-06, + "loss": 1.4287, "step": 3800 }, { - "epoch": 0.08666666666666667, - "grad_norm": 1.0453810691833496, - "learning_rate": 4.926420168857054e-06, - "loss": 1.496, + "epoch": 0.11142857142857143, + "grad_norm": 1.0198274850845337, + "learning_rate": 4.871688722751799e-06, + "loss": 1.438, "step": 3900 }, { - "epoch": 0.08888888888888889, - "grad_norm": 0.9957125782966614, - "learning_rate": 4.922114193080893e-06, - "loss": 1.5116, + "epoch": 0.11428571428571428, + "grad_norm": 1.1063220500946045, + "learning_rate": 4.864422996822239e-06, + "loss": 1.4078, "step": 4000 }, { - "epoch": 0.08888888888888889, - "eval_loss": 1.5021920204162598, - "eval_runtime": 99.9798, - "eval_samples_per_second": 136.978, - "eval_steps_per_second": 2.14, + "epoch": 0.11428571428571428, + "eval_loss": 1.4405826330184937, + "eval_runtime": 97.1575, + "eval_samples_per_second": 140.957, + "eval_steps_per_second": 2.203, "step": 4000 }, { - "epoch": 0.09111111111111111, - "grad_norm": 1.0647602081298828, - "learning_rate": 4.917687769984112e-06, - "loss": 1.4787, + "epoch": 0.11714285714285715, + "grad_norm": 1.0360065698623657, + "learning_rate": 4.8569629070542775e-06, + "loss": 1.414, "step": 4100 }, { - "epoch": 0.09333333333333334, - "grad_norm": 1.0057638883590698, - "learning_rate": 4.9131411196846536e-06, - "loss": 1.479, + "epoch": 0.12, + "grad_norm": 1.0539647340774536, + "learning_rate": 4.849309066693382e-06, + "loss": 1.3992, "step": 4200 }, { - "epoch": 0.09555555555555556, - "grad_norm": 1.0767698287963867, - "learning_rate": 4.908474468279145e-06, - "loss": 1.4646, + "epoch": 0.12285714285714286, + "grad_norm": 1.0678602457046509, + "learning_rate": 4.8414621049119935e-06, + "loss": 1.4226, "step": 4300 }, { - "epoch": 0.09777777777777778, - "grad_norm": 1.0766099691390991, - "learning_rate": 4.903688047831647e-06, - "loss": 1.4902, + "epoch": 0.12571428571428572, + "grad_norm": 1.1174051761627197, + "learning_rate": 4.833422666757811e-06, + "loss": 1.4149, "step": 4400 }, { - "epoch": 0.1, - "grad_norm": 1.0425220727920532, - "learning_rate": 4.89878209636212e-06, - "loss": 1.4865, + "epoch": 0.12857142857142856, + "grad_norm": 1.1076269149780273, + "learning_rate": 4.825191413100764e-06, + "loss": 1.4219, "step": 4500 }, { - "epoch": 0.10222222222222223, - "grad_norm": 1.1266907453536987, - "learning_rate": 4.893756857834579e-06, - "loss": 1.4701, + "epoch": 0.13142857142857142, + "grad_norm": 1.0237882137298584, + "learning_rate": 4.816769020578685e-06, + "loss": 1.4063, "step": 4600 }, { - "epoch": 0.10444444444444445, - "grad_norm": 1.0631765127182007, - "learning_rate": 4.888612582144971e-06, - "loss": 1.4875, + "epoch": 0.13428571428571429, + "grad_norm": 1.0634537935256958, + "learning_rate": 4.808156181541694e-06, + "loss": 1.4077, "step": 4700 }, { - "epoch": 0.10666666666666667, - "grad_norm": 1.0484580993652344, - "learning_rate": 4.8833495251087415e-06, - "loss": 1.4834, + "epoch": 0.13714285714285715, + "grad_norm": 1.1134625673294067, + "learning_rate": 4.799353603995275e-06, + "loss": 1.4589, "step": 4800 }, { - "epoch": 0.10888888888888888, - "grad_norm": 1.0509796142578125, - "learning_rate": 4.877967948448117e-06, - "loss": 1.4828, + "epoch": 0.14, + "grad_norm": 1.069698691368103, + "learning_rate": 4.790362011542085e-06, + "loss": 1.4063, "step": 4900 }, { - "epoch": 0.1111111111111111, - "grad_norm": 1.0983880758285522, - "learning_rate": 4.872468119779088e-06, - "loss": 1.458, + "epoch": 0.14285714285714285, + "grad_norm": 1.1093010902404785, + "learning_rate": 4.7811821433224665e-06, + "loss": 1.4225, "step": 5000 }, { - "epoch": 0.1111111111111111, - "eval_loss": 1.5034407377243042, - "eval_runtime": 100.2041, - "eval_samples_per_second": 136.671, - "eval_steps_per_second": 2.136, + "epoch": 0.14285714285714285, + "eval_loss": 1.4438061714172363, + "eval_runtime": 98.2976, + "eval_samples_per_second": 139.322, + "eval_steps_per_second": 2.177, "step": 5000 }, { - "epoch": 0.11333333333333333, - "grad_norm": 1.065687894821167, - "learning_rate": 4.866850312598101e-06, - "loss": 1.4696, + "epoch": 0.1457142857142857, + "grad_norm": 1.0884599685668945, + "learning_rate": 4.7718147539536865e-06, + "loss": 1.4347, "step": 5100 }, { - "epoch": 0.11555555555555555, - "grad_norm": 1.0348268747329712, - "learning_rate": 4.8611148062684625e-06, - "loss": 1.4943, + "epoch": 0.14857142857142858, + "grad_norm": 1.0088622570037842, + "learning_rate": 4.762260613467909e-06, + "loss": 1.4254, "step": 5200 }, { - "epoch": 0.11777777777777777, - "grad_norm": 1.1039857864379883, - "learning_rate": 4.855261886006437e-06, - "loss": 1.4881, + "epoch": 0.15142857142857144, + "grad_norm": 1.1340473890304565, + "learning_rate": 4.75252050724889e-06, + "loss": 1.4101, "step": 5300 }, { - "epoch": 0.12, - "grad_norm": 1.048181414604187, - "learning_rate": 4.849291842867075e-06, - "loss": 1.4849, + "epoch": 0.15428571428571428, + "grad_norm": 1.093491554260254, + "learning_rate": 4.7425952359674225e-06, + "loss": 1.4256, "step": 5400 }, { - "epoch": 0.12222222222222222, - "grad_norm": 1.1050995588302612, - "learning_rate": 4.84320497372973e-06, - "loss": 1.4778, + "epoch": 0.15714285714285714, + "grad_norm": 1.0808088779449463, + "learning_rate": 4.732485615515511e-06, + "loss": 1.4093, "step": 5500 }, { - "epoch": 0.12444444444444444, - "grad_norm": 1.084206223487854, - "learning_rate": 4.837001581283301e-06, - "loss": 1.4881, + "epoch": 0.16, + "grad_norm": 1.100080132484436, + "learning_rate": 4.722192476939309e-06, + "loss": 1.4263, "step": 5600 }, { - "epoch": 0.12666666666666668, - "grad_norm": 1.0708404779434204, - "learning_rate": 4.83068197401118e-06, - "loss": 1.4873, + "epoch": 0.16285714285714287, + "grad_norm": 1.096901774406433, + "learning_rate": 4.7117166663708025e-06, + "loss": 1.4084, "step": 5700 }, { - "epoch": 0.1288888888888889, - "grad_norm": 1.043641448020935, - "learning_rate": 4.824246466175909e-06, - "loss": 1.4772, + "epoch": 0.1657142857142857, + "grad_norm": 1.1885929107666016, + "learning_rate": 4.7010590449582525e-06, + "loss": 1.4146, "step": 5800 }, { - "epoch": 0.13111111111111112, - "grad_norm": 1.0868722200393677, - "learning_rate": 4.817695377803553e-06, - "loss": 1.476, + "epoch": 0.16857142857142857, + "grad_norm": 1.082043170928955, + "learning_rate": 4.690220488795406e-06, + "loss": 1.4201, "step": 5900 }, { - "epoch": 0.13333333333333333, - "grad_norm": 1.1160252094268799, - "learning_rate": 4.8110290346677875e-06, - "loss": 1.4829, + "epoch": 0.17142857142857143, + "grad_norm": 1.0647767782211304, + "learning_rate": 4.679201888849481e-06, + "loss": 1.436, "step": 6000 }, { - "epoch": 0.13333333333333333, - "eval_loss": 1.492410659790039, - "eval_runtime": 100.2411, - "eval_samples_per_second": 136.621, - "eval_steps_per_second": 2.135, + "epoch": 0.17142857142857143, + "eval_loss": 1.4336808919906616, + "eval_runtime": 97.8921, + "eval_samples_per_second": 139.899, + "eval_steps_per_second": 2.186, "step": 6000 }, { - "epoch": 0.13555555555555557, - "grad_norm": 1.0023784637451172, - "learning_rate": 4.804247768273695e-06, - "loss": 1.4704, + "epoch": 0.1742857142857143, + "grad_norm": 1.1146217584609985, + "learning_rate": 4.668004150887924e-06, + "loss": 1.4132, "step": 6100 }, { - "epoch": 0.13777777777777778, - "grad_norm": 1.0485386848449707, - "learning_rate": 4.797351915841285e-06, - "loss": 1.484, + "epoch": 0.17714285714285713, + "grad_norm": 1.0890520811080933, + "learning_rate": 4.656628195403952e-06, + "loss": 1.4047, "step": 6200 }, { - "epoch": 0.14, - "grad_norm": 1.0743563175201416, - "learning_rate": 4.79034182028872e-06, - "loss": 1.4831, + "epoch": 0.18, + "grad_norm": 1.033389687538147, + "learning_rate": 4.645074957540887e-06, + "loss": 1.4272, "step": 6300 }, { - "epoch": 0.14222222222222222, - "grad_norm": 0.961910605430603, - "learning_rate": 4.783217830215264e-06, - "loss": 1.4857, + "epoch": 0.18285714285714286, + "grad_norm": 1.1013028621673584, + "learning_rate": 4.63334538701528e-06, + "loss": 1.4402, "step": 6400 }, { - "epoch": 0.14444444444444443, - "grad_norm": 1.056557536125183, - "learning_rate": 4.775980299883949e-06, - "loss": 1.4888, + "epoch": 0.18571428571428572, + "grad_norm": 1.0814400911331177, + "learning_rate": 4.6214404480388455e-06, + "loss": 1.4031, "step": 6500 }, { - "epoch": 0.14666666666666667, - "grad_norm": 1.0664172172546387, - "learning_rate": 4.768629589203955e-06, - "loss": 1.478, + "epoch": 0.18857142857142858, + "grad_norm": 1.0447463989257812, + "learning_rate": 4.609361119239197e-06, + "loss": 1.4453, "step": 6600 }, { - "epoch": 0.14888888888888888, - "grad_norm": 1.0604673624038696, - "learning_rate": 4.761166063712719e-06, - "loss": 1.476, + "epoch": 0.19142857142857142, + "grad_norm": 1.1220800876617432, + "learning_rate": 4.5971083935794026e-06, + "loss": 1.4148, "step": 6700 }, { - "epoch": 0.1511111111111111, - "grad_norm": 1.0558980703353882, - "learning_rate": 4.753590094557745e-06, - "loss": 1.4832, + "epoch": 0.19428571428571428, + "grad_norm": 1.107762098312378, + "learning_rate": 4.584683278276356e-06, + "loss": 1.4285, "step": 6800 }, { - "epoch": 0.15333333333333332, - "grad_norm": 1.0912115573883057, - "learning_rate": 4.745902058478163e-06, - "loss": 1.4816, + "epoch": 0.19714285714285715, + "grad_norm": 1.1005544662475586, + "learning_rate": 4.572086794717985e-06, + "loss": 1.4328, "step": 6900 }, { - "epoch": 0.15555555555555556, - "grad_norm": 1.0631625652313232, - "learning_rate": 4.738102337785981e-06, - "loss": 1.4655, + "epoch": 0.2, + "grad_norm": 1.033148169517517, + "learning_rate": 4.559319978379287e-06, + "loss": 1.4111, "step": 7000 }, { - "epoch": 0.15555555555555556, - "eval_loss": 1.4969485998153687, - "eval_runtime": 101.1288, - "eval_samples_per_second": 135.421, - "eval_steps_per_second": 2.116, + "epoch": 0.2, + "eval_loss": 1.4392390251159668, + "eval_runtime": 97.5914, + "eval_samples_per_second": 140.33, + "eval_steps_per_second": 2.193, "step": 7000 }, { - "epoch": 0.15777777777777777, - "grad_norm": 1.080399990081787, - "learning_rate": 4.730191320347084e-06, - "loss": 1.4823, + "epoch": 0.20285714285714285, + "grad_norm": 1.052509069442749, + "learning_rate": 4.546383878737207e-06, + "loss": 1.4113, "step": 7100 }, { - "epoch": 0.16, - "grad_norm": 1.0477869510650635, - "learning_rate": 4.722169399561937e-06, - "loss": 1.4784, + "epoch": 0.2057142857142857, + "grad_norm": 1.0561904907226562, + "learning_rate": 4.533279559184373e-06, + "loss": 1.4275, "step": 7200 }, { - "epoch": 0.1622222222222222, - "grad_norm": 1.0932061672210693, - "learning_rate": 4.714036974346028e-06, - "loss": 1.4761, + "epoch": 0.20857142857142857, + "grad_norm": 1.0787992477416992, + "learning_rate": 4.520008096941676e-06, + "loss": 1.4084, "step": 7300 }, { - "epoch": 0.16444444444444445, - "grad_norm": 1.0640238523483276, - "learning_rate": 4.705794449110029e-06, - "loss": 1.4732, + "epoch": 0.21142857142857144, + "grad_norm": 1.0198429822921753, + "learning_rate": 4.506570582969719e-06, + "loss": 1.4029, "step": 7400 }, { - "epoch": 0.16666666666666666, - "grad_norm": 1.0664732456207275, - "learning_rate": 4.697442233739684e-06, - "loss": 1.4606, + "epoch": 0.21428571428571427, + "grad_norm": 1.0664575099945068, + "learning_rate": 4.492968121879142e-06, + "loss": 1.4049, "step": 7500 }, { - "epoch": 0.1688888888888889, - "grad_norm": 1.0107780694961548, - "learning_rate": 4.688980743575429e-06, - "loss": 1.4571, + "epoch": 0.21714285714285714, + "grad_norm": 1.0929675102233887, + "learning_rate": 4.479201831839812e-06, + "loss": 1.4169, "step": 7600 }, { - "epoch": 0.1711111111111111, - "grad_norm": 1.1194672584533691, - "learning_rate": 4.680410399391734e-06, - "loss": 1.4884, + "epoch": 0.22, + "grad_norm": 1.1445673704147339, + "learning_rate": 4.465272844488908e-06, + "loss": 1.4033, "step": 7700 }, { - "epoch": 0.17333333333333334, - "grad_norm": 1.0674372911453247, - "learning_rate": 4.671731627376184e-06, - "loss": 1.478, + "epoch": 0.22285714285714286, + "grad_norm": 1.064433217048645, + "learning_rate": 4.4511823048378986e-06, + "loss": 1.43, "step": 7800 }, { - "epoch": 0.17555555555555555, - "grad_norm": 1.0933210849761963, - "learning_rate": 4.662944859108278e-06, - "loss": 1.4611, + "epoch": 0.2257142857142857, + "grad_norm": 1.0845831632614136, + "learning_rate": 4.436931371178416e-06, + "loss": 1.4441, "step": 7900 }, { - "epoch": 0.17777777777777778, - "grad_norm": 1.0179678201675415, - "learning_rate": 4.654050531537975e-06, - "loss": 1.4983, + "epoch": 0.22857142857142856, + "grad_norm": 1.0980095863342285, + "learning_rate": 4.42252121498704e-06, + "loss": 1.4015, "step": 8000 }, { - "epoch": 0.17777777777777778, - "eval_loss": 1.4869590997695923, - "eval_runtime": 100.5, - "eval_samples_per_second": 136.269, - "eval_steps_per_second": 2.129, + "epoch": 0.22857142857142856, + "eval_loss": 1.4309405088424683, + "eval_runtime": 97.7381, + "eval_samples_per_second": 140.119, + "eval_steps_per_second": 2.19, "step": 8000 }, { - "epoch": 0.18, - "grad_norm": 1.1465884447097778, - "learning_rate": 4.645049086963961e-06, - "loss": 1.4749, + "epoch": 0.23142857142857143, + "grad_norm": 1.1431641578674316, + "learning_rate": 4.407953020829001e-06, + "loss": 1.4249, "step": 8100 }, { - "epoch": 0.18222222222222223, - "grad_norm": 1.0336287021636963, - "learning_rate": 4.6359409730116546e-06, - "loss": 1.4854, + "epoch": 0.2342857142857143, + "grad_norm": 1.0139048099517822, + "learning_rate": 4.393227986260801e-06, + "loss": 1.3958, "step": 8200 }, { - "epoch": 0.18444444444444444, - "grad_norm": 1.102246642112732, - "learning_rate": 4.62672664261095e-06, - "loss": 1.4625, + "epoch": 0.23714285714285716, + "grad_norm": 1.0676871538162231, + "learning_rate": 4.378347321731773e-06, + "loss": 1.4204, "step": 8300 }, { - "epoch": 0.18666666666666668, - "grad_norm": 1.0804390907287598, - "learning_rate": 4.617406553973687e-06, - "loss": 1.4831, + "epoch": 0.24, + "grad_norm": 1.1097986698150635, + "learning_rate": 4.363312250484577e-06, + "loss": 1.4335, "step": 8400 }, { - "epoch": 0.18888888888888888, - "grad_norm": 0.9855327606201172, - "learning_rate": 4.607981170570875e-06, - "loss": 1.4745, + "epoch": 0.24285714285714285, + "grad_norm": 1.083742380142212, + "learning_rate": 4.348124008454644e-06, + "loss": 1.436, "step": 8500 }, { - "epoch": 0.19111111111111112, - "grad_norm": 1.0226573944091797, - "learning_rate": 4.598450961109637e-06, - "loss": 1.493, + "epoch": 0.24571428571428572, + "grad_norm": 1.072716474533081, + "learning_rate": 4.332783844168581e-06, + "loss": 1.424, "step": 8600 }, { - "epoch": 0.19333333333333333, - "grad_norm": 1.175445556640625, - "learning_rate": 4.588816399509905e-06, - "loss": 1.4798, + "epoch": 0.24857142857142858, + "grad_norm": 1.1168031692504883, + "learning_rate": 4.317293018641536e-06, + "loss": 1.4262, "step": 8700 }, { - "epoch": 0.19555555555555557, - "grad_norm": 1.1075881719589233, - "learning_rate": 4.579077964880855e-06, - "loss": 1.4634, + "epoch": 0.25142857142857145, + "grad_norm": 1.1102938652038574, + "learning_rate": 4.301652805273535e-06, + "loss": 1.4141, "step": 8800 }, { - "epoch": 0.19777777777777777, - "grad_norm": 1.0150587558746338, - "learning_rate": 4.569236141497075e-06, - "loss": 1.4736, + "epoch": 0.2542857142857143, + "grad_norm": 1.1052049398422241, + "learning_rate": 4.285864489744809e-06, + "loss": 1.4221, "step": 8900 }, { - "epoch": 0.2, - "grad_norm": 1.0608808994293213, - "learning_rate": 4.559291418774489e-06, - "loss": 1.4796, + "epoch": 0.2571428571428571, + "grad_norm": 1.0475815534591675, + "learning_rate": 4.269929369910103e-06, + "loss": 1.4145, "step": 9000 }, { - "epoch": 0.2, - "eval_loss": 1.4837573766708374, - "eval_runtime": 102.4509, - "eval_samples_per_second": 133.674, - "eval_steps_per_second": 2.089, + "epoch": 0.2571428571428571, + "eval_loss": 1.428357481956482, + "eval_runtime": 98.1292, + "eval_samples_per_second": 139.561, + "eval_steps_per_second": 2.181, "step": 9000 }, { - "epoch": 0.20222222222222222, - "grad_norm": 0.9918996095657349, - "learning_rate": 4.54924429124602e-06, - "loss": 1.4812, + "epoch": 0.26, + "grad_norm": 1.0066262483596802, + "learning_rate": 4.253848755691992e-06, + "loss": 1.4049, "step": 9100 }, { - "epoch": 0.20444444444444446, - "grad_norm": 1.1614868640899658, - "learning_rate": 4.539095258536991e-06, - "loss": 1.4708, + "epoch": 0.26285714285714284, + "grad_norm": 1.131996512413025, + "learning_rate": 4.2376239689731955e-06, + "loss": 1.3991, "step": 9200 }, { - "epoch": 0.20666666666666667, - "grad_norm": 1.1184130907058716, - "learning_rate": 4.528844825340286e-06, - "loss": 1.4598, + "epoch": 0.26571428571428574, + "grad_norm": 1.1413109302520752, + "learning_rate": 4.2212563434879175e-06, + "loss": 1.3744, "step": 9300 }, { - "epoch": 0.2088888888888889, - "grad_norm": 1.1380512714385986, - "learning_rate": 4.5184935013912505e-06, - "loss": 1.4683, + "epoch": 0.26857142857142857, + "grad_norm": 1.073792576789856, + "learning_rate": 4.204747224712209e-06, + "loss": 1.422, "step": 9400 }, { - "epoch": 0.2111111111111111, - "grad_norm": 1.0119566917419434, - "learning_rate": 4.508041801442342e-06, - "loss": 1.4877, + "epoch": 0.2714285714285714, + "grad_norm": 1.0397651195526123, + "learning_rate": 4.188097969753363e-06, + "loss": 1.4064, "step": 9500 }, { - "epoch": 0.21333333333333335, - "grad_norm": 1.0013171434402466, - "learning_rate": 4.497490245237534e-06, - "loss": 1.4825, + "epoch": 0.2742857142857143, + "grad_norm": 1.1306557655334473, + "learning_rate": 4.171309947238357e-06, + "loss": 1.4408, "step": 9600 }, { - "epoch": 0.21555555555555556, - "grad_norm": 1.0520776510238647, - "learning_rate": 4.48683935748647e-06, - "loss": 1.4602, + "epoch": 0.27714285714285714, + "grad_norm": 1.1982935667037964, + "learning_rate": 4.154384537201347e-06, + "loss": 1.4151, "step": 9700 }, { - "epoch": 0.21777777777777776, - "grad_norm": 1.013963222503662, - "learning_rate": 4.476089667838369e-06, - "loss": 1.4701, + "epoch": 0.28, + "grad_norm": 1.1465263366699219, + "learning_rate": 4.137323130970225e-06, + "loss": 1.4211, "step": 9800 }, { - "epoch": 0.22, - "grad_norm": 1.0059009790420532, - "learning_rate": 4.465241710855688e-06, - "loss": 1.4631, + "epoch": 0.28285714285714286, + "grad_norm": 0.9817516803741455, + "learning_rate": 4.120127131052244e-06, + "loss": 1.4089, "step": 9900 }, { - "epoch": 0.2222222222222222, - "grad_norm": 1.0225909948349, - "learning_rate": 4.4542960259875415e-06, - "loss": 1.4647, + "epoch": 0.2857142857142857, + "grad_norm": 1.150546908378601, + "learning_rate": 4.1027979510187285e-06, + "loss": 1.4191, "step": 10000 }, { - "epoch": 0.2222222222222222, - "eval_loss": 1.4831452369689941, - "eval_runtime": 102.5084, - "eval_samples_per_second": 133.599, - "eval_steps_per_second": 2.088, + "epoch": 0.2857142857142857, + "eval_loss": 1.429203748703003, + "eval_runtime": 98.6571, + "eval_samples_per_second": 138.814, + "eval_steps_per_second": 2.169, "step": 10000 }, { - "epoch": 0.22444444444444445, - "grad_norm": 1.0490069389343262, - "learning_rate": 4.44325315754287e-06, - "loss": 1.4887, + "epoch": 0.2885714285714286, + "grad_norm": 1.0928316116333008, + "learning_rate": 4.085337015388876e-06, + "loss": 1.4155, "step": 10100 }, { - "epoch": 0.22666666666666666, - "grad_norm": 1.0999810695648193, - "learning_rate": 4.432113654663379e-06, - "loss": 1.4609, + "epoch": 0.2914285714285714, + "grad_norm": 1.1372452974319458, + "learning_rate": 4.067745759512654e-06, + "loss": 1.4229, "step": 10200 }, { - "epoch": 0.2288888888888889, - "grad_norm": 1.093153715133667, - "learning_rate": 4.420878071296227e-06, - "loss": 1.4731, + "epoch": 0.29428571428571426, + "grad_norm": 1.1249101161956787, + "learning_rate": 4.0500256294528084e-06, + "loss": 1.4178, "step": 10300 }, { - "epoch": 0.2311111111111111, - "grad_norm": 1.0634323358535767, - "learning_rate": 4.4095469661664794e-06, - "loss": 1.4376, + "epoch": 0.29714285714285715, + "grad_norm": 1.1340339183807373, + "learning_rate": 4.032178081865995e-06, + "loss": 1.4125, "step": 10400 }, { - "epoch": 0.23333333333333334, - "grad_norm": 1.0290603637695312, - "learning_rate": 4.398120902749326e-06, - "loss": 1.4456, + "epoch": 0.3, + "grad_norm": 1.0652027130126953, + "learning_rate": 4.014204583883038e-06, + "loss": 1.4283, "step": 10500 }, { - "epoch": 0.23555555555555555, - "grad_norm": 1.0774028301239014, - "learning_rate": 4.386600449242057e-06, - "loss": 1.4789, + "epoch": 0.3028571428571429, + "grad_norm": 1.1057724952697754, + "learning_rate": 3.996106612988321e-06, + "loss": 1.4046, "step": 10600 }, { - "epoch": 0.23777777777777778, - "grad_norm": 1.0706713199615479, - "learning_rate": 4.374986178535812e-06, - "loss": 1.4872, + "epoch": 0.3057142857142857, + "grad_norm": 1.089181661605835, + "learning_rate": 3.977885656898337e-06, + "loss": 1.4199, "step": 10700 }, { - "epoch": 0.24, - "grad_norm": 1.0337339639663696, - "learning_rate": 4.363278668187086e-06, - "loss": 1.4742, + "epoch": 0.30857142857142855, + "grad_norm": 1.0804879665374756, + "learning_rate": 3.959543213439393e-06, + "loss": 1.4259, "step": 10800 }, { - "epoch": 0.24222222222222223, - "grad_norm": 1.1255840063095093, - "learning_rate": 4.351478500389014e-06, - "loss": 1.4554, + "epoch": 0.31142857142857144, + "grad_norm": 1.0948872566223145, + "learning_rate": 3.941080790424483e-06, + "loss": 1.4143, "step": 10900 }, { - "epoch": 0.24444444444444444, - "grad_norm": 1.110970377922058, - "learning_rate": 4.3395862619424164e-06, - "loss": 1.4661, + "epoch": 0.3142857142857143, + "grad_norm": 1.1653496026992798, + "learning_rate": 3.92249990552934e-06, + "loss": 1.4343, "step": 11000 }, { - "epoch": 0.24444444444444444, - "eval_loss": 1.4758148193359375, - "eval_runtime": 102.2906, - "eval_samples_per_second": 133.883, - "eval_steps_per_second": 2.092, + "epoch": 0.3142857142857143, + "eval_loss": 1.4226573705673218, + "eval_runtime": 98.9066, + "eval_samples_per_second": 138.464, + "eval_steps_per_second": 2.164, "step": 11000 }, { - "epoch": 0.24666666666666667, - "grad_norm": 1.094433307647705, - "learning_rate": 4.3276025442266175e-06, - "loss": 1.4567, + "epoch": 0.3171428571428571, + "grad_norm": 1.0654685497283936, + "learning_rate": 3.903802086167676e-06, + "loss": 1.4102, "step": 11100 }, { - "epoch": 0.24888888888888888, - "grad_norm": 1.097430944442749, - "learning_rate": 4.31552794317004e-06, - "loss": 1.4803, + "epoch": 0.32, + "grad_norm": 1.01749849319458, + "learning_rate": 3.884988869365626e-06, + "loss": 1.408, "step": 11200 }, { - "epoch": 0.2511111111111111, - "grad_norm": 1.0710690021514893, - "learning_rate": 4.3033630592205675e-06, - "loss": 1.437, + "epoch": 0.32285714285714284, + "grad_norm": 1.1105825901031494, + "learning_rate": 3.866061801635399e-06, + "loss": 1.4276, "step": 11300 }, { - "epoch": 0.25333333333333335, - "grad_norm": 1.1534795761108398, - "learning_rate": 4.291108497315691e-06, - "loss": 1.4698, + "epoch": 0.32571428571428573, + "grad_norm": 1.0666981935501099, + "learning_rate": 3.8470224388481485e-06, + "loss": 1.3964, "step": 11400 }, { - "epoch": 0.25555555555555554, - "grad_norm": 1.019482970237732, - "learning_rate": 4.27876486685242e-06, - "loss": 1.4622, + "epoch": 0.32857142857142857, + "grad_norm": 1.090728759765625, + "learning_rate": 3.827872346106073e-06, + "loss": 1.3981, "step": 11500 }, { - "epoch": 0.2577777777777778, - "grad_norm": 1.1455426216125488, - "learning_rate": 4.26633278165698e-06, - "loss": 1.4656, + "epoch": 0.3314285714285714, + "grad_norm": 1.069846272468567, + "learning_rate": 3.808613097613759e-06, + "loss": 1.4053, "step": 11600 }, { - "epoch": 0.26, - "grad_norm": 1.0760246515274048, - "learning_rate": 4.253812859954292e-06, - "loss": 1.4768, + "epoch": 0.3342857142857143, + "grad_norm": 1.1465699672698975, + "learning_rate": 3.7892462765487836e-06, + "loss": 1.3944, "step": 11700 }, { - "epoch": 0.26222222222222225, - "grad_norm": 1.0122239589691162, - "learning_rate": 4.241205724337223e-06, - "loss": 1.4773, + "epoch": 0.33714285714285713, + "grad_norm": 1.068352222442627, + "learning_rate": 3.769773474931558e-06, + "loss": 1.4284, "step": 11800 }, { - "epoch": 0.2644444444444444, - "grad_norm": 1.0859477519989014, - "learning_rate": 4.22851200173563e-06, - "loss": 1.4737, + "epoch": 0.34, + "grad_norm": 1.1487313508987427, + "learning_rate": 3.7501962934944704e-06, + "loss": 1.3894, "step": 11900 }, { - "epoch": 0.26666666666666666, - "grad_norm": 1.0537041425704956, - "learning_rate": 4.2157323233851855e-06, - "loss": 1.4655, + "epoch": 0.34285714285714286, + "grad_norm": 1.1034648418426514, + "learning_rate": 3.7305163415502936e-06, + "loss": 1.4184, "step": 12000 }, { - "epoch": 0.26666666666666666, - "eval_loss": 1.4727766513824463, - "eval_runtime": 102.6842, - "eval_samples_per_second": 133.37, - "eval_steps_per_second": 2.084, + "epoch": 0.34285714285714286, + "eval_loss": 1.4204550981521606, + "eval_runtime": 99.0133, + "eval_samples_per_second": 138.315, + "eval_steps_per_second": 2.161, "step": 12000 }, { - "epoch": 0.2688888888888889, - "grad_norm": 1.1021567583084106, - "learning_rate": 4.202867324795982e-06, - "loss": 1.4693, + "epoch": 0.3457142857142857, + "grad_norm": 1.08807373046875, + "learning_rate": 3.710735236859895e-06, + "loss": 1.4208, "step": 12100 }, { - "epoch": 0.27111111111111114, - "grad_norm": 0.9724922180175781, - "learning_rate": 4.18991764572093e-06, - "loss": 1.4585, + "epoch": 0.3485714285714286, + "grad_norm": 1.142823338508606, + "learning_rate": 3.6908546054992523e-06, + "loss": 1.4292, "step": 12200 }, { - "epoch": 0.2733333333333333, - "grad_norm": 1.140974760055542, - "learning_rate": 4.17688393012395e-06, - "loss": 1.4437, + "epoch": 0.3514285714285714, + "grad_norm": 1.0997464656829834, + "learning_rate": 3.670876081725784e-06, + "loss": 1.4058, "step": 12300 }, { - "epoch": 0.27555555555555555, - "grad_norm": 1.05283784866333, - "learning_rate": 4.163766826147943e-06, - "loss": 1.477, + "epoch": 0.35428571428571426, + "grad_norm": 1.1083920001983643, + "learning_rate": 3.650801307844004e-06, + "loss": 1.4152, "step": 12400 }, { - "epoch": 0.2777777777777778, - "grad_norm": 1.0787185430526733, - "learning_rate": 4.1505669860825645e-06, - "loss": 1.4659, + "epoch": 0.35714285714285715, + "grad_norm": 1.1371042728424072, + "learning_rate": 3.630631934070527e-06, + "loss": 1.4259, "step": 12500 }, { - "epoch": 0.28, - "grad_norm": 1.0746320486068726, - "learning_rate": 4.137285066331781e-06, - "loss": 1.459, + "epoch": 0.36, + "grad_norm": 1.0470432043075562, + "learning_rate": 3.610369618398404e-06, + "loss": 1.3952, "step": 12600 }, { - "epoch": 0.2822222222222222, - "grad_norm": 1.0252412557601929, - "learning_rate": 4.123921727381234e-06, - "loss": 1.4802, + "epoch": 0.3628571428571429, + "grad_norm": 1.0853626728057861, + "learning_rate": 3.5900160264608395e-06, + "loss": 1.4005, "step": 12700 }, { - "epoch": 0.28444444444444444, - "grad_norm": 1.0793858766555786, - "learning_rate": 4.110477633765395e-06, - "loss": 1.4574, + "epoch": 0.3657142857142857, + "grad_norm": 1.0409729480743408, + "learning_rate": 3.569572831394265e-06, + "loss": 1.431, "step": 12800 }, { - "epoch": 0.2866666666666667, - "grad_norm": 1.1628038883209229, - "learning_rate": 4.096953454034514e-06, - "loss": 1.4711, + "epoch": 0.36857142857142855, + "grad_norm": 1.1226378679275513, + "learning_rate": 3.5490417137007997e-06, + "loss": 1.4112, "step": 12900 }, { - "epoch": 0.28888888888888886, - "grad_norm": 1.0130637884140015, - "learning_rate": 4.08334986072138e-06, - "loss": 1.4693, + "epoch": 0.37142857142857144, + "grad_norm": 1.0430322885513306, + "learning_rate": 3.528424361110115e-06, + "loss": 1.3999, "step": 13000 }, { - "epoch": 0.28888888888888886, - "eval_loss": 1.4742803573608398, - "eval_runtime": 102.7825, - "eval_samples_per_second": 133.242, - "eval_steps_per_second": 2.082, + "epoch": 0.37142857142857144, + "eval_loss": 1.423007845878601, + "eval_runtime": 99.2113, + "eval_samples_per_second": 138.039, + "eval_steps_per_second": 2.157, "step": 13000 }, { - "epoch": 0.2911111111111111, - "grad_norm": 1.0166757106781006, - "learning_rate": 4.069667530307876e-06, - "loss": 1.4463, + "epoch": 0.3742857142857143, + "grad_norm": 1.1154820919036865, + "learning_rate": 3.507722468440688e-06, + "loss": 1.4097, "step": 13100 }, { - "epoch": 0.29333333333333333, - "grad_norm": 1.0768380165100098, - "learning_rate": 4.0559071431913335e-06, - "loss": 1.4553, + "epoch": 0.37714285714285717, + "grad_norm": 1.1299182176589966, + "learning_rate": 3.4869377374604886e-06, + "loss": 1.4064, "step": 13200 }, { - "epoch": 0.29555555555555557, - "grad_norm": 1.0084826946258545, - "learning_rate": 4.042069383650704e-06, - "loss": 1.4671, + "epoch": 0.38, + "grad_norm": 1.1046215295791626, + "learning_rate": 3.4660718767470854e-06, + "loss": 1.4234, "step": 13300 }, { - "epoch": 0.29777777777777775, - "grad_norm": 1.0666933059692383, - "learning_rate": 4.028154939812527e-06, - "loss": 1.4785, + "epoch": 0.38285714285714284, + "grad_norm": 1.0251668691635132, + "learning_rate": 3.445126601547193e-06, + "loss": 1.4097, "step": 13400 }, { - "epoch": 0.3, - "grad_norm": 1.0286009311676025, - "learning_rate": 4.014164503616713e-06, - "loss": 1.4562, + "epoch": 0.38571428571428573, + "grad_norm": 1.0839489698410034, + "learning_rate": 3.4241036336356757e-06, + "loss": 1.401, "step": 13500 }, { - "epoch": 0.3022222222222222, - "grad_norm": 1.1066243648529053, - "learning_rate": 4.000098770782136e-06, - "loss": 1.4636, + "epoch": 0.38857142857142857, + "grad_norm": 1.0709606409072876, + "learning_rate": 3.40300470117401e-06, + "loss": 1.4164, "step": 13600 }, { - "epoch": 0.30444444444444446, - "grad_norm": 1.0686429738998413, - "learning_rate": 3.985958440772031e-06, - "loss": 1.4483, + "epoch": 0.3914285714285714, + "grad_norm": 1.0628767013549805, + "learning_rate": 3.3818315385682255e-06, + "loss": 1.409, "step": 13700 }, { - "epoch": 0.30666666666666664, - "grad_norm": 0.9844673871994019, - "learning_rate": 3.971744216759216e-06, - "loss": 1.47, + "epoch": 0.3942857142857143, + "grad_norm": 1.0831209421157837, + "learning_rate": 3.3605858863263274e-06, + "loss": 1.4073, "step": 13800 }, { - "epoch": 0.3088888888888889, - "grad_norm": 1.184783697128296, - "learning_rate": 3.957456805591122e-06, - "loss": 1.4476, + "epoch": 0.39714285714285713, + "grad_norm": 1.1459494829177856, + "learning_rate": 3.339269490915223e-06, + "loss": 1.4147, "step": 13900 }, { - "epoch": 0.3111111111111111, - "grad_norm": 1.0774155855178833, - "learning_rate": 3.943096917754647e-06, - "loss": 1.4708, + "epoch": 0.4, + "grad_norm": 1.0614882707595825, + "learning_rate": 3.317884104617155e-06, + "loss": 1.4089, "step": 14000 }, { - "epoch": 0.3111111111111111, - "eval_loss": 1.46890389919281, - "eval_runtime": 102.6188, - "eval_samples_per_second": 133.455, - "eval_steps_per_second": 2.085, + "epoch": 0.4, + "eval_loss": 1.4181102514266968, + "eval_runtime": 99.8701, + "eval_samples_per_second": 137.128, + "eval_steps_per_second": 2.143, "step": 14000 }, { - "epoch": 0.31333333333333335, - "grad_norm": 1.1095852851867676, - "learning_rate": 3.928665267340817e-06, - "loss": 1.4476, + "epoch": 0.40285714285714286, + "grad_norm": 1.0587329864501953, + "learning_rate": 3.2964314853856593e-06, + "loss": 1.3895, "step": 14100 }, { - "epoch": 0.31555555555555553, - "grad_norm": 1.0171401500701904, - "learning_rate": 3.9141625720092825e-06, - "loss": 1.4399, + "epoch": 0.4057142857142857, + "grad_norm": 1.1020365953445435, + "learning_rate": 3.2749133967010545e-06, + "loss": 1.4037, "step": 14200 }, { - "epoch": 0.31777777777777777, - "grad_norm": 1.0451526641845703, - "learning_rate": 3.899589552952628e-06, - "loss": 1.463, + "epoch": 0.4085714285714286, + "grad_norm": 1.1230683326721191, + "learning_rate": 3.253331607425475e-06, + "loss": 1.4018, "step": 14300 }, { - "epoch": 0.32, - "grad_norm": 1.0842229127883911, - "learning_rate": 3.8849469348605085e-06, - "loss": 1.4435, + "epoch": 0.4114285714285714, + "grad_norm": 1.0774966478347778, + "learning_rate": 3.231687891657469e-06, + "loss": 1.4087, "step": 14400 }, { - "epoch": 0.32222222222222224, - "grad_norm": 1.0401058197021484, - "learning_rate": 3.8702354458836124e-06, - "loss": 1.4554, + "epoch": 0.4142857142857143, + "grad_norm": 1.0514012575149536, + "learning_rate": 3.209984028586157e-06, + "loss": 1.3861, "step": 14500 }, { - "epoch": 0.3244444444444444, - "grad_norm": 1.1635191440582275, - "learning_rate": 3.855455817597451e-06, - "loss": 1.4612, + "epoch": 0.41714285714285715, + "grad_norm": 1.1025465726852417, + "learning_rate": 3.188221802344978e-06, + "loss": 1.4038, "step": 14600 }, { - "epoch": 0.32666666666666666, - "grad_norm": 1.0651050806045532, - "learning_rate": 3.840608784965979e-06, - "loss": 1.4636, + "epoch": 0.42, + "grad_norm": 1.139419436454773, + "learning_rate": 3.16640300186503e-06, + "loss": 1.4033, "step": 14700 }, { - "epoch": 0.3288888888888889, - "grad_norm": 1.0558030605316162, - "learning_rate": 3.825695086305046e-06, - "loss": 1.4519, + "epoch": 0.4228571428571429, + "grad_norm": 1.043289303779602, + "learning_rate": 3.1445294207280093e-06, + "loss": 1.3867, "step": 14800 }, { - "epoch": 0.33111111111111113, - "grad_norm": 1.0227941274642944, - "learning_rate": 3.8107154632456845e-06, - "loss": 1.4731, + "epoch": 0.4257142857142857, + "grad_norm": 1.101967453956604, + "learning_rate": 3.1226028570187737e-06, + "loss": 1.391, "step": 14900 }, { - "epoch": 0.3333333333333333, - "grad_norm": 1.0800951719284058, - "learning_rate": 3.7956706606972214e-06, - "loss": 1.4547, + "epoch": 0.42857142857142855, + "grad_norm": 1.0626415014266968, + "learning_rate": 3.1006251131775342e-06, + "loss": 1.3949, "step": 15000 }, { - "epoch": 0.3333333333333333, - "eval_loss": 1.4689823389053345, - "eval_runtime": 102.3222, - "eval_samples_per_second": 133.842, - "eval_steps_per_second": 2.091, + "epoch": 0.42857142857142855, + "eval_loss": 1.4195818901062012, + "eval_runtime": 99.5817, + "eval_samples_per_second": 137.525, + "eval_steps_per_second": 2.149, "step": 15000 }, { - "epoch": 0.33555555555555555, - "grad_norm": 1.0613428354263306, - "learning_rate": 3.780561426810247e-06, - "loss": 1.446, + "epoch": 0.43142857142857144, + "grad_norm": 1.1212193965911865, + "learning_rate": 3.078597995851689e-06, + "loss": 1.4007, "step": 15100 }, { - "epoch": 0.3377777777777778, - "grad_norm": 1.0628856420516968, - "learning_rate": 3.7653885129393996e-06, - "loss": 1.4708, + "epoch": 0.4342857142857143, + "grad_norm": 1.0601767301559448, + "learning_rate": 3.056523315747308e-06, + "loss": 1.4098, "step": 15200 }, { - "epoch": 0.34, - "grad_norm": 1.077598214149475, - "learning_rate": 3.7501526736060113e-06, - "loss": 1.4434, + "epoch": 0.43714285714285717, + "grad_norm": 1.0668915510177612, + "learning_rate": 3.034402887480287e-06, + "loss": 1.3885, "step": 15300 }, { - "epoch": 0.3422222222222222, - "grad_norm": 1.0646636486053467, - "learning_rate": 3.7348546664605777e-06, - "loss": 1.4583, + "epoch": 0.44, + "grad_norm": 1.0714190006256104, + "learning_rate": 3.012238529427181e-06, + "loss": 1.4018, "step": 15400 }, { - "epoch": 0.34444444444444444, - "grad_norm": 1.07997465133667, - "learning_rate": 3.7194952522450905e-06, - "loss": 1.4523, + "epoch": 0.44285714285714284, + "grad_norm": 1.1230597496032715, + "learning_rate": 2.9900320635757293e-06, + "loss": 1.4086, "step": 15500 }, { - "epoch": 0.3466666666666667, - "grad_norm": 1.0550297498703003, - "learning_rate": 3.7040751947552013e-06, - "loss": 1.4629, + "epoch": 0.44571428571428573, + "grad_norm": 1.0094853639602661, + "learning_rate": 2.9677853153750763e-06, + "loss": 1.3801, "step": 15600 }, { - "epoch": 0.3488888888888889, - "grad_norm": 1.084794044494629, - "learning_rate": 3.68859526080224e-06, - "loss": 1.4588, + "epoch": 0.44857142857142857, + "grad_norm": 1.0972274541854858, + "learning_rate": 2.9455001135857194e-06, + "loss": 1.3985, "step": 15700 }, { - "epoch": 0.3511111111111111, - "grad_norm": 1.0864589214324951, - "learning_rate": 3.6730562201750864e-06, - "loss": 1.427, + "epoch": 0.4514285714285714, + "grad_norm": 1.0266581773757935, + "learning_rate": 2.9231782901291726e-06, + "loss": 1.4124, "step": 15800 }, { - "epoch": 0.35333333333333333, - "grad_norm": 1.1361236572265625, - "learning_rate": 3.657458845601885e-06, - "loss": 1.45, + "epoch": 0.4542857142857143, + "grad_norm": 1.138675332069397, + "learning_rate": 2.900821679937382e-06, + "loss": 1.4173, "step": 15900 }, { - "epoch": 0.35555555555555557, - "grad_norm": 1.110909342765808, - "learning_rate": 3.6418039127116235e-06, - "loss": 1.4784, + "epoch": 0.45714285714285713, + "grad_norm": 1.1691060066223145, + "learning_rate": 2.8784321208018817e-06, + "loss": 1.4123, "step": 16000 }, { - "epoch": 0.35555555555555557, - "eval_loss": 1.4744384288787842, - "eval_runtime": 102.4825, - "eval_samples_per_second": 133.633, - "eval_steps_per_second": 2.088, + "epoch": 0.45714285714285713, + "eval_loss": 1.4248454570770264, + "eval_runtime": 99.8569, + "eval_samples_per_second": 137.146, + "eval_steps_per_second": 2.143, "step": 16000 }, { - "epoch": 0.35777777777777775, - "grad_norm": 1.0018236637115479, - "learning_rate": 3.626092199995557e-06, - "loss": 1.4492, + "epoch": 0.46, + "grad_norm": 1.1149132251739502, + "learning_rate": 2.8560114532227262e-06, + "loss": 1.4171, "step": 16100 }, { - "epoch": 0.36, - "grad_norm": 1.0787028074264526, - "learning_rate": 3.6103244887684985e-06, - "loss": 1.442, + "epoch": 0.46285714285714286, + "grad_norm": 1.0276226997375488, + "learning_rate": 2.8335615202571927e-06, + "loss": 1.4177, "step": 16200 }, { - "epoch": 0.3622222222222222, - "grad_norm": 1.0067973136901855, - "learning_rate": 3.594501563129966e-06, - "loss": 1.448, + "epoch": 0.4657142857142857, + "grad_norm": 1.0828535556793213, + "learning_rate": 2.811084167368276e-06, + "loss": 1.3762, "step": 16300 }, { - "epoch": 0.36444444444444446, - "grad_norm": 1.009955883026123, - "learning_rate": 3.5786242099251895e-06, - "loss": 1.4541, + "epoch": 0.4685714285714286, + "grad_norm": 1.171616554260254, + "learning_rate": 2.788581242272983e-06, + "loss": 1.3965, "step": 16400 }, { - "epoch": 0.36666666666666664, - "grad_norm": 1.0146301984786987, - "learning_rate": 3.5626932187059807e-06, - "loss": 1.444, + "epoch": 0.4714285714285714, + "grad_norm": 1.0692201852798462, + "learning_rate": 2.7660545947904464e-06, + "loss": 1.4066, "step": 16500 }, { - "epoch": 0.3688888888888889, - "grad_norm": 1.093392252922058, - "learning_rate": 3.5467093816914744e-06, - "loss": 1.4682, + "epoch": 0.4742857142857143, + "grad_norm": 1.1563397645950317, + "learning_rate": 2.7435060766898614e-06, + "loss": 1.4008, "step": 16600 }, { - "epoch": 0.3711111111111111, - "grad_norm": 1.0555778741836548, - "learning_rate": 3.530673493728727e-06, - "loss": 1.4741, + "epoch": 0.47714285714285715, + "grad_norm": 1.1032534837722778, + "learning_rate": 2.7209375415382655e-06, + "loss": 1.3905, "step": 16700 }, { - "epoch": 0.37333333333333335, - "grad_norm": 1.0543639659881592, - "learning_rate": 3.5145863522531974e-06, - "loss": 1.4686, + "epoch": 0.48, + "grad_norm": 1.1357022523880005, + "learning_rate": 2.698350844548168e-06, + "loss": 1.406, "step": 16800 }, { - "epoch": 0.37555555555555553, - "grad_norm": 1.0254998207092285, - "learning_rate": 3.498448757249086e-06, - "loss": 1.4523, + "epoch": 0.4828571428571429, + "grad_norm": 1.0574637651443481, + "learning_rate": 2.6757478424250417e-06, + "loss": 1.4049, "step": 16900 }, { - "epoch": 0.37777777777777777, - "grad_norm": 1.180134892463684, - "learning_rate": 3.482261511209556e-06, - "loss": 1.4353, + "epoch": 0.4857142857142857, + "grad_norm": 1.0180025100708008, + "learning_rate": 2.653130393214702e-06, + "loss": 1.3979, "step": 17000 }, { - "epoch": 0.37777777777777777, - "eval_loss": 1.468135952949524, - "eval_runtime": 102.9245, - "eval_samples_per_second": 133.059, - "eval_steps_per_second": 2.079, + "epoch": 0.4857142857142857, + "eval_loss": 1.4195657968521118, + "eval_runtime": 99.9485, + "eval_samples_per_second": 137.021, + "eval_steps_per_second": 2.141, "step": 17000 }, { - "epoch": 0.38, - "grad_norm": 1.0833312273025513, - "learning_rate": 3.4660254190968247e-06, - "loss": 1.4664, + "epoch": 0.48857142857142855, + "grad_norm": 1.0153673887252808, + "learning_rate": 2.630500356150565e-06, + "loss": 1.4138, "step": 17100 }, { - "epoch": 0.38222222222222224, - "grad_norm": 1.0613282918930054, - "learning_rate": 3.4497412883021375e-06, - "loss": 1.46, + "epoch": 0.49142857142857144, + "grad_norm": 1.0832693576812744, + "learning_rate": 2.6078595915008096e-06, + "loss": 1.3934, "step": 17200 }, { - "epoch": 0.3844444444444444, - "grad_norm": 1.073068380355835, - "learning_rate": 3.4334099286056134e-06, - "loss": 1.4558, + "epoch": 0.4942857142857143, + "grad_norm": 1.1552319526672363, + "learning_rate": 2.585209960415464e-06, + "loss": 1.414, "step": 17300 }, { - "epoch": 0.38666666666666666, - "grad_norm": 1.0136057138442993, - "learning_rate": 3.4170321521359816e-06, - "loss": 1.4775, + "epoch": 0.49714285714285716, + "grad_norm": 1.1260509490966797, + "learning_rate": 2.562553324773404e-06, + "loss": 1.3988, "step": 17400 }, { - "epoch": 0.3888888888888889, - "grad_norm": 1.0413048267364502, - "learning_rate": 3.40060877333019e-06, - "loss": 1.4519, + "epoch": 0.5, + "grad_norm": 1.1187398433685303, + "learning_rate": 2.5398915470293077e-06, + "loss": 1.4048, "step": 17500 }, { - "epoch": 0.39111111111111113, - "grad_norm": 1.090867519378662, - "learning_rate": 3.38414060889291e-06, - "loss": 1.4653, + "epoch": 0.5028571428571429, + "grad_norm": 1.0673401355743408, + "learning_rate": 2.5172264900605497e-06, + "loss": 1.4012, "step": 17600 }, { - "epoch": 0.3933333333333333, - "grad_norm": 1.0485255718231201, - "learning_rate": 3.367628477755917e-06, - "loss": 1.434, + "epoch": 0.5057142857142857, + "grad_norm": 1.098514199256897, + "learning_rate": 2.49456001701407e-06, + "loss": 1.4021, "step": 17700 }, { - "epoch": 0.39555555555555555, - "grad_norm": 1.0650479793548584, - "learning_rate": 3.351073201037374e-06, - "loss": 1.4626, + "epoch": 0.5085714285714286, + "grad_norm": 1.1217247247695923, + "learning_rate": 2.471893991153216e-06, + "loss": 1.4041, "step": 17800 }, { - "epoch": 0.3977777777777778, - "grad_norm": 1.0615607500076294, - "learning_rate": 3.3344756020009934e-06, - "loss": 1.4471, + "epoch": 0.5114285714285715, + "grad_norm": 1.1324173212051392, + "learning_rate": 2.4492302757045705e-06, + "loss": 1.3942, "step": 17900 }, { - "epoch": 0.4, - "grad_norm": 0.9974454045295715, - "learning_rate": 3.3178365060150995e-06, - "loss": 1.4449, + "epoch": 0.5142857142857142, + "grad_norm": 1.1281129121780396, + "learning_rate": 2.426570733704798e-06, + "loss": 1.4046, "step": 18000 }, { - "epoch": 0.4, - "eval_loss": 1.464233636856079, - "eval_runtime": 103.2861, - "eval_samples_per_second": 132.593, - "eval_steps_per_second": 2.072, + "epoch": 0.5142857142857142, + "eval_loss": 1.4171615839004517, + "eval_runtime": 99.7442, + "eval_samples_per_second": 137.301, + "eval_steps_per_second": 2.145, "step": 18000 }, { - "epoch": 0.4022222222222222, - "grad_norm": 1.020620346069336, - "learning_rate": 3.301156740511585e-06, - "loss": 1.4585, + "epoch": 0.5171428571428571, + "grad_norm": 1.084283471107483, + "learning_rate": 2.4039172278474864e-06, + "loss": 1.4183, "step": 18100 }, { - "epoch": 0.40444444444444444, - "grad_norm": 1.100151777267456, - "learning_rate": 3.2844371349447607e-06, - "loss": 1.4553, + "epoch": 0.52, + "grad_norm": 1.0714788436889648, + "learning_rate": 2.381271620330034e-06, + "loss": 1.3793, "step": 18200 }, { - "epoch": 0.4066666666666667, - "grad_norm": 1.0522452592849731, - "learning_rate": 3.267678520750115e-06, - "loss": 1.4571, + "epoch": 0.5228571428571429, + "grad_norm": 1.1440812349319458, + "learning_rate": 2.358635772700567e-06, + "loss": 1.3765, "step": 18300 }, { - "epoch": 0.4088888888888889, - "grad_norm": 1.082814335823059, - "learning_rate": 3.2508817313029604e-06, - "loss": 1.4408, + "epoch": 0.5257142857142857, + "grad_norm": 1.0656503438949585, + "learning_rate": 2.336011545704916e-06, + "loss": 1.4153, "step": 18400 }, { - "epoch": 0.4111111111111111, - "grad_norm": 0.9813875555992126, - "learning_rate": 3.234047601876999e-06, - "loss": 1.45, + "epoch": 0.5285714285714286, + "grad_norm": 1.1328638792037964, + "learning_rate": 2.3134007991336523e-06, + "loss": 1.3873, "step": 18500 }, { - "epoch": 0.41333333333333333, - "grad_norm": 1.1190879344940186, - "learning_rate": 3.2171769696027775e-06, - "loss": 1.4611, + "epoch": 0.5314285714285715, + "grad_norm": 1.0806158781051636, + "learning_rate": 2.290805391669212e-06, + "loss": 1.3774, "step": 18600 }, { - "epoch": 0.41555555555555557, - "grad_norm": 1.1239498853683472, - "learning_rate": 3.200270673426068e-06, - "loss": 1.4492, + "epoch": 0.5342857142857143, + "grad_norm": 1.069150686264038, + "learning_rate": 2.2682271807331003e-06, + "loss": 1.3918, "step": 18700 }, { - "epoch": 0.4177777777777778, - "grad_norm": 1.0896953344345093, - "learning_rate": 3.183329554066138e-06, - "loss": 1.4597, + "epoch": 0.5371428571428571, + "grad_norm": 1.1267215013504028, + "learning_rate": 2.2456680223332103e-06, + "loss": 1.3845, "step": 18800 }, { - "epoch": 0.42, - "grad_norm": 1.0593230724334717, - "learning_rate": 3.1663544539739512e-06, - "loss": 1.4318, + "epoch": 0.54, + "grad_norm": 1.142121434211731, + "learning_rate": 2.2231297709112496e-06, + "loss": 1.4109, "step": 18900 }, { - "epoch": 0.4222222222222222, - "grad_norm": 1.0952677726745605, - "learning_rate": 3.1493462172902713e-06, - "loss": 1.443, + "epoch": 0.5428571428571428, + "grad_norm": 1.0814783573150635, + "learning_rate": 2.2006142791902957e-06, + "loss": 1.4098, "step": 19000 }, { - "epoch": 0.4222222222222222, - "eval_loss": 1.4632492065429688, - "eval_runtime": 102.9259, - "eval_samples_per_second": 133.057, - "eval_steps_per_second": 2.079, + "epoch": 0.5428571428571428, + "eval_loss": 1.416707158088684, + "eval_runtime": 100.0528, + "eval_samples_per_second": 136.878, + "eval_steps_per_second": 2.139, "step": 19000 }, { - "epoch": 0.42444444444444446, - "grad_norm": 1.0409597158432007, - "learning_rate": 3.132305689803684e-06, - "loss": 1.4407, + "epoch": 0.5457142857142857, + "grad_norm": 1.0706247091293335, + "learning_rate": 2.1781233980225035e-06, + "loss": 1.4188, "step": 19100 }, { - "epoch": 0.4266666666666667, - "grad_norm": 1.0839999914169312, - "learning_rate": 3.1152337189085372e-06, - "loss": 1.4398, + "epoch": 0.5485714285714286, + "grad_norm": 1.021273136138916, + "learning_rate": 2.1556589762369518e-06, + "loss": 1.3989, "step": 19200 }, { - "epoch": 0.4288888888888889, - "grad_norm": 1.0945332050323486, - "learning_rate": 3.0981311535628024e-06, - "loss": 1.4478, + "epoch": 0.5514285714285714, + "grad_norm": 1.1904112100601196, + "learning_rate": 2.133222860487667e-06, + "loss": 1.4393, "step": 19300 }, { - "epoch": 0.4311111111111111, - "grad_norm": 1.0296911001205444, - "learning_rate": 3.0809988442458567e-06, - "loss": 1.447, + "epoch": 0.5542857142857143, + "grad_norm": 1.1062791347503662, + "learning_rate": 2.1108168951018186e-06, + "loss": 1.4045, "step": 19400 }, { - "epoch": 0.43333333333333335, - "grad_norm": 1.044171690940857, - "learning_rate": 3.063837642916191e-06, - "loss": 1.4706, + "epoch": 0.5571428571428572, + "grad_norm": 1.1809172630310059, + "learning_rate": 2.088442921928113e-06, + "loss": 1.3958, "step": 19500 }, { - "epoch": 0.43555555555555553, - "grad_norm": 1.0633000135421753, - "learning_rate": 3.0466484029690453e-06, - "loss": 1.4572, + "epoch": 0.56, + "grad_norm": 1.0156745910644531, + "learning_rate": 2.066102780185383e-06, + "loss": 1.398, "step": 19600 }, { - "epoch": 0.43777777777777777, - "grad_norm": 0.9554187655448914, - "learning_rate": 3.0294319791939653e-06, - "loss": 1.4392, + "epoch": 0.5628571428571428, + "grad_norm": 1.1121779680252075, + "learning_rate": 2.0437983063114013e-06, + "loss": 1.4122, "step": 19700 }, { - "epoch": 0.44, - "grad_norm": 1.1261570453643799, - "learning_rate": 3.0121892277323016e-06, - "loss": 1.4508, + "epoch": 0.5657142857142857, + "grad_norm": 1.0523419380187988, + "learning_rate": 2.021531333811914e-06, + "loss": 1.4063, "step": 19800 }, { - "epoch": 0.44222222222222224, - "grad_norm": 1.075859785079956, - "learning_rate": 2.9949210060346323e-06, - "loss": 1.4525, + "epoch": 0.5685714285714286, + "grad_norm": 1.099584698677063, + "learning_rate": 1.9993036931099265e-06, + "loss": 1.409, "step": 19900 }, { - "epoch": 0.4444444444444444, - "grad_norm": 1.121369481086731, - "learning_rate": 2.977628172818122e-06, - "loss": 1.458, + "epoch": 0.5714285714285714, + "grad_norm": 1.1999467611312866, + "learning_rate": 1.9771172113952327e-06, + "loss": 1.4, "step": 20000 }, { - "epoch": 0.4444444444444444, - "eval_loss": 1.461082935333252, - "eval_runtime": 103.0425, - "eval_samples_per_second": 132.906, - "eval_steps_per_second": 2.077, + "epoch": 0.5714285714285714, + "eval_loss": 1.415099024772644, + "eval_runtime": 99.755, + "eval_samples_per_second": 137.286, + "eval_steps_per_second": 2.145, "step": 20000 }, { - "epoch": 0.44666666666666666, - "grad_norm": 1.0296210050582886, - "learning_rate": 2.9603115880238228e-06, - "loss": 1.4306, + "epoch": 0.5742857142857143, + "grad_norm": 1.0494403839111328, + "learning_rate": 1.9549737124742104e-06, + "loss": 1.4095, "step": 20100 }, { - "epoch": 0.4488888888888889, - "grad_norm": 1.0637251138687134, - "learning_rate": 2.9429721127739095e-06, - "loss": 1.4592, + "epoch": 0.5771428571428572, + "grad_norm": 1.1081063747406006, + "learning_rate": 1.9328750166199046e-06, + "loss": 1.3992, "step": 20200 }, { - "epoch": 0.45111111111111113, - "grad_norm": 1.0740017890930176, - "learning_rate": 2.925610609328858e-06, - "loss": 1.4439, + "epoch": 0.58, + "grad_norm": 1.1197865009307861, + "learning_rate": 1.91082294042239e-06, + "loss": 1.3917, "step": 20300 }, { - "epoch": 0.4533333333333333, - "grad_norm": 1.136069893836975, - "learning_rate": 2.9082279410445637e-06, - "loss": 1.4483, + "epoch": 0.5828571428571429, + "grad_norm": 1.140148639678955, + "learning_rate": 1.8888192966394448e-06, + "loss": 1.3907, "step": 20400 }, { - "epoch": 0.45555555555555555, - "grad_norm": 1.0300533771514893, - "learning_rate": 2.890824972329414e-06, - "loss": 1.4462, + "epoch": 0.5857142857142857, + "grad_norm": 1.0425162315368652, + "learning_rate": 1.8668658940475298e-06, + "loss": 1.4006, "step": 20500 }, { - "epoch": 0.4577777777777778, - "grad_norm": 1.0724245309829712, - "learning_rate": 2.8734025686012967e-06, - "loss": 1.4387, + "epoch": 0.5885714285714285, + "grad_norm": 1.1035826206207275, + "learning_rate": 1.8449645372931068e-06, + "loss": 1.4033, "step": 20600 }, { - "epoch": 0.46, - "grad_norm": 1.0145208835601807, - "learning_rate": 2.855961596244571e-06, - "loss": 1.452, + "epoch": 0.5914285714285714, + "grad_norm": 1.1139192581176758, + "learning_rate": 1.823117026744287e-06, + "loss": 1.3964, "step": 20700 }, { - "epoch": 0.4622222222222222, - "grad_norm": 1.0359569787979126, - "learning_rate": 2.8385029225669757e-06, - "loss": 1.4524, + "epoch": 0.5942857142857143, + "grad_norm": 1.1130657196044922, + "learning_rate": 1.8013251583428366e-06, + "loss": 1.3972, "step": 20800 }, { - "epoch": 0.46444444444444444, - "grad_norm": 1.0701048374176025, - "learning_rate": 2.8210274157565078e-06, - "loss": 1.4563, + "epoch": 0.5971428571428572, + "grad_norm": 1.0860106945037842, + "learning_rate": 1.7795907234565385e-06, + "loss": 1.3931, "step": 20900 }, { - "epoch": 0.4666666666666667, - "grad_norm": 1.106961727142334, - "learning_rate": 2.803535944838243e-06, - "loss": 1.4433, + "epoch": 0.6, + "grad_norm": 1.05580472946167, + "learning_rate": 1.7579155087319443e-06, + "loss": 1.3874, "step": 21000 }, { - "epoch": 0.4666666666666667, - "eval_loss": 1.4546458721160889, - "eval_runtime": 103.0443, - "eval_samples_per_second": 132.904, - "eval_steps_per_second": 2.077, + "epoch": 0.6, + "eval_loss": 1.4096276760101318, + "eval_runtime": 99.8984, + "eval_samples_per_second": 137.089, + "eval_steps_per_second": 2.142, "step": 21000 }, { - "epoch": 0.4688888888888889, - "grad_norm": 1.019301414489746, - "learning_rate": 2.786029379631124e-06, - "loss": 1.4205, + "epoch": 0.6028571428571429, + "grad_norm": 1.1223632097244263, + "learning_rate": 1.7363012959475e-06, + "loss": 1.3793, "step": 21100 }, { - "epoch": 0.4711111111111111, - "grad_norm": 1.132878065109253, - "learning_rate": 2.7685085907047038e-06, - "loss": 1.4365, + "epoch": 0.6057142857142858, + "grad_norm": 1.115355372428894, + "learning_rate": 1.7147498618670778e-06, + "loss": 1.4093, "step": 21200 }, { - "epoch": 0.47333333333333333, - "grad_norm": 1.1317287683486938, - "learning_rate": 2.7509744493358574e-06, - "loss": 1.4391, + "epoch": 0.6085714285714285, + "grad_norm": 1.0437370538711548, + "learning_rate": 1.6932629780939225e-06, + "loss": 1.3875, "step": 21300 }, { - "epoch": 0.47555555555555556, - "grad_norm": 1.0661753416061401, - "learning_rate": 2.7334278274654507e-06, - "loss": 1.4392, + "epoch": 0.6114285714285714, + "grad_norm": 1.0260958671569824, + "learning_rate": 1.6718424109250154e-06, + "loss": 1.4035, "step": 21400 }, { - "epoch": 0.4777777777777778, - "grad_norm": 1.0728217363357544, - "learning_rate": 2.7158695976549808e-06, - "loss": 1.4284, + "epoch": 0.6142857142857143, + "grad_norm": 0.9281340837478638, + "learning_rate": 1.6504899212058837e-06, + "loss": 1.3853, "step": 21500 }, { - "epoch": 0.48, - "grad_norm": 1.138945460319519, - "learning_rate": 2.6983006330431904e-06, - "loss": 1.4318, + "epoch": 0.6171428571428571, + "grad_norm": 1.1064680814743042, + "learning_rate": 1.6292072641858478e-06, + "loss": 1.4016, "step": 21600 }, { - "epoch": 0.4822222222222222, - "grad_norm": 1.0607870817184448, - "learning_rate": 2.6807218073026394e-06, - "loss": 1.4507, + "epoch": 0.62, + "grad_norm": 1.0704963207244873, + "learning_rate": 1.6079961893737384e-06, + "loss": 1.3848, "step": 21700 }, { - "epoch": 0.48444444444444446, - "grad_norm": 1.1019628047943115, - "learning_rate": 2.6631339945962682e-06, - "loss": 1.4508, + "epoch": 0.6228571428571429, + "grad_norm": 1.0652328729629517, + "learning_rate": 1.5868584403940768e-06, + "loss": 1.3749, "step": 21800 }, { - "epoch": 0.4866666666666667, - "grad_norm": 1.104874849319458, - "learning_rate": 2.6455380695339194e-06, - "loss": 1.434, + "epoch": 0.6257142857142857, + "grad_norm": 1.0742926597595215, + "learning_rate": 1.5657957548437447e-06, + "loss": 1.404, "step": 21900 }, { - "epoch": 0.4888888888888889, - "grad_norm": 1.1439390182495117, - "learning_rate": 2.6279349071288506e-06, - "loss": 1.4434, + "epoch": 0.6285714285714286, + "grad_norm": 1.0579770803451538, + "learning_rate": 1.5448098641491487e-06, + "loss": 1.4036, "step": 22000 }, { - "epoch": 0.4888888888888889, - "eval_loss": 1.4595842361450195, - "eval_runtime": 103.7757, - "eval_samples_per_second": 131.967, - "eval_steps_per_second": 2.062, + "epoch": 0.6285714285714286, + "eval_loss": 1.4158315658569336, + "eval_runtime": 100.3868, + "eval_samples_per_second": 136.422, + "eval_steps_per_second": 2.132, "step": 22000 }, { - "epoch": 0.4911111111111111, - "grad_norm": 1.002807378768921, - "learning_rate": 2.610325382754217e-06, - "loss": 1.4423, + "epoch": 0.6314285714285715, + "grad_norm": 1.101526141166687, + "learning_rate": 1.5239024934238874e-06, + "loss": 1.4188, "step": 22100 }, { - "epoch": 0.49333333333333335, - "grad_norm": 1.157097578048706, - "learning_rate": 2.5927103720995443e-06, - "loss": 1.4464, + "epoch": 0.6342857142857142, + "grad_norm": 1.0752556324005127, + "learning_rate": 1.5030753613269455e-06, + "loss": 1.3847, "step": 22200 }, { - "epoch": 0.4955555555555556, - "grad_norm": 1.0830250978469849, - "learning_rate": 2.5750907511271804e-06, - "loss": 1.4259, + "epoch": 0.6371428571428571, + "grad_norm": 1.0786316394805908, + "learning_rate": 1.4823301799214101e-06, + "loss": 1.3867, "step": 22300 }, { - "epoch": 0.49777777777777776, - "grad_norm": 1.1380789279937744, - "learning_rate": 2.5574673960287377e-06, - "loss": 1.453, + "epoch": 0.64, + "grad_norm": 1.0339590311050415, + "learning_rate": 1.4616686545337374e-06, + "loss": 1.3662, "step": 22400 }, { - "epoch": 0.5, - "grad_norm": 0.9866058826446533, - "learning_rate": 2.5398411831815173e-06, - "loss": 1.4254, + "epoch": 0.6428571428571429, + "grad_norm": 1.0901203155517578, + "learning_rate": 1.4410924836135625e-06, + "loss": 1.3836, "step": 22500 }, { - "epoch": 0.5022222222222222, - "grad_norm": 1.0832710266113281, - "learning_rate": 2.522212989104932e-06, - "loss": 1.4408, + "epoch": 0.6457142857142857, + "grad_norm": 1.0557289123535156, + "learning_rate": 1.4206033585940895e-06, + "loss": 1.375, "step": 22600 }, { - "epoch": 0.5044444444444445, - "grad_norm": 1.0661131143569946, - "learning_rate": 2.5045836904169164e-06, - "loss": 1.4567, + "epoch": 0.6485714285714286, + "grad_norm": 1.049706220626831, + "learning_rate": 1.40020296375304e-06, + "loss": 1.3977, "step": 22700 }, { - "epoch": 0.5066666666666667, - "grad_norm": 1.0910335779190063, - "learning_rate": 2.4869541637903384e-06, - "loss": 1.4464, + "epoch": 0.6514285714285715, + "grad_norm": 1.170900821685791, + "learning_rate": 1.379892976074209e-06, + "loss": 1.384, "step": 22800 }, { - "epoch": 0.5088888888888888, - "grad_norm": 1.051774501800537, - "learning_rate": 2.4693252859093982e-06, - "loss": 1.4341, + "epoch": 0.6542857142857142, + "grad_norm": 1.10288667678833, + "learning_rate": 1.3596750651096047e-06, + "loss": 1.4045, "step": 22900 }, { - "epoch": 0.5111111111111111, - "grad_norm": 1.0804002285003662, - "learning_rate": 2.4516979334260345e-06, - "loss": 1.4552, + "epoch": 0.6571428571428571, + "grad_norm": 1.0909626483917236, + "learning_rate": 1.3395508928422074e-06, + "loss": 1.4018, "step": 23000 }, { - "epoch": 0.5111111111111111, - "eval_loss": 1.458953619003296, - "eval_runtime": 103.1044, - "eval_samples_per_second": 132.827, - "eval_steps_per_second": 2.076, + "epoch": 0.6571428571428571, + "eval_loss": 1.4156948328018188, + "eval_runtime": 100.678, + "eval_samples_per_second": 136.028, + "eval_steps_per_second": 2.126, "step": 23000 }, { - "epoch": 0.5133333333333333, - "grad_norm": 1.0643597841262817, - "learning_rate": 2.4340729829163335e-06, - "loss": 1.4319, + "epoch": 0.66, + "grad_norm": 1.1391985416412354, + "learning_rate": 1.3195221135493503e-06, + "loss": 1.372, "step": 23100 }, { - "epoch": 0.5155555555555555, - "grad_norm": 1.0562705993652344, - "learning_rate": 2.416451310836935e-06, - "loss": 1.4293, + "epoch": 0.6628571428571428, + "grad_norm": 1.124377965927124, + "learning_rate": 1.2995903736667267e-06, + "loss": 1.3998, "step": 23200 }, { - "epoch": 0.5177777777777778, - "grad_norm": 1.038696050643921, - "learning_rate": 2.3988337934814466e-06, - "loss": 1.4542, + "epoch": 0.6657142857142857, + "grad_norm": 1.1056832075119019, + "learning_rate": 1.279757311653056e-06, + "loss": 1.3677, "step": 23300 }, { - "epoch": 0.52, - "grad_norm": 1.053084373474121, - "learning_rate": 2.38122130693687e-06, - "loss": 1.4673, + "epoch": 0.6685714285714286, + "grad_norm": 1.0959793329238892, + "learning_rate": 1.2600245578553866e-06, + "loss": 1.3801, "step": 23400 }, { - "epoch": 0.5222222222222223, - "grad_norm": 1.0920089483261108, - "learning_rate": 2.363614727040034e-06, - "loss": 1.4402, + "epoch": 0.6714285714285714, + "grad_norm": 1.0466543436050415, + "learning_rate": 1.240393734375086e-06, + "loss": 1.3866, "step": 23500 }, { - "epoch": 0.5244444444444445, - "grad_norm": 1.0549384355545044, - "learning_rate": 2.346014929334041e-06, - "loss": 1.4415, + "epoch": 0.6742857142857143, + "grad_norm": 1.0811994075775146, + "learning_rate": 1.2208664549344884e-06, + "loss": 1.3885, "step": 23600 }, { - "epoch": 0.5266666666666666, - "grad_norm": 1.1243116855621338, - "learning_rate": 2.328422789024727e-06, - "loss": 1.4323, + "epoch": 0.6771428571428572, + "grad_norm": 1.1533517837524414, + "learning_rate": 1.2014443247442498e-06, + "loss": 1.3684, "step": 23700 }, { - "epoch": 0.5288888888888889, - "grad_norm": 1.0348533391952515, - "learning_rate": 2.3108391809371384e-06, - "loss": 1.4137, + "epoch": 0.68, + "grad_norm": 1.0400276184082031, + "learning_rate": 1.1821289403713865e-06, + "loss": 1.3733, "step": 23800 }, { - "epoch": 0.5311111111111111, - "grad_norm": 1.0663659572601318, - "learning_rate": 2.2932649794720327e-06, - "loss": 1.4574, + "epoch": 0.6828571428571428, + "grad_norm": 1.0742322206497192, + "learning_rate": 1.1629218896080382e-06, + "loss": 1.3884, "step": 23900 }, { - "epoch": 0.5333333333333333, - "grad_norm": 1.1175079345703125, - "learning_rate": 2.275701058562391e-06, - "loss": 1.4467, + "epoch": 0.6857142857142857, + "grad_norm": 1.0781975984573364, + "learning_rate": 1.1438247513409423e-06, + "loss": 1.3611, "step": 24000 }, { - "epoch": 0.5333333333333333, - "eval_loss": 1.4563220739364624, - "eval_runtime": 103.6075, - "eval_samples_per_second": 132.182, - "eval_steps_per_second": 2.065, + "epoch": 0.6857142857142857, + "eval_loss": 1.4142818450927734, + "eval_runtime": 100.8413, + "eval_samples_per_second": 135.807, + "eval_steps_per_second": 2.122, "step": 24000 }, { - "epoch": 0.5355555555555556, - "grad_norm": 1.1649941205978394, - "learning_rate": 2.2581482916299596e-06, - "loss": 1.4493, + "epoch": 0.6885714285714286, + "grad_norm": 1.1411370038986206, + "learning_rate": 1.1248390954216437e-06, + "loss": 1.3838, "step": 24100 }, { - "epoch": 0.5377777777777778, - "grad_norm": 1.0690239667892456, - "learning_rate": 2.240607551541823e-06, - "loss": 1.4552, + "epoch": 0.6914285714285714, + "grad_norm": 1.0584548711776733, + "learning_rate": 1.1059664825374511e-06, + "loss": 1.3482, "step": 24200 }, { - "epoch": 0.54, - "grad_norm": 1.074440836906433, - "learning_rate": 2.2230797105669876e-06, - "loss": 1.4635, + "epoch": 0.6942857142857143, + "grad_norm": 1.096170425415039, + "learning_rate": 1.0872084640831356e-06, + "loss": 1.3704, "step": 24300 }, { - "epoch": 0.5422222222222223, - "grad_norm": 1.0503307580947876, - "learning_rate": 2.205565640333014e-06, - "loss": 1.424, + "epoch": 0.6971428571428572, + "grad_norm": 1.0241279602050781, + "learning_rate": 1.068566582033411e-06, + "loss": 1.3735, "step": 24400 }, { - "epoch": 0.5444444444444444, - "grad_norm": 1.0343286991119385, - "learning_rate": 2.188066211782666e-06, - "loss": 1.448, + "epoch": 0.7, + "grad_norm": 1.0666210651397705, + "learning_rate": 1.050042368816168e-06, + "loss": 1.3893, "step": 24500 }, { - "epoch": 0.5466666666666666, - "grad_norm": 1.054713249206543, - "learning_rate": 2.1705822951306043e-06, - "loss": 1.4259, + "epoch": 0.7028571428571428, + "grad_norm": 1.0070935487747192, + "learning_rate": 1.0316373471865108e-06, + "loss": 1.3807, "step": 24600 }, { - "epoch": 0.5488888888888889, - "grad_norm": 1.0660899877548218, - "learning_rate": 2.153114759820113e-06, - "loss": 1.4519, + "epoch": 0.7057142857142857, + "grad_norm": 1.0485628843307495, + "learning_rate": 1.013353030101576e-06, + "loss": 1.3817, "step": 24700 }, { - "epoch": 0.5511111111111111, - "grad_norm": 1.0503852367401123, - "learning_rate": 2.1356644744798592e-06, - "loss": 1.4447, + "epoch": 0.7085714285714285, + "grad_norm": 0.9520274996757507, + "learning_rate": 9.951909205961665e-07, + "loss": 1.3201, "step": 24800 }, { - "epoch": 0.5533333333333333, - "grad_norm": 1.0827716588974, - "learning_rate": 2.1182323068807016e-06, - "loss": 1.4503, + "epoch": 0.7114285714285714, + "grad_norm": 1.0479100942611694, + "learning_rate": 9.77152511659194e-07, + "loss": 1.2627, "step": 24900 }, { - "epoch": 0.5555555555555556, - "grad_norm": 1.128435730934143, - "learning_rate": 2.1008191238925376e-06, - "loss": 1.4327, + "epoch": 0.7142857142857143, + "grad_norm": 1.0204826593399048, + "learning_rate": 9.59239286110952e-07, + "loss": 1.2352, "step": 25000 }, { - "epoch": 0.5555555555555556, - "eval_loss": 1.449695348739624, - "eval_runtime": 102.9301, - "eval_samples_per_second": 133.051, - "eval_steps_per_second": 2.079, + "epoch": 0.7142857142857143, + "eval_loss": 1.4112086296081543, + "eval_runtime": 101.0868, + "eval_samples_per_second": 135.478, + "eval_steps_per_second": 2.117, "step": 25000 - }, - { - "epoch": 0.5577777777777778, - "grad_norm": 1.0508419275283813, - "learning_rate": 2.083425791441193e-06, - "loss": 1.4257, - "step": 25100 - }, - { - "epoch": 0.56, - "grad_norm": 1.131657600402832, - "learning_rate": 2.066053174465362e-06, - "loss": 1.4478, - "step": 25200 - }, - { - "epoch": 0.5622222222222222, - "grad_norm": 1.082889199256897, - "learning_rate": 2.0487021368736002e-06, - "loss": 1.4415, - "step": 25300 - }, - { - "epoch": 0.5644444444444444, - "grad_norm": 1.1404916048049927, - "learning_rate": 2.0313735415013548e-06, - "loss": 1.4375, - "step": 25400 - }, - { - "epoch": 0.5666666666666667, - "grad_norm": 1.0749248266220093, - "learning_rate": 2.0140682500680656e-06, - "loss": 1.4283, - "step": 25500 - }, - { - "epoch": 0.5688888888888889, - "grad_norm": 1.1225584745407104, - "learning_rate": 1.9967871231343077e-06, - "loss": 1.4431, - "step": 25600 - }, - { - "epoch": 0.5711111111111111, - "grad_norm": 1.1027264595031738, - "learning_rate": 1.9795310200590015e-06, - "loss": 1.4287, - "step": 25700 - }, - { - "epoch": 0.5733333333333334, - "grad_norm": 1.0218228101730347, - "learning_rate": 1.962300798956676e-06, - "loss": 1.4234, - "step": 25800 - }, - { - "epoch": 0.5755555555555556, - "grad_norm": 1.0777612924575806, - "learning_rate": 1.9450973166547965e-06, - "loss": 1.4519, - "step": 25900 - }, - { - "epoch": 0.5777777777777777, - "grad_norm": 1.067936658859253, - "learning_rate": 1.9279214286511566e-06, - "loss": 1.4201, - "step": 26000 - }, - { - "epoch": 0.5777777777777777, - "eval_loss": 1.45020592212677, - "eval_runtime": 103.4404, - "eval_samples_per_second": 132.395, - "eval_steps_per_second": 2.069, - "step": 26000 - }, - { - "epoch": 0.58, - "grad_norm": 1.053848147392273, - "learning_rate": 1.910773989071337e-06, - "loss": 1.4288, - "step": 26100 - }, - { - "epoch": 0.5822222222222222, - "grad_norm": 1.0881634950637817, - "learning_rate": 1.8936558506262287e-06, - "loss": 1.4404, - "step": 26200 - }, - { - "epoch": 0.5844444444444444, - "grad_norm": 1.0691543817520142, - "learning_rate": 1.87656786456963e-06, - "loss": 1.4286, - "step": 26300 - }, - { - "epoch": 0.5866666666666667, - "grad_norm": 1.0484671592712402, - "learning_rate": 1.8595108806559193e-06, - "loss": 1.4273, - "step": 26400 - }, - { - "epoch": 0.5888888888888889, - "grad_norm": 1.0702567100524902, - "learning_rate": 1.8424857470977914e-06, - "loss": 1.427, - "step": 26500 - }, - { - "epoch": 0.5911111111111111, - "grad_norm": 1.0914722681045532, - "learning_rate": 1.8254933105240832e-06, - "loss": 1.4324, - "step": 26600 - }, - { - "epoch": 0.5933333333333334, - "grad_norm": 1.0758668184280396, - "learning_rate": 1.8085344159376694e-06, - "loss": 1.4302, - "step": 26700 - }, - { - "epoch": 0.5955555555555555, - "grad_norm": 1.0367869138717651, - "learning_rate": 1.791609906673442e-06, - "loss": 1.4322, - "step": 26800 - }, - { - "epoch": 0.5977777777777777, - "grad_norm": 1.142140507698059, - "learning_rate": 1.7747206243563758e-06, - "loss": 1.4228, - "step": 26900 - }, - { - "epoch": 0.6, - "grad_norm": 1.112724781036377, - "learning_rate": 1.757867408859672e-06, - "loss": 1.436, - "step": 27000 - }, - { - "epoch": 0.6, - "eval_loss": 1.4449330568313599, - "eval_runtime": 103.9341, - "eval_samples_per_second": 131.766, - "eval_steps_per_second": 2.059, - "step": 27000 - }, - { - "epoch": 0.6022222222222222, - "grad_norm": 1.027860403060913, - "learning_rate": 1.7410510982629953e-06, - "loss": 1.4382, - "step": 27100 - }, - { - "epoch": 0.6044444444444445, - "grad_norm": 1.035631775856018, - "learning_rate": 1.7242725288107959e-06, - "loss": 1.4201, - "step": 27200 - }, - { - "epoch": 0.6066666666666667, - "grad_norm": 1.1015000343322754, - "learning_rate": 1.7075325348707283e-06, - "loss": 1.4371, - "step": 27300 - }, - { - "epoch": 0.6088888888888889, - "grad_norm": 1.0760738849639893, - "learning_rate": 1.6908319488921538e-06, - "loss": 1.4333, - "step": 27400 - }, - { - "epoch": 0.6111111111111112, - "grad_norm": 1.0578911304473877, - "learning_rate": 1.6741716013647512e-06, - "loss": 1.4551, - "step": 27500 - }, - { - "epoch": 0.6133333333333333, - "grad_norm": 1.0643107891082764, - "learning_rate": 1.657552320777212e-06, - "loss": 1.4336, - "step": 27600 - }, - { - "epoch": 0.6155555555555555, - "grad_norm": 1.124281406402588, - "learning_rate": 1.6409749335760447e-06, - "loss": 1.4425, - "step": 27700 - }, - { - "epoch": 0.6177777777777778, - "grad_norm": 1.1095889806747437, - "learning_rate": 1.6244402641244755e-06, - "loss": 1.4472, - "step": 27800 - }, - { - "epoch": 0.62, - "grad_norm": 1.05001962184906, - "learning_rate": 1.6079491346614557e-06, - "loss": 1.4329, - "step": 27900 - }, - { - "epoch": 0.6222222222222222, - "grad_norm": 1.0860042572021484, - "learning_rate": 1.5915023652607709e-06, - "loss": 1.4324, - "step": 28000 - }, - { - "epoch": 0.6222222222222222, - "eval_loss": 1.4518882036209106, - "eval_runtime": 104.2615, - "eval_samples_per_second": 131.352, - "eval_steps_per_second": 2.053, - "step": 28000 - }, - { - "epoch": 0.6244444444444445, - "grad_norm": 1.1069949865341187, - "learning_rate": 1.5751007737902634e-06, - "loss": 1.4316, - "step": 28100 - }, - { - "epoch": 0.6266666666666667, - "grad_norm": 1.0824823379516602, - "learning_rate": 1.5587451758711589e-06, - "loss": 1.4155, - "step": 28200 - }, - { - "epoch": 0.6288888888888889, - "grad_norm": 1.1121022701263428, - "learning_rate": 1.542436384837507e-06, - "loss": 1.453, - "step": 28300 - }, - { - "epoch": 0.6311111111111111, - "grad_norm": 1.0836877822875977, - "learning_rate": 1.526175211695736e-06, - "loss": 1.43, - "step": 28400 - }, - { - "epoch": 0.6333333333333333, - "grad_norm": 1.1165013313293457, - "learning_rate": 1.5099624650843264e-06, - "loss": 1.4568, - "step": 28500 - }, - { - "epoch": 0.6355555555555555, - "grad_norm": 1.1114059686660767, - "learning_rate": 1.4937989512335923e-06, - "loss": 1.4432, - "step": 28600 - }, - { - "epoch": 0.6377777777777778, - "grad_norm": 1.0963460206985474, - "learning_rate": 1.4776854739255933e-06, - "loss": 1.4282, - "step": 28700 - }, - { - "epoch": 0.64, - "grad_norm": 1.0809634923934937, - "learning_rate": 1.4616228344541627e-06, - "loss": 1.4282, - "step": 28800 - }, - { - "epoch": 0.6422222222222222, - "grad_norm": 1.084708333015442, - "learning_rate": 1.4456118315850618e-06, - "loss": 1.4525, - "step": 28900 - }, - { - "epoch": 0.6444444444444445, - "grad_norm": 1.0788947343826294, - "learning_rate": 1.4296532615162588e-06, - "loss": 1.432, - "step": 29000 - }, - { - "epoch": 0.6444444444444445, - "eval_loss": 1.4509940147399902, - "eval_runtime": 104.7265, - "eval_samples_per_second": 130.769, - "eval_steps_per_second": 2.043, - "step": 29000 - }, - { - "epoch": 0.6466666666666666, - "grad_norm": 1.082764744758606, - "learning_rate": 1.4137479178383307e-06, - "loss": 1.4224, - "step": 29100 - }, - { - "epoch": 0.6488888888888888, - "grad_norm": 1.0896496772766113, - "learning_rate": 1.3978965914950065e-06, - "loss": 1.4193, - "step": 29200 - }, - { - "epoch": 0.6511111111111111, - "grad_norm": 1.1591044664382935, - "learning_rate": 1.3821000707438325e-06, - "loss": 1.4246, - "step": 29300 - }, - { - "epoch": 0.6533333333333333, - "grad_norm": 1.067596673965454, - "learning_rate": 1.366359141116968e-06, - "loss": 1.4308, - "step": 29400 - }, - { - "epoch": 0.6555555555555556, - "grad_norm": 1.0923844575881958, - "learning_rate": 1.3506745853821318e-06, - "loss": 1.4376, - "step": 29500 - }, - { - "epoch": 0.6577777777777778, - "grad_norm": 0.9823111891746521, - "learning_rate": 1.3350471835036702e-06, - "loss": 1.4265, - "step": 29600 - }, - { - "epoch": 0.66, - "grad_norm": 1.122013807296753, - "learning_rate": 1.3194777126037726e-06, - "loss": 1.4407, - "step": 29700 - }, - { - "epoch": 0.6622222222222223, - "grad_norm": 1.2428700923919678, - "learning_rate": 1.303966946923827e-06, - "loss": 1.4516, - "step": 29800 - }, - { - "epoch": 0.6644444444444444, - "grad_norm": 1.0549967288970947, - "learning_rate": 1.2885156577859159e-06, - "loss": 1.4232, - "step": 29900 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 1.0705960988998413, - "learning_rate": 1.273124613554464e-06, - "loss": 1.4486, - "step": 30000 - }, - { - "epoch": 0.6666666666666666, - "eval_loss": 1.4483963251113892, - "eval_runtime": 103.4915, - "eval_samples_per_second": 132.33, - "eval_steps_per_second": 2.068, - "step": 30000 - }, - { - "epoch": 0.6688888888888889, - "grad_norm": 1.1372791528701782, - "learning_rate": 1.2577945795980268e-06, - "loss": 1.4192, - "step": 30100 - }, - { - "epoch": 0.6711111111111111, - "grad_norm": 1.0534753799438477, - "learning_rate": 1.2425263182512276e-06, - "loss": 1.4325, - "step": 30200 - }, - { - "epoch": 0.6733333333333333, - "grad_norm": 1.0935174226760864, - "learning_rate": 1.2273205887768544e-06, - "loss": 1.4232, - "step": 30300 - }, - { - "epoch": 0.6755555555555556, - "grad_norm": 1.0945768356323242, - "learning_rate": 1.2121781473280947e-06, - "loss": 1.4281, - "step": 30400 - }, - { - "epoch": 0.6777777777777778, - "grad_norm": 1.0865614414215088, - "learning_rate": 1.1970997469109412e-06, - "loss": 1.4343, - "step": 30500 - }, - { - "epoch": 0.68, - "grad_norm": 1.0924773216247559, - "learning_rate": 1.1820861373467426e-06, - "loss": 1.4389, - "step": 30600 - }, - { - "epoch": 0.6822222222222222, - "grad_norm": 1.1250981092453003, - "learning_rate": 1.1671380652349156e-06, - "loss": 1.4456, - "step": 30700 - }, - { - "epoch": 0.6844444444444444, - "grad_norm": 1.097658634185791, - "learning_rate": 1.1522562739158195e-06, - "loss": 1.426, - "step": 30800 - }, - { - "epoch": 0.6866666666666666, - "grad_norm": 1.1177273988723755, - "learning_rate": 1.1374415034337912e-06, - "loss": 1.4314, - "step": 30900 - }, - { - "epoch": 0.6888888888888889, - "grad_norm": 1.0766289234161377, - "learning_rate": 1.122694490500342e-06, - "loss": 1.4218, - "step": 31000 - }, - { - "epoch": 0.6888888888888889, - "eval_loss": 1.4482488632202148, - "eval_runtime": 104.3689, - "eval_samples_per_second": 131.217, - "eval_steps_per_second": 2.05, - "step": 31000 - }, - { - "epoch": 0.6911111111111111, - "grad_norm": 1.099563717842102, - "learning_rate": 1.1080159684575258e-06, - "loss": 1.4393, - "step": 31100 - }, - { - "epoch": 0.6933333333333334, - "grad_norm": 1.13261878490448, - "learning_rate": 1.0934066672414676e-06, - "loss": 1.4581, - "step": 31200 - }, - { - "epoch": 0.6955555555555556, - "grad_norm": 1.0697300434112549, - "learning_rate": 1.078867313346068e-06, - "loss": 1.4266, - "step": 31300 - }, - { - "epoch": 0.6977777777777778, - "grad_norm": 1.0712445974349976, - "learning_rate": 1.0643986297868766e-06, - "loss": 1.4267, - "step": 31400 - }, - { - "epoch": 0.7, - "grad_norm": 1.0139946937561035, - "learning_rate": 1.0500013360651318e-06, - "loss": 1.4228, - "step": 31500 - }, - { - "epoch": 0.7022222222222222, - "grad_norm": 1.0279037952423096, - "learning_rate": 1.0356761481319897e-06, - "loss": 1.4385, - "step": 31600 - }, - { - "epoch": 0.7044444444444444, - "grad_norm": 1.133392572402954, - "learning_rate": 1.021423778352915e-06, - "loss": 1.4099, - "step": 31700 - }, - { - "epoch": 0.7066666666666667, - "grad_norm": 1.0647848844528198, - "learning_rate": 1.0072449354722591e-06, - "loss": 1.4229, - "step": 31800 - }, - { - "epoch": 0.7088888888888889, - "grad_norm": 1.0599104166030884, - "learning_rate": 9.931403245780157e-07, - "loss": 1.4366, - "step": 31900 - }, - { - "epoch": 0.7111111111111111, - "grad_norm": 1.0552103519439697, - "learning_rate": 9.791106470667545e-07, - "loss": 1.4239, - "step": 32000 - }, - { - "epoch": 0.7111111111111111, - "eval_loss": 1.4449114799499512, - "eval_runtime": 104.4483, - "eval_samples_per_second": 131.118, - "eval_steps_per_second": 2.049, - "step": 32000 - }, - { - "epoch": 0.7133333333333334, - "grad_norm": 1.0780894756317139, - "learning_rate": 9.651566006087473e-07, - "loss": 1.4223, - "step": 32100 - }, - { - "epoch": 0.7155555555555555, - "grad_norm": 1.077383041381836, - "learning_rate": 9.512788791132715e-07, - "loss": 1.437, - "step": 32200 - }, - { - "epoch": 0.7177777777777777, - "grad_norm": 1.125698208808899, - "learning_rate": 9.37478172694101e-07, - "loss": 1.4324, - "step": 32300 - }, - { - "epoch": 0.72, - "grad_norm": 1.0730408430099487, - "learning_rate": 9.237551676351927e-07, - "loss": 1.4393, - "step": 32400 - }, - { - "epoch": 0.7222222222222222, - "grad_norm": 1.0632646083831787, - "learning_rate": 9.101105463565572e-07, - "loss": 1.446, - "step": 32500 - }, - { - "epoch": 0.7244444444444444, - "grad_norm": 1.049178957939148, - "learning_rate": 8.965449873803192e-07, - "loss": 1.4301, - "step": 32600 - }, - { - "epoch": 0.7266666666666667, - "grad_norm": 1.1717880964279175, - "learning_rate": 8.830591652969827e-07, - "loss": 1.4345, - "step": 32700 - }, - { - "epoch": 0.7288888888888889, - "grad_norm": 1.0648304224014282, - "learning_rate": 8.696537507318803e-07, - "loss": 1.4278, - "step": 32800 - }, - { - "epoch": 0.7311111111111112, - "grad_norm": 1.0418579578399658, - "learning_rate": 8.563294103118252e-07, - "loss": 1.4365, - "step": 32900 - }, - { - "epoch": 0.7333333333333333, - "grad_norm": 1.035050630569458, - "learning_rate": 8.430868066319625e-07, - "loss": 1.41, - "step": 33000 - }, - { - "epoch": 0.7333333333333333, - "eval_loss": 1.446353554725647, - "eval_runtime": 105.0303, - "eval_samples_per_second": 130.391, - "eval_steps_per_second": 2.038, - "step": 33000 - }, - { - "epoch": 0.7355555555555555, - "grad_norm": 1.071336030960083, - "learning_rate": 8.299265982228155e-07, - "loss": 1.439, - "step": 33100 - }, - { - "epoch": 0.7377777777777778, - "grad_norm": 1.0849003791809082, - "learning_rate": 8.168494395175436e-07, - "loss": 1.4172, - "step": 33200 - }, - { - "epoch": 0.74, - "grad_norm": 1.086184024810791, - "learning_rate": 8.038559808193958e-07, - "loss": 1.4232, - "step": 33300 - }, - { - "epoch": 0.7422222222222222, - "grad_norm": 1.0608620643615723, - "learning_rate": 7.909468682693699e-07, - "loss": 1.4154, - "step": 33400 - }, - { - "epoch": 0.7444444444444445, - "grad_norm": 1.0586633682250977, - "learning_rate": 7.781227438140868e-07, - "loss": 1.4354, - "step": 33500 - }, - { - "epoch": 0.7466666666666667, - "grad_norm": 1.0531774759292603, - "learning_rate": 7.65384245173863e-07, - "loss": 1.4394, - "step": 33600 - }, - { - "epoch": 0.7488888888888889, - "grad_norm": 1.134748935699463, - "learning_rate": 7.527320058109985e-07, - "loss": 1.4063, - "step": 33700 - }, - { - "epoch": 0.7511111111111111, - "grad_norm": 1.0640100240707397, - "learning_rate": 7.401666548982786e-07, - "loss": 1.4378, - "step": 33800 - }, - { - "epoch": 0.7533333333333333, - "grad_norm": 1.1123132705688477, - "learning_rate": 7.276888172876831e-07, - "loss": 1.4395, - "step": 33900 - }, - { - "epoch": 0.7555555555555555, - "grad_norm": 1.028074026107788, - "learning_rate": 7.152991134793158e-07, - "loss": 1.4091, - "step": 34000 - }, - { - "epoch": 0.7555555555555555, - "eval_loss": 1.4453147649765015, - "eval_runtime": 104.0869, - "eval_samples_per_second": 131.573, - "eval_steps_per_second": 2.056, - "step": 34000 - }, - { - "epoch": 0.7577777777777778, - "grad_norm": 1.0832644701004028, - "learning_rate": 7.029981595905477e-07, - "loss": 1.4245, - "step": 34100 - }, - { - "epoch": 0.76, - "grad_norm": 1.0776340961456299, - "learning_rate": 6.907865673253758e-07, - "loss": 1.4116, - "step": 34200 - }, - { - "epoch": 0.7622222222222222, - "grad_norm": 1.1135038137435913, - "learning_rate": 6.786649439440091e-07, - "loss": 1.4335, - "step": 34300 - }, - { - "epoch": 0.7644444444444445, - "grad_norm": 1.097848892211914, - "learning_rate": 6.666338922326676e-07, - "loss": 1.4422, - "step": 34400 - }, - { - "epoch": 0.7666666666666667, - "grad_norm": 1.1417701244354248, - "learning_rate": 6.546940104736058e-07, - "loss": 1.4445, - "step": 34500 - }, - { - "epoch": 0.7688888888888888, - "grad_norm": 1.0211708545684814, - "learning_rate": 6.428458924153644e-07, - "loss": 1.4341, - "step": 34600 - }, - { - "epoch": 0.7711111111111111, - "grad_norm": 1.049847960472107, - "learning_rate": 6.310901272432433e-07, - "loss": 1.4531, - "step": 34700 - }, - { - "epoch": 0.7733333333333333, - "grad_norm": 1.156039834022522, - "learning_rate": 6.194272995499986e-07, - "loss": 1.4254, - "step": 34800 - }, - { - "epoch": 0.7755555555555556, - "grad_norm": 1.1008870601654053, - "learning_rate": 6.078579893067787e-07, - "loss": 1.4354, - "step": 34900 - }, - { - "epoch": 0.7777777777777778, - "grad_norm": 1.064918041229248, - "learning_rate": 5.96382771834278e-07, - "loss": 1.4495, - "step": 35000 - }, - { - "epoch": 0.7777777777777778, - "eval_loss": 1.450618863105774, - "eval_runtime": 104.1774, - "eval_samples_per_second": 131.458, - "eval_steps_per_second": 2.054, - "step": 35000 } ], "logging_steps": 100, - "max_steps": 45000, + "max_steps": 35000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, @@ -2764,7 +1984,7 @@ "attributes": {} } }, - "total_flos": 4.73851590868992e+18, + "total_flos": 3.3846542204928e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null