| { |
| "best_global_step": 89000, |
| "best_metric": 3.533585786819458, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_0.7_resemble_to_push_5039/checkpoint-30000", |
| "epoch": 31.778425655976676, |
| "eval_steps": 1000, |
| "global_step": 109000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014577259475218658, |
| "grad_norm": 0.7874077558517456, |
| "learning_rate": 0.000294, |
| "loss": 8.5058, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029154518950437316, |
| "grad_norm": 0.595557451248169, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7716, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.043731778425655975, |
| "grad_norm": 0.5917447209358215, |
| "learning_rate": 0.000599828471411902, |
| "loss": 6.36, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05830903790087463, |
| "grad_norm": 0.6458065509796143, |
| "learning_rate": 0.0005996534422403733, |
| "loss": 6.1614, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0728862973760933, |
| "grad_norm": 0.5044608116149902, |
| "learning_rate": 0.0005994784130688448, |
| "loss": 6.0021, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08746355685131195, |
| "grad_norm": 0.45285528898239136, |
| "learning_rate": 0.0005993033838973162, |
| "loss": 5.8768, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10204081632653061, |
| "grad_norm": 0.4967013895511627, |
| "learning_rate": 0.0005991283547257876, |
| "loss": 5.7439, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11661807580174927, |
| "grad_norm": 0.6553553342819214, |
| "learning_rate": 0.000598953325554259, |
| "loss": 5.6286, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.13119533527696792, |
| "grad_norm": 0.5217544436454773, |
| "learning_rate": 0.0005987782963827304, |
| "loss": 5.5172, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1457725947521866, |
| "grad_norm": 0.4531030058860779, |
| "learning_rate": 0.0005986032672112018, |
| "loss": 5.4139, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16034985422740525, |
| "grad_norm": 0.4770136773586273, |
| "learning_rate": 0.0005984282380396732, |
| "loss": 5.3499, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1749271137026239, |
| "grad_norm": 0.5866448283195496, |
| "learning_rate": 0.0005982532088681447, |
| "loss": 5.2578, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.18950437317784258, |
| "grad_norm": 0.4144020676612854, |
| "learning_rate": 0.000598078179696616, |
| "loss": 5.199, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20408163265306123, |
| "grad_norm": 0.4637407958507538, |
| "learning_rate": 0.0005979031505250875, |
| "loss": 5.1355, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21865889212827988, |
| "grad_norm": 0.4051854908466339, |
| "learning_rate": 0.0005977281213535589, |
| "loss": 5.0826, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.23323615160349853, |
| "grad_norm": 0.4222441613674164, |
| "learning_rate": 0.0005975530921820304, |
| "loss": 5.0161, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2478134110787172, |
| "grad_norm": 0.43692415952682495, |
| "learning_rate": 0.0005973780630105017, |
| "loss": 4.9604, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.26239067055393583, |
| "grad_norm": 0.4478825032711029, |
| "learning_rate": 0.0005972030338389731, |
| "loss": 4.9278, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.27696793002915454, |
| "grad_norm": 0.458936870098114, |
| "learning_rate": 0.0005970280046674445, |
| "loss": 4.895, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.2915451895043732, |
| "grad_norm": 0.4285873472690582, |
| "learning_rate": 0.000596852975495916, |
| "loss": 4.8427, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2915451895043732, |
| "eval_accuracy": 0.25348940253781577, |
| "eval_loss": 4.757139205932617, |
| "eval_runtime": 183.1867, |
| "eval_samples_per_second": 90.749, |
| "eval_steps_per_second": 5.672, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30612244897959184, |
| "grad_norm": 0.45091402530670166, |
| "learning_rate": 0.0005966779463243873, |
| "loss": 4.7713, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3206997084548105, |
| "grad_norm": 0.43677228689193726, |
| "learning_rate": 0.0005965029171528587, |
| "loss": 4.7492, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.33527696793002915, |
| "grad_norm": 0.4576217830181122, |
| "learning_rate": 0.0005963278879813302, |
| "loss": 4.7098, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3498542274052478, |
| "grad_norm": 0.517709493637085, |
| "learning_rate": 0.0005961528588098016, |
| "loss": 4.6695, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36443148688046645, |
| "grad_norm": 0.5433350801467896, |
| "learning_rate": 0.0005959778296382729, |
| "loss": 4.6536, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.37900874635568516, |
| "grad_norm": 0.41080689430236816, |
| "learning_rate": 0.0005958028004667444, |
| "loss": 4.6072, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3935860058309038, |
| "grad_norm": 0.4460156559944153, |
| "learning_rate": 0.0005956277712952158, |
| "loss": 4.5744, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40816326530612246, |
| "grad_norm": 0.4368729889392853, |
| "learning_rate": 0.0005954527421236873, |
| "loss": 4.554, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4227405247813411, |
| "grad_norm": 0.4751439094543457, |
| "learning_rate": 0.0005952777129521586, |
| "loss": 4.5263, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.43731778425655976, |
| "grad_norm": 0.4113054573535919, |
| "learning_rate": 0.0005951026837806301, |
| "loss": 4.5212, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.4518950437317784, |
| "grad_norm": 0.41762205958366394, |
| "learning_rate": 0.0005949276546091015, |
| "loss": 4.4888, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.46647230320699706, |
| "grad_norm": 0.3934441804885864, |
| "learning_rate": 0.0005947526254375729, |
| "loss": 4.4659, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.48104956268221577, |
| "grad_norm": 0.47389641404151917, |
| "learning_rate": 0.0005945775962660442, |
| "loss": 4.4567, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.4956268221574344, |
| "grad_norm": 0.4341316521167755, |
| "learning_rate": 0.0005944025670945157, |
| "loss": 4.4355, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5102040816326531, |
| "grad_norm": 0.4070115387439728, |
| "learning_rate": 0.0005942275379229871, |
| "loss": 4.4144, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5247813411078717, |
| "grad_norm": 0.4244054853916168, |
| "learning_rate": 0.0005940525087514585, |
| "loss": 4.3984, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5393586005830904, |
| "grad_norm": 0.42848894000053406, |
| "learning_rate": 0.00059387747957993, |
| "loss": 4.3872, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5539358600583091, |
| "grad_norm": 0.4671320915222168, |
| "learning_rate": 0.0005937024504084013, |
| "loss": 4.3684, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5685131195335277, |
| "grad_norm": 0.38709279894828796, |
| "learning_rate": 0.0005935274212368728, |
| "loss": 4.3662, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5830903790087464, |
| "grad_norm": 0.4209407866001129, |
| "learning_rate": 0.0005933523920653442, |
| "loss": 4.3511, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5830903790087464, |
| "eval_accuracy": 0.29855510341307406, |
| "eval_loss": 4.285789966583252, |
| "eval_runtime": 182.7746, |
| "eval_samples_per_second": 90.954, |
| "eval_steps_per_second": 5.685, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.597667638483965, |
| "grad_norm": 0.39591220021247864, |
| "learning_rate": 0.0005931773628938157, |
| "loss": 4.3369, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6122448979591837, |
| "grad_norm": 0.36318960785865784, |
| "learning_rate": 0.000593002333722287, |
| "loss": 4.3037, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6268221574344023, |
| "grad_norm": 0.41896095871925354, |
| "learning_rate": 0.0005928273045507584, |
| "loss": 4.3111, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.641399416909621, |
| "grad_norm": 0.42866140604019165, |
| "learning_rate": 0.0005926522753792298, |
| "loss": 4.2861, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6559766763848397, |
| "grad_norm": 0.39964374899864197, |
| "learning_rate": 0.0005924772462077013, |
| "loss": 4.2784, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6705539358600583, |
| "grad_norm": 0.37104126811027527, |
| "learning_rate": 0.0005923022170361726, |
| "loss": 4.2805, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.685131195335277, |
| "grad_norm": 0.37753456830978394, |
| "learning_rate": 0.000592127187864644, |
| "loss": 4.2608, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6997084548104956, |
| "grad_norm": 0.39387914538383484, |
| "learning_rate": 0.0005919521586931155, |
| "loss": 4.2389, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.38472071290016174, |
| "learning_rate": 0.0005917771295215869, |
| "loss": 4.2527, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7288629737609329, |
| "grad_norm": 0.3931500315666199, |
| "learning_rate": 0.0005916021003500583, |
| "loss": 4.23, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7434402332361516, |
| "grad_norm": 0.36343446373939514, |
| "learning_rate": 0.0005914270711785297, |
| "loss": 4.2279, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7580174927113703, |
| "grad_norm": 0.4511328935623169, |
| "learning_rate": 0.0005912520420070012, |
| "loss": 4.2169, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7725947521865889, |
| "grad_norm": 0.4024074971675873, |
| "learning_rate": 0.0005910770128354726, |
| "loss": 4.2055, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7871720116618076, |
| "grad_norm": 0.3848779499530792, |
| "learning_rate": 0.0005909019836639439, |
| "loss": 4.1972, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8017492711370262, |
| "grad_norm": 0.3447161912918091, |
| "learning_rate": 0.0005907269544924154, |
| "loss": 4.1825, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8163265306122449, |
| "grad_norm": 0.39848795533180237, |
| "learning_rate": 0.0005905519253208868, |
| "loss": 4.1709, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8309037900874635, |
| "grad_norm": 1.0386348962783813, |
| "learning_rate": 0.0005903768961493582, |
| "loss": 4.1715, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.8454810495626822, |
| "grad_norm": 0.3463401794433594, |
| "learning_rate": 0.0005902018669778295, |
| "loss": 4.1826, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8600583090379009, |
| "grad_norm": 0.3431394100189209, |
| "learning_rate": 0.000590026837806301, |
| "loss": 4.1599, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8746355685131195, |
| "grad_norm": 0.37830331921577454, |
| "learning_rate": 0.0005898518086347724, |
| "loss": 4.1523, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8746355685131195, |
| "eval_accuracy": 0.3145779614600069, |
| "eval_loss": 4.103229522705078, |
| "eval_runtime": 183.0069, |
| "eval_samples_per_second": 90.838, |
| "eval_steps_per_second": 5.677, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8892128279883382, |
| "grad_norm": 0.366996705532074, |
| "learning_rate": 0.0005896767794632438, |
| "loss": 4.1463, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9037900874635568, |
| "grad_norm": 0.3830026388168335, |
| "learning_rate": 0.0005895017502917152, |
| "loss": 4.1389, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9183673469387755, |
| "grad_norm": 0.3352486491203308, |
| "learning_rate": 0.0005893267211201866, |
| "loss": 4.1402, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9329446064139941, |
| "grad_norm": 0.3650812804698944, |
| "learning_rate": 0.0005891516919486581, |
| "loss": 4.1292, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9475218658892128, |
| "grad_norm": 0.3521343171596527, |
| "learning_rate": 0.0005889766627771295, |
| "loss": 4.1101, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9620991253644315, |
| "grad_norm": 0.34803035855293274, |
| "learning_rate": 0.0005888016336056009, |
| "loss": 4.1055, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9766763848396501, |
| "grad_norm": 0.4116007089614868, |
| "learning_rate": 0.0005886266044340723, |
| "loss": 4.1019, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9912536443148688, |
| "grad_norm": 0.3484455347061157, |
| "learning_rate": 0.0005884515752625437, |
| "loss": 4.1101, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0058309037900874, |
| "grad_norm": 0.3504573702812195, |
| "learning_rate": 0.0005882765460910151, |
| "loss": 4.0796, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0204081632653061, |
| "grad_norm": 0.3820534944534302, |
| "learning_rate": 0.0005881015169194865, |
| "loss": 4.0189, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.0349854227405249, |
| "grad_norm": 0.34919607639312744, |
| "learning_rate": 0.0005879264877479579, |
| "loss": 4.0246, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0495626822157433, |
| "grad_norm": 0.36303994059562683, |
| "learning_rate": 0.0005877514585764293, |
| "loss": 4.0159, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.064139941690962, |
| "grad_norm": 0.3435100018978119, |
| "learning_rate": 0.0005875764294049008, |
| "loss": 4.0158, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.0787172011661808, |
| "grad_norm": 0.3518615961074829, |
| "learning_rate": 0.0005874014002333721, |
| "loss": 4.0298, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0932944606413995, |
| "grad_norm": 0.3429192304611206, |
| "learning_rate": 0.0005872263710618436, |
| "loss": 4.0135, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1078717201166182, |
| "grad_norm": 0.3353997468948364, |
| "learning_rate": 0.000587051341890315, |
| "loss": 4.0039, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.1224489795918366, |
| "grad_norm": 0.33416813611984253, |
| "learning_rate": 0.0005868763127187865, |
| "loss": 4.0146, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1370262390670554, |
| "grad_norm": 0.34684616327285767, |
| "learning_rate": 0.0005867012835472578, |
| "loss": 3.9853, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.151603498542274, |
| "grad_norm": 0.33499616384506226, |
| "learning_rate": 0.0005865262543757292, |
| "loss": 3.9963, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.1661807580174928, |
| "grad_norm": 0.3389959931373596, |
| "learning_rate": 0.0005863512252042007, |
| "loss": 3.9998, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1661807580174928, |
| "eval_accuracy": 0.3246784174531811, |
| "eval_loss": 3.99202299118042, |
| "eval_runtime": 182.9762, |
| "eval_samples_per_second": 90.853, |
| "eval_steps_per_second": 5.678, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1807580174927113, |
| "grad_norm": 0.3295277953147888, |
| "learning_rate": 0.0005861761960326721, |
| "loss": 3.9965, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.19533527696793, |
| "grad_norm": 0.3454665541648865, |
| "learning_rate": 0.0005860011668611434, |
| "loss": 3.9911, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2099125364431487, |
| "grad_norm": 0.33836084604263306, |
| "learning_rate": 0.0005858261376896148, |
| "loss": 3.9786, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2244897959183674, |
| "grad_norm": 0.3491062521934509, |
| "learning_rate": 0.0005856511085180863, |
| "loss": 3.9913, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.239067055393586, |
| "grad_norm": 0.3459561765193939, |
| "learning_rate": 0.0005854760793465577, |
| "loss": 3.9858, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2536443148688048, |
| "grad_norm": 0.3346427083015442, |
| "learning_rate": 0.0005853010501750292, |
| "loss": 3.97, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2682215743440233, |
| "grad_norm": 0.3376955986022949, |
| "learning_rate": 0.0005851260210035005, |
| "loss": 3.9775, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.282798833819242, |
| "grad_norm": 0.3495452404022217, |
| "learning_rate": 0.000584950991831972, |
| "loss": 3.9714, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2973760932944607, |
| "grad_norm": 0.343971848487854, |
| "learning_rate": 0.0005847759626604434, |
| "loss": 3.9665, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3119533527696792, |
| "grad_norm": 0.3238019645214081, |
| "learning_rate": 0.0005846009334889148, |
| "loss": 3.9599, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3265306122448979, |
| "grad_norm": 0.33076655864715576, |
| "learning_rate": 0.0005844259043173862, |
| "loss": 3.9599, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3411078717201166, |
| "grad_norm": 0.34467896819114685, |
| "learning_rate": 0.0005842508751458576, |
| "loss": 3.9573, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.3556851311953353, |
| "grad_norm": 0.3411879539489746, |
| "learning_rate": 0.000584075845974329, |
| "loss": 3.9609, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.370262390670554, |
| "grad_norm": 0.33475279808044434, |
| "learning_rate": 0.0005839008168028004, |
| "loss": 3.9461, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.3848396501457727, |
| "grad_norm": 0.33972883224487305, |
| "learning_rate": 0.0005837257876312718, |
| "loss": 3.9545, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3994169096209912, |
| "grad_norm": 0.3224533498287201, |
| "learning_rate": 0.0005835507584597432, |
| "loss": 3.9628, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.41399416909621, |
| "grad_norm": 0.3264342248439789, |
| "learning_rate": 0.0005833757292882146, |
| "loss": 3.947, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.34044086933135986, |
| "learning_rate": 0.0005832007001166861, |
| "loss": 3.9474, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4431486880466473, |
| "grad_norm": 0.3255755305290222, |
| "learning_rate": 0.0005830256709451574, |
| "loss": 3.9333, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4577259475218658, |
| "grad_norm": 0.3284819424152374, |
| "learning_rate": 0.0005828506417736289, |
| "loss": 3.9337, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4577259475218658, |
| "eval_accuracy": 0.33094985393527193, |
| "eval_loss": 3.9178109169006348, |
| "eval_runtime": 182.8911, |
| "eval_samples_per_second": 90.896, |
| "eval_steps_per_second": 5.681, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4723032069970845, |
| "grad_norm": 0.3412162661552429, |
| "learning_rate": 0.0005826756126021003, |
| "loss": 3.9258, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4868804664723032, |
| "grad_norm": 0.3400501012802124, |
| "learning_rate": 0.0005825005834305718, |
| "loss": 3.9309, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.501457725947522, |
| "grad_norm": 0.32828906178474426, |
| "learning_rate": 0.0005823255542590431, |
| "loss": 3.9236, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5160349854227406, |
| "grad_norm": 0.32428792119026184, |
| "learning_rate": 0.0005821505250875145, |
| "loss": 3.9297, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5306122448979593, |
| "grad_norm": 0.3326531946659088, |
| "learning_rate": 0.000581975495915986, |
| "loss": 3.9204, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5451895043731778, |
| "grad_norm": 0.3215237855911255, |
| "learning_rate": 0.0005818004667444574, |
| "loss": 3.9227, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5597667638483965, |
| "grad_norm": 0.3173177242279053, |
| "learning_rate": 0.0005816254375729287, |
| "loss": 3.9008, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.574344023323615, |
| "grad_norm": 0.33386069536209106, |
| "learning_rate": 0.0005814504084014001, |
| "loss": 3.915, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.5889212827988337, |
| "grad_norm": 0.31523650884628296, |
| "learning_rate": 0.0005812753792298716, |
| "loss": 3.9, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6034985422740524, |
| "grad_norm": 0.320574015378952, |
| "learning_rate": 0.000581100350058343, |
| "loss": 3.9083, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6180758017492711, |
| "grad_norm": 0.30293840169906616, |
| "learning_rate": 0.0005809253208868144, |
| "loss": 3.9042, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6326530612244898, |
| "grad_norm": 0.31950491666793823, |
| "learning_rate": 0.0005807502917152858, |
| "loss": 3.9096, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6472303206997085, |
| "grad_norm": 0.3188002109527588, |
| "learning_rate": 0.0005805752625437573, |
| "loss": 3.902, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6618075801749272, |
| "grad_norm": 0.3315708041191101, |
| "learning_rate": 0.0005804002333722287, |
| "loss": 3.9093, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6763848396501457, |
| "grad_norm": 0.3257363438606262, |
| "learning_rate": 0.0005802252042007, |
| "loss": 3.8996, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6909620991253644, |
| "grad_norm": 0.3249954879283905, |
| "learning_rate": 0.0005800501750291715, |
| "loss": 3.8859, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7055393586005831, |
| "grad_norm": 0.333447128534317, |
| "learning_rate": 0.0005798751458576429, |
| "loss": 3.9072, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.7201166180758016, |
| "grad_norm": 0.32005417346954346, |
| "learning_rate": 0.0005797001166861143, |
| "loss": 3.8911, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7346938775510203, |
| "grad_norm": 0.3107415735721588, |
| "learning_rate": 0.0005795250875145856, |
| "loss": 3.8948, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.749271137026239, |
| "grad_norm": 0.3174699544906616, |
| "learning_rate": 0.0005793500583430571, |
| "loss": 3.8708, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.749271137026239, |
| "eval_accuracy": 0.3363507644148276, |
| "eval_loss": 3.858534812927246, |
| "eval_runtime": 182.5563, |
| "eval_samples_per_second": 91.062, |
| "eval_steps_per_second": 5.691, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7638483965014577, |
| "grad_norm": 0.3156437575817108, |
| "learning_rate": 0.0005791750291715285, |
| "loss": 3.8716, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7784256559766765, |
| "grad_norm": 0.3218945860862732, |
| "learning_rate": 0.000579, |
| "loss": 3.8865, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7930029154518952, |
| "grad_norm": 0.3210793137550354, |
| "learning_rate": 0.0005788249708284714, |
| "loss": 3.8891, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8075801749271136, |
| "grad_norm": 0.3157784044742584, |
| "learning_rate": 0.0005786499416569428, |
| "loss": 3.8754, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.8221574344023324, |
| "grad_norm": 0.3452181816101074, |
| "learning_rate": 0.0005784749124854142, |
| "loss": 3.8703, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.836734693877551, |
| "grad_norm": 0.3172648549079895, |
| "learning_rate": 0.0005782998833138856, |
| "loss": 3.8612, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8513119533527695, |
| "grad_norm": 0.33045291900634766, |
| "learning_rate": 0.0005781248541423571, |
| "loss": 3.8765, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8658892128279883, |
| "grad_norm": 0.305176317691803, |
| "learning_rate": 0.0005779498249708284, |
| "loss": 3.8544, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.880466472303207, |
| "grad_norm": 0.31116318702697754, |
| "learning_rate": 0.0005777747957992998, |
| "loss": 3.8652, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.8950437317784257, |
| "grad_norm": 0.303443044424057, |
| "learning_rate": 0.0005775997666277713, |
| "loss": 3.8628, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9096209912536444, |
| "grad_norm": 0.32800889015197754, |
| "learning_rate": 0.0005774247374562427, |
| "loss": 3.8523, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.924198250728863, |
| "grad_norm": 0.3365955948829651, |
| "learning_rate": 0.000577249708284714, |
| "loss": 3.8625, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9387755102040818, |
| "grad_norm": 0.3247227668762207, |
| "learning_rate": 0.0005770746791131854, |
| "loss": 3.8558, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9533527696793003, |
| "grad_norm": 0.33500850200653076, |
| "learning_rate": 0.0005768996499416569, |
| "loss": 3.8482, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.967930029154519, |
| "grad_norm": 0.307003915309906, |
| "learning_rate": 0.0005767246207701283, |
| "loss": 3.8539, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9825072886297375, |
| "grad_norm": 0.31048765778541565, |
| "learning_rate": 0.0005765495915985997, |
| "loss": 3.8608, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.9970845481049562, |
| "grad_norm": 0.30202147364616394, |
| "learning_rate": 0.0005763745624270711, |
| "loss": 3.8392, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.011661807580175, |
| "grad_norm": 0.31619763374328613, |
| "learning_rate": 0.0005761995332555426, |
| "loss": 3.7587, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.0262390670553936, |
| "grad_norm": 0.3118601143360138, |
| "learning_rate": 0.000576024504084014, |
| "loss": 3.7478, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "grad_norm": 0.3118847906589508, |
| "learning_rate": 0.0005758494749124853, |
| "loss": 3.7474, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "eval_accuracy": 0.3408536028357841, |
| "eval_loss": 3.8182103633880615, |
| "eval_runtime": 182.8244, |
| "eval_samples_per_second": 90.929, |
| "eval_steps_per_second": 5.683, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.055393586005831, |
| "grad_norm": 0.3154442608356476, |
| "learning_rate": 0.0005756744457409568, |
| "loss": 3.7527, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.0699708454810497, |
| "grad_norm": 0.31959810853004456, |
| "learning_rate": 0.0005754994165694282, |
| "loss": 3.7533, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0845481049562684, |
| "grad_norm": 0.3303550183773041, |
| "learning_rate": 0.0005753243873978996, |
| "loss": 3.7602, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.0991253644314867, |
| "grad_norm": 0.3240067958831787, |
| "learning_rate": 0.0005751493582263709, |
| "loss": 3.7683, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.1137026239067054, |
| "grad_norm": 0.3330995738506317, |
| "learning_rate": 0.0005749743290548424, |
| "loss": 3.7604, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.128279883381924, |
| "grad_norm": 0.3294008672237396, |
| "learning_rate": 0.0005747992998833138, |
| "loss": 3.7791, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.3313888609409332, |
| "learning_rate": 0.0005746242707117853, |
| "loss": 3.7583, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.1574344023323615, |
| "grad_norm": 0.32828205823898315, |
| "learning_rate": 0.0005744492415402566, |
| "loss": 3.7621, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.17201166180758, |
| "grad_norm": 0.339874267578125, |
| "learning_rate": 0.0005742742123687281, |
| "loss": 3.7587, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.186588921282799, |
| "grad_norm": 0.3092948794364929, |
| "learning_rate": 0.0005740991831971995, |
| "loss": 3.7532, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.2011661807580176, |
| "grad_norm": 0.320710152387619, |
| "learning_rate": 0.0005739241540256709, |
| "loss": 3.7672, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.2157434402332363, |
| "grad_norm": 0.31004834175109863, |
| "learning_rate": 0.0005737491248541423, |
| "loss": 3.7649, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2303206997084546, |
| "grad_norm": 0.3079014718532562, |
| "learning_rate": 0.0005735740956826137, |
| "loss": 3.7555, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.2448979591836733, |
| "grad_norm": 0.3182198107242584, |
| "learning_rate": 0.0005733990665110851, |
| "loss": 3.7582, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.259475218658892, |
| "grad_norm": 0.32147687673568726, |
| "learning_rate": 0.0005732240373395566, |
| "loss": 3.7592, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.2740524781341107, |
| "grad_norm": 0.31712988018989563, |
| "learning_rate": 0.000573049008168028, |
| "loss": 3.7643, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.2886297376093294, |
| "grad_norm": 0.3175004720687866, |
| "learning_rate": 0.0005728739789964993, |
| "loss": 3.7593, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.303206997084548, |
| "grad_norm": 0.30601269006729126, |
| "learning_rate": 0.0005726989498249708, |
| "loss": 3.7656, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.317784256559767, |
| "grad_norm": 0.3067971467971802, |
| "learning_rate": 0.0005725239206534422, |
| "loss": 3.7622, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.3323615160349855, |
| "grad_norm": 0.304423451423645, |
| "learning_rate": 0.0005723488914819137, |
| "loss": 3.7592, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3323615160349855, |
| "eval_accuracy": 0.34391568835004305, |
| "eval_loss": 3.7847554683685303, |
| "eval_runtime": 182.6857, |
| "eval_samples_per_second": 90.998, |
| "eval_steps_per_second": 5.687, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3469387755102042, |
| "grad_norm": 0.31666499376296997, |
| "learning_rate": 0.000572173862310385, |
| "loss": 3.7609, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.3615160349854225, |
| "grad_norm": 0.3182964324951172, |
| "learning_rate": 0.0005719988331388564, |
| "loss": 3.7533, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.376093294460641, |
| "grad_norm": 0.3177492916584015, |
| "learning_rate": 0.0005718238039673279, |
| "loss": 3.7578, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.39067055393586, |
| "grad_norm": 0.3210289776325226, |
| "learning_rate": 0.0005716487747957993, |
| "loss": 3.7552, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4052478134110786, |
| "grad_norm": 0.31907418370246887, |
| "learning_rate": 0.0005714737456242706, |
| "loss": 3.7558, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4198250728862973, |
| "grad_norm": 0.3094446659088135, |
| "learning_rate": 0.0005712987164527421, |
| "loss": 3.7434, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.434402332361516, |
| "grad_norm": 0.3326069414615631, |
| "learning_rate": 0.0005711236872812135, |
| "loss": 3.7418, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4489795918367347, |
| "grad_norm": 0.32391783595085144, |
| "learning_rate": 0.0005709486581096849, |
| "loss": 3.7372, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.4635568513119535, |
| "grad_norm": 0.3140561580657959, |
| "learning_rate": 0.0005707736289381562, |
| "loss": 3.7612, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.478134110787172, |
| "grad_norm": 0.2998807728290558, |
| "learning_rate": 0.0005705985997666277, |
| "loss": 3.7458, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4927113702623904, |
| "grad_norm": 0.3217441141605377, |
| "learning_rate": 0.0005704235705950991, |
| "loss": 3.755, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.5072886297376096, |
| "grad_norm": 0.35609811544418335, |
| "learning_rate": 0.0005702485414235706, |
| "loss": 3.7465, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.521865889212828, |
| "grad_norm": 0.3114137351512909, |
| "learning_rate": 0.0005700735122520419, |
| "loss": 3.7363, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.5364431486880465, |
| "grad_norm": 0.3186824321746826, |
| "learning_rate": 0.0005698984830805134, |
| "loss": 3.747, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.5510204081632653, |
| "grad_norm": 0.29359111189842224, |
| "learning_rate": 0.0005697234539089848, |
| "loss": 3.7405, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.565597667638484, |
| "grad_norm": 0.3077991008758545, |
| "learning_rate": 0.0005695484247374562, |
| "loss": 3.7468, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5801749271137027, |
| "grad_norm": 0.3335474729537964, |
| "learning_rate": 0.0005693733955659276, |
| "loss": 3.7459, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.5947521865889214, |
| "grad_norm": 0.32796528935432434, |
| "learning_rate": 0.000569198366394399, |
| "loss": 3.7428, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.60932944606414, |
| "grad_norm": 0.31937655806541443, |
| "learning_rate": 0.0005690233372228704, |
| "loss": 3.7475, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6239067055393583, |
| "grad_norm": 0.3045900762081146, |
| "learning_rate": 0.0005688483080513419, |
| "loss": 3.7338, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6239067055393583, |
| "eval_accuracy": 0.34635598639366094, |
| "eval_loss": 3.756603240966797, |
| "eval_runtime": 182.4333, |
| "eval_samples_per_second": 91.124, |
| "eval_steps_per_second": 5.695, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6384839650145775, |
| "grad_norm": 0.2968880534172058, |
| "learning_rate": 0.0005686732788798132, |
| "loss": 3.7332, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6530612244897958, |
| "grad_norm": 0.3103148937225342, |
| "learning_rate": 0.0005684982497082846, |
| "loss": 3.7447, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6676384839650145, |
| "grad_norm": 0.3076816201210022, |
| "learning_rate": 0.0005683232205367561, |
| "loss": 3.733, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.682215743440233, |
| "grad_norm": 0.31646642088890076, |
| "learning_rate": 0.0005681481913652275, |
| "loss": 3.7393, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.696793002915452, |
| "grad_norm": 0.3197529911994934, |
| "learning_rate": 0.000567973162193699, |
| "loss": 3.7425, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.7113702623906706, |
| "grad_norm": 0.2880556881427765, |
| "learning_rate": 0.0005677981330221703, |
| "loss": 3.7393, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.7259475218658893, |
| "grad_norm": 0.3087301552295685, |
| "learning_rate": 0.0005676231038506417, |
| "loss": 3.7316, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.740524781341108, |
| "grad_norm": 0.3143419027328491, |
| "learning_rate": 0.0005674480746791132, |
| "loss": 3.7278, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.7551020408163263, |
| "grad_norm": 0.2983132600784302, |
| "learning_rate": 0.0005672730455075845, |
| "loss": 3.7314, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.7696793002915454, |
| "grad_norm": 0.3293115198612213, |
| "learning_rate": 0.0005670980163360559, |
| "loss": 3.7273, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.7842565597667637, |
| "grad_norm": 0.30666598677635193, |
| "learning_rate": 0.0005669229871645274, |
| "loss": 3.7354, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7988338192419824, |
| "grad_norm": 0.3132167160511017, |
| "learning_rate": 0.0005667479579929988, |
| "loss": 3.7246, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.813411078717201, |
| "grad_norm": 0.2964831590652466, |
| "learning_rate": 0.0005665729288214701, |
| "loss": 3.7363, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.82798833819242, |
| "grad_norm": 0.30478906631469727, |
| "learning_rate": 0.0005663978996499416, |
| "loss": 3.7301, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8425655976676385, |
| "grad_norm": 0.30541983246803284, |
| "learning_rate": 0.000566222870478413, |
| "loss": 3.7306, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.29288506507873535, |
| "learning_rate": 0.0005660478413068845, |
| "loss": 3.7356, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.871720116618076, |
| "grad_norm": 0.31285762786865234, |
| "learning_rate": 0.0005658728121353559, |
| "loss": 3.7214, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.8862973760932946, |
| "grad_norm": 0.3204987645149231, |
| "learning_rate": 0.0005656977829638273, |
| "loss": 3.7282, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.9008746355685133, |
| "grad_norm": 0.30404728651046753, |
| "learning_rate": 0.0005655227537922987, |
| "loss": 3.722, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.9154518950437316, |
| "grad_norm": 0.3133791387081146, |
| "learning_rate": 0.0005653477246207701, |
| "loss": 3.722, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9154518950437316, |
| "eval_accuracy": 0.3486605553661601, |
| "eval_loss": 3.7310969829559326, |
| "eval_runtime": 182.8035, |
| "eval_samples_per_second": 90.939, |
| "eval_steps_per_second": 5.684, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9300291545189503, |
| "grad_norm": 0.3230670392513275, |
| "learning_rate": 0.0005651726954492415, |
| "loss": 3.7355, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.944606413994169, |
| "grad_norm": 0.30999648571014404, |
| "learning_rate": 0.0005649976662777129, |
| "loss": 3.7289, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.9591836734693877, |
| "grad_norm": 0.3303220868110657, |
| "learning_rate": 0.0005648226371061843, |
| "loss": 3.7192, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.9737609329446064, |
| "grad_norm": 0.31200259923934937, |
| "learning_rate": 0.0005646476079346557, |
| "loss": 3.7102, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.988338192419825, |
| "grad_norm": 0.29743126034736633, |
| "learning_rate": 0.0005644725787631272, |
| "loss": 3.7206, |
| "step": 10250 |
| }, |
| { |
| "epoch": 3.002915451895044, |
| "grad_norm": 0.3076407015323639, |
| "learning_rate": 0.0005642975495915985, |
| "loss": 3.7061, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0174927113702625, |
| "grad_norm": 0.3168206512928009, |
| "learning_rate": 0.0005641225204200699, |
| "loss": 3.6213, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0320699708454812, |
| "grad_norm": 0.3298965096473694, |
| "learning_rate": 0.0005639474912485414, |
| "loss": 3.6165, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.0466472303206995, |
| "grad_norm": 0.3050922453403473, |
| "learning_rate": 0.0005637724620770128, |
| "loss": 3.6171, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.061224489795918, |
| "grad_norm": 0.2993483543395996, |
| "learning_rate": 0.0005635974329054842, |
| "loss": 3.6079, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.075801749271137, |
| "grad_norm": 0.3267271816730499, |
| "learning_rate": 0.0005634224037339556, |
| "loss": 3.6364, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.0903790087463556, |
| "grad_norm": 0.3062150776386261, |
| "learning_rate": 0.000563247374562427, |
| "loss": 3.6323, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.1049562682215743, |
| "grad_norm": 0.31601178646087646, |
| "learning_rate": 0.0005630723453908985, |
| "loss": 3.6353, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.119533527696793, |
| "grad_norm": 0.30633261799812317, |
| "learning_rate": 0.0005628973162193698, |
| "loss": 3.6308, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.1341107871720117, |
| "grad_norm": 0.3371477425098419, |
| "learning_rate": 0.0005627222870478412, |
| "loss": 3.6318, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1486880466472305, |
| "grad_norm": 0.28911158442497253, |
| "learning_rate": 0.0005625472578763127, |
| "loss": 3.64, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.163265306122449, |
| "grad_norm": 0.3178606927394867, |
| "learning_rate": 0.0005623722287047841, |
| "loss": 3.6445, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.1778425655976674, |
| "grad_norm": 0.3044661283493042, |
| "learning_rate": 0.0005621971995332554, |
| "loss": 3.6358, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.192419825072886, |
| "grad_norm": 0.32306867837905884, |
| "learning_rate": 0.0005620221703617269, |
| "loss": 3.6349, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.206997084548105, |
| "grad_norm": 0.3064355254173279, |
| "learning_rate": 0.0005618471411901983, |
| "loss": 3.6432, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.206997084548105, |
| "eval_accuracy": 0.3509007324896549, |
| "eval_loss": 3.7172634601593018, |
| "eval_runtime": 182.6273, |
| "eval_samples_per_second": 91.027, |
| "eval_steps_per_second": 5.689, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2215743440233235, |
| "grad_norm": 0.3162972629070282, |
| "learning_rate": 0.0005616721120186698, |
| "loss": 3.6392, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.2361516034985423, |
| "grad_norm": 0.31746646761894226, |
| "learning_rate": 0.0005614970828471411, |
| "loss": 3.64, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.250728862973761, |
| "grad_norm": 0.31247153878211975, |
| "learning_rate": 0.0005613220536756126, |
| "loss": 3.6393, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.2653061224489797, |
| "grad_norm": 0.31435567140579224, |
| "learning_rate": 0.000561147024504084, |
| "loss": 3.6414, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.2798833819241984, |
| "grad_norm": 0.3080807328224182, |
| "learning_rate": 0.0005609719953325554, |
| "loss": 3.627, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.294460641399417, |
| "grad_norm": 0.30449172854423523, |
| "learning_rate": 0.0005607969661610267, |
| "loss": 3.6524, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.3090379008746353, |
| "grad_norm": 0.31494206190109253, |
| "learning_rate": 0.0005606219369894982, |
| "loss": 3.6241, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.323615160349854, |
| "grad_norm": 0.30978018045425415, |
| "learning_rate": 0.0005604469078179696, |
| "loss": 3.6578, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.3381924198250728, |
| "grad_norm": 0.31096816062927246, |
| "learning_rate": 0.000560271878646441, |
| "loss": 3.6475, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.3527696793002915, |
| "grad_norm": 0.29989251494407654, |
| "learning_rate": 0.0005600968494749124, |
| "loss": 3.6372, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.36734693877551, |
| "grad_norm": 0.319997638463974, |
| "learning_rate": 0.0005599218203033838, |
| "loss": 3.6548, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.381924198250729, |
| "grad_norm": 0.3196156919002533, |
| "learning_rate": 0.0005597467911318553, |
| "loss": 3.6365, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.3965014577259476, |
| "grad_norm": 0.3064935803413391, |
| "learning_rate": 0.0005595717619603267, |
| "loss": 3.6479, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4110787172011663, |
| "grad_norm": 0.30868738889694214, |
| "learning_rate": 0.0005593967327887982, |
| "loss": 3.6481, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.425655976676385, |
| "grad_norm": 0.31745100021362305, |
| "learning_rate": 0.0005592217036172695, |
| "loss": 3.6495, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.4402332361516033, |
| "grad_norm": 0.29115235805511475, |
| "learning_rate": 0.0005590466744457409, |
| "loss": 3.6573, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.454810495626822, |
| "grad_norm": 0.2950221300125122, |
| "learning_rate": 0.0005588716452742123, |
| "loss": 3.6449, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.4693877551020407, |
| "grad_norm": 0.3181147277355194, |
| "learning_rate": 0.0005586966161026838, |
| "loss": 3.6513, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.4839650145772594, |
| "grad_norm": 0.3124925196170807, |
| "learning_rate": 0.0005585215869311551, |
| "loss": 3.6395, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.498542274052478, |
| "grad_norm": 0.3157157301902771, |
| "learning_rate": 0.0005583465577596265, |
| "loss": 3.6514, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.498542274052478, |
| "eval_accuracy": 0.3525496111532804, |
| "eval_loss": 3.69809889793396, |
| "eval_runtime": 182.4166, |
| "eval_samples_per_second": 91.132, |
| "eval_steps_per_second": 5.696, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.513119533527697, |
| "grad_norm": 0.3136838972568512, |
| "learning_rate": 0.000558171528588098, |
| "loss": 3.6535, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.5276967930029155, |
| "grad_norm": 0.32249802350997925, |
| "learning_rate": 0.0005579964994165694, |
| "loss": 3.6552, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.542274052478134, |
| "grad_norm": 0.32572638988494873, |
| "learning_rate": 0.0005578214702450407, |
| "loss": 3.65, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.556851311953353, |
| "grad_norm": 0.3193070888519287, |
| "learning_rate": 0.0005576464410735122, |
| "loss": 3.6407, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 0.2971830368041992, |
| "learning_rate": 0.0005574714119019836, |
| "loss": 3.6456, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5860058309037903, |
| "grad_norm": 0.3185986280441284, |
| "learning_rate": 0.0005572963827304551, |
| "loss": 3.6469, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.6005830903790086, |
| "grad_norm": 0.3031637370586395, |
| "learning_rate": 0.0005571213535589264, |
| "loss": 3.6425, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6151603498542273, |
| "grad_norm": 0.31252244114875793, |
| "learning_rate": 0.0005569463243873979, |
| "loss": 3.6485, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.629737609329446, |
| "grad_norm": 0.3112841248512268, |
| "learning_rate": 0.0005567712952158693, |
| "loss": 3.6345, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.6443148688046647, |
| "grad_norm": 0.3132002353668213, |
| "learning_rate": 0.0005565962660443407, |
| "loss": 3.6553, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.6588921282798834, |
| "grad_norm": 0.3197864592075348, |
| "learning_rate": 0.000556421236872812, |
| "loss": 3.6468, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.673469387755102, |
| "grad_norm": 0.3167048692703247, |
| "learning_rate": 0.0005562462077012835, |
| "loss": 3.6386, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.688046647230321, |
| "grad_norm": 0.319764643907547, |
| "learning_rate": 0.0005560711785297549, |
| "loss": 3.6401, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.702623906705539, |
| "grad_norm": 0.31712111830711365, |
| "learning_rate": 0.0005558961493582263, |
| "loss": 3.6502, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.7172011661807582, |
| "grad_norm": 0.3105193078517914, |
| "learning_rate": 0.0005557211201866977, |
| "loss": 3.6501, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.7317784256559765, |
| "grad_norm": 0.3264790177345276, |
| "learning_rate": 0.0005555460910151691, |
| "loss": 3.6548, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.746355685131195, |
| "grad_norm": 0.3052196502685547, |
| "learning_rate": 0.0005553710618436406, |
| "loss": 3.6431, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.760932944606414, |
| "grad_norm": 0.3182891011238098, |
| "learning_rate": 0.000555196032672112, |
| "loss": 3.6298, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.7755102040816326, |
| "grad_norm": 0.31025004386901855, |
| "learning_rate": 0.0005550210035005834, |
| "loss": 3.6411, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7900874635568513, |
| "grad_norm": 0.3146236836910248, |
| "learning_rate": 0.0005548459743290548, |
| "loss": 3.6411, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7900874635568513, |
| "eval_accuracy": 0.3540398056990671, |
| "eval_loss": 3.681382179260254, |
| "eval_runtime": 182.6914, |
| "eval_samples_per_second": 90.995, |
| "eval_steps_per_second": 5.687, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.80466472303207, |
| "grad_norm": 0.2979094982147217, |
| "learning_rate": 0.0005546709451575262, |
| "loss": 3.6437, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.8192419825072887, |
| "grad_norm": 0.2954643666744232, |
| "learning_rate": 0.0005544959159859976, |
| "loss": 3.643, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.8338192419825075, |
| "grad_norm": 0.31352123618125916, |
| "learning_rate": 0.000554320886814469, |
| "loss": 3.6516, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.848396501457726, |
| "grad_norm": 0.312225878238678, |
| "learning_rate": 0.0005541458576429404, |
| "loss": 3.641, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.8629737609329444, |
| "grad_norm": 0.30493468046188354, |
| "learning_rate": 0.0005539708284714118, |
| "loss": 3.6328, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.877551020408163, |
| "grad_norm": 0.30350905656814575, |
| "learning_rate": 0.0005537957992998833, |
| "loss": 3.6445, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.892128279883382, |
| "grad_norm": 0.3183624744415283, |
| "learning_rate": 0.0005536207701283546, |
| "loss": 3.6384, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9067055393586005, |
| "grad_norm": 0.29714614152908325, |
| "learning_rate": 0.0005534457409568261, |
| "loss": 3.6475, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.9212827988338192, |
| "grad_norm": 0.3224017024040222, |
| "learning_rate": 0.0005532707117852975, |
| "loss": 3.6286, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.935860058309038, |
| "grad_norm": 0.31913307309150696, |
| "learning_rate": 0.000553095682613769, |
| "loss": 3.6291, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.9504373177842567, |
| "grad_norm": 0.31245312094688416, |
| "learning_rate": 0.0005529206534422404, |
| "loss": 3.6403, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.9650145772594754, |
| "grad_norm": 0.3012782335281372, |
| "learning_rate": 0.0005527456242707117, |
| "loss": 3.6385, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.979591836734694, |
| "grad_norm": 0.30770647525787354, |
| "learning_rate": 0.0005525705950991832, |
| "loss": 3.6343, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.9941690962099123, |
| "grad_norm": 0.31351521611213684, |
| "learning_rate": 0.0005523955659276546, |
| "loss": 3.6295, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.0087463556851315, |
| "grad_norm": 0.3147214949131012, |
| "learning_rate": 0.000552220536756126, |
| "loss": 3.5772, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.02332361516035, |
| "grad_norm": 0.32505854964256287, |
| "learning_rate": 0.0005520455075845973, |
| "loss": 3.5361, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.037900874635569, |
| "grad_norm": 0.29005855321884155, |
| "learning_rate": 0.0005518704784130688, |
| "loss": 3.5295, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.052478134110787, |
| "grad_norm": 0.328256756067276, |
| "learning_rate": 0.0005516954492415402, |
| "loss": 3.5362, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.067055393586005, |
| "grad_norm": 0.32246100902557373, |
| "learning_rate": 0.0005515204200700117, |
| "loss": 3.544, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.081632653061225, |
| "grad_norm": 0.31613844633102417, |
| "learning_rate": 0.000551345390898483, |
| "loss": 3.5514, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.081632653061225, |
| "eval_accuracy": 0.3552499486748699, |
| "eval_loss": 3.6744368076324463, |
| "eval_runtime": 182.424, |
| "eval_samples_per_second": 91.128, |
| "eval_steps_per_second": 5.696, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.096209912536443, |
| "grad_norm": 0.3086166977882385, |
| "learning_rate": 0.0005511703617269544, |
| "loss": 3.5407, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.110787172011662, |
| "grad_norm": 0.32620468735694885, |
| "learning_rate": 0.0005509953325554259, |
| "loss": 3.5484, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.12536443148688, |
| "grad_norm": 0.31899869441986084, |
| "learning_rate": 0.0005508203033838973, |
| "loss": 3.5562, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.139941690962099, |
| "grad_norm": 0.3118620812892914, |
| "learning_rate": 0.0005506452742123687, |
| "loss": 3.5522, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.154518950437318, |
| "grad_norm": 0.3356049954891205, |
| "learning_rate": 0.0005504702450408401, |
| "loss": 3.5622, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.169096209912537, |
| "grad_norm": 0.30787017941474915, |
| "learning_rate": 0.0005502952158693115, |
| "loss": 3.5693, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.183673469387755, |
| "grad_norm": 0.3089492619037628, |
| "learning_rate": 0.0005501201866977829, |
| "loss": 3.5681, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.198250728862973, |
| "grad_norm": 0.3012238144874573, |
| "learning_rate": 0.0005499451575262543, |
| "loss": 3.549, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.2128279883381925, |
| "grad_norm": 0.3236016631126404, |
| "learning_rate": 0.0005497701283547257, |
| "loss": 3.5571, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.227405247813411, |
| "grad_norm": 0.31478026509284973, |
| "learning_rate": 0.0005495950991831971, |
| "loss": 3.5518, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.24198250728863, |
| "grad_norm": 0.3320184350013733, |
| "learning_rate": 0.0005494200700116686, |
| "loss": 3.5767, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.256559766763848, |
| "grad_norm": 0.331966370344162, |
| "learning_rate": 0.0005492450408401399, |
| "loss": 3.5634, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.271137026239067, |
| "grad_norm": 0.3124910593032837, |
| "learning_rate": 0.0005490700116686114, |
| "loss": 3.5565, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.3230664134025574, |
| "learning_rate": 0.0005488949824970828, |
| "loss": 3.5659, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.300291545189505, |
| "grad_norm": 0.3078579008579254, |
| "learning_rate": 0.0005487199533255543, |
| "loss": 3.5637, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.314868804664723, |
| "grad_norm": 0.3371805250644684, |
| "learning_rate": 0.0005485449241540256, |
| "loss": 3.5832, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.329446064139942, |
| "grad_norm": 0.31937262415885925, |
| "learning_rate": 0.000548369894982497, |
| "loss": 3.5686, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.34402332361516, |
| "grad_norm": 0.32466357946395874, |
| "learning_rate": 0.0005481948658109685, |
| "loss": 3.566, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.358600583090379, |
| "grad_norm": 0.3096526861190796, |
| "learning_rate": 0.0005480198366394399, |
| "loss": 3.5755, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.373177842565598, |
| "grad_norm": 0.3323536515235901, |
| "learning_rate": 0.0005478448074679112, |
| "loss": 3.564, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.373177842565598, |
| "eval_accuracy": 0.3562713893948155, |
| "eval_loss": 3.663257360458374, |
| "eval_runtime": 182.5662, |
| "eval_samples_per_second": 91.057, |
| "eval_steps_per_second": 5.691, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.387755102040816, |
| "grad_norm": 0.3572273254394531, |
| "learning_rate": 0.0005476697782963826, |
| "loss": 3.5658, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.402332361516035, |
| "grad_norm": 0.3238251507282257, |
| "learning_rate": 0.0005474947491248541, |
| "loss": 3.5685, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.4169096209912535, |
| "grad_norm": 0.32899653911590576, |
| "learning_rate": 0.0005473197199533255, |
| "loss": 3.5743, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.431486880466473, |
| "grad_norm": 0.3337746560573578, |
| "learning_rate": 0.000547144690781797, |
| "loss": 3.5724, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.446064139941691, |
| "grad_norm": 0.305427610874176, |
| "learning_rate": 0.0005469696616102683, |
| "loss": 3.566, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.460641399416909, |
| "grad_norm": 0.33104851841926575, |
| "learning_rate": 0.0005467946324387398, |
| "loss": 3.5729, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.475218658892128, |
| "grad_norm": 0.3255021572113037, |
| "learning_rate": 0.0005466196032672112, |
| "loss": 3.5754, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.489795918367347, |
| "grad_norm": 0.3108825087547302, |
| "learning_rate": 0.0005464445740956825, |
| "loss": 3.5705, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.504373177842566, |
| "grad_norm": 0.31072282791137695, |
| "learning_rate": 0.000546269544924154, |
| "loss": 3.5848, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.518950437317784, |
| "grad_norm": 0.31263136863708496, |
| "learning_rate": 0.0005460945157526254, |
| "loss": 3.5806, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.533527696793003, |
| "grad_norm": 0.30347201228141785, |
| "learning_rate": 0.0005459194865810968, |
| "loss": 3.5909, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.548104956268221, |
| "grad_norm": 0.30945679545402527, |
| "learning_rate": 0.0005457444574095681, |
| "loss": 3.5817, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.562682215743441, |
| "grad_norm": 0.3195247948169708, |
| "learning_rate": 0.0005455694282380396, |
| "loss": 3.5663, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.577259475218659, |
| "grad_norm": 0.30114883184432983, |
| "learning_rate": 0.000545394399066511, |
| "loss": 3.5784, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.591836734693878, |
| "grad_norm": 0.3206930160522461, |
| "learning_rate": 0.0005452193698949825, |
| "loss": 3.5735, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.606413994169096, |
| "grad_norm": 0.30759623646736145, |
| "learning_rate": 0.0005450443407234539, |
| "loss": 3.5805, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.6209912536443145, |
| "grad_norm": 0.30484503507614136, |
| "learning_rate": 0.0005448693115519253, |
| "loss": 3.5848, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.635568513119534, |
| "grad_norm": 0.32482293248176575, |
| "learning_rate": 0.0005446942823803967, |
| "loss": 3.5718, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.650145772594752, |
| "grad_norm": 0.3221893012523651, |
| "learning_rate": 0.0005445192532088681, |
| "loss": 3.5716, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.664723032069971, |
| "grad_norm": 0.31135523319244385, |
| "learning_rate": 0.0005443442240373396, |
| "loss": 3.5733, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.664723032069971, |
| "eval_accuracy": 0.35775051843090133, |
| "eval_loss": 3.649756669998169, |
| "eval_runtime": 182.7531, |
| "eval_samples_per_second": 90.964, |
| "eval_steps_per_second": 5.685, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.679300291545189, |
| "grad_norm": 0.3205879330635071, |
| "learning_rate": 0.0005441691948658109, |
| "loss": 3.577, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.6938775510204085, |
| "grad_norm": 0.31886178255081177, |
| "learning_rate": 0.0005439941656942823, |
| "loss": 3.5797, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.708454810495627, |
| "grad_norm": 0.2951045334339142, |
| "learning_rate": 0.0005438191365227538, |
| "loss": 3.5806, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.723032069970845, |
| "grad_norm": 0.31141290068626404, |
| "learning_rate": 0.0005436441073512252, |
| "loss": 3.5731, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.737609329446064, |
| "grad_norm": 0.3445282280445099, |
| "learning_rate": 0.0005434690781796965, |
| "loss": 3.5716, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.752186588921282, |
| "grad_norm": 0.30949369072914124, |
| "learning_rate": 0.0005432940490081679, |
| "loss": 3.5776, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.766763848396502, |
| "grad_norm": 0.3139699101448059, |
| "learning_rate": 0.0005431190198366394, |
| "loss": 3.5786, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.78134110787172, |
| "grad_norm": 0.3123582601547241, |
| "learning_rate": 0.0005429439906651108, |
| "loss": 3.5759, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.795918367346939, |
| "grad_norm": 0.3623657822608948, |
| "learning_rate": 0.0005427689614935822, |
| "loss": 3.5991, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.810495626822157, |
| "grad_norm": 0.32249704003334045, |
| "learning_rate": 0.0005425939323220536, |
| "loss": 3.5727, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.825072886297376, |
| "grad_norm": 0.3091285824775696, |
| "learning_rate": 0.0005424189031505251, |
| "loss": 3.5919, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.839650145772595, |
| "grad_norm": 0.2962740659713745, |
| "learning_rate": 0.0005422438739789965, |
| "loss": 3.585, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.854227405247814, |
| "grad_norm": 0.29750195145606995, |
| "learning_rate": 0.0005420688448074678, |
| "loss": 3.5741, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.868804664723032, |
| "grad_norm": 0.30312204360961914, |
| "learning_rate": 0.0005418938156359393, |
| "loss": 3.5765, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.88338192419825, |
| "grad_norm": 0.2950429916381836, |
| "learning_rate": 0.0005417187864644107, |
| "loss": 3.5762, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.8979591836734695, |
| "grad_norm": 0.3145018517971039, |
| "learning_rate": 0.0005415437572928821, |
| "loss": 3.5797, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.912536443148688, |
| "grad_norm": 0.3003164827823639, |
| "learning_rate": 0.0005413687281213534, |
| "loss": 3.5873, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.927113702623907, |
| "grad_norm": 0.308309942483902, |
| "learning_rate": 0.0005411936989498249, |
| "loss": 3.5877, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.941690962099125, |
| "grad_norm": 0.29779040813446045, |
| "learning_rate": 0.0005410186697782963, |
| "loss": 3.5722, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.956268221574344, |
| "grad_norm": 0.3193836212158203, |
| "learning_rate": 0.0005408436406067678, |
| "loss": 3.5731, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.956268221574344, |
| "eval_accuracy": 0.3591342957344579, |
| "eval_loss": 3.6355204582214355, |
| "eval_runtime": 182.745, |
| "eval_samples_per_second": 90.968, |
| "eval_steps_per_second": 5.686, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.970845481049563, |
| "grad_norm": 0.3114926517009735, |
| "learning_rate": 0.0005406686114352391, |
| "loss": 3.571, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.985422740524781, |
| "grad_norm": 0.3078295588493347, |
| "learning_rate": 0.0005404935822637106, |
| "loss": 3.579, |
| "step": 17100 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.33132073283195496, |
| "learning_rate": 0.000540318553092182, |
| "loss": 3.5752, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.014577259475218, |
| "grad_norm": 0.33800360560417175, |
| "learning_rate": 0.0005401435239206534, |
| "loss": 3.4588, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.029154518950437, |
| "grad_norm": 0.33555933833122253, |
| "learning_rate": 0.0005399684947491249, |
| "loss": 3.475, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.043731778425656, |
| "grad_norm": 0.3214545249938965, |
| "learning_rate": 0.0005397934655775962, |
| "loss": 3.4702, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.058309037900875, |
| "grad_norm": 0.32993388175964355, |
| "learning_rate": 0.0005396184364060676, |
| "loss": 3.4876, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.072886297376093, |
| "grad_norm": 0.3097357153892517, |
| "learning_rate": 0.0005394434072345391, |
| "loss": 3.4873, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.087463556851312, |
| "grad_norm": 0.3072371482849121, |
| "learning_rate": 0.0005392683780630105, |
| "loss": 3.4899, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.1020408163265305, |
| "grad_norm": 0.3170117139816284, |
| "learning_rate": 0.0005390933488914818, |
| "loss": 3.4837, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.11661807580175, |
| "grad_norm": 0.30793899297714233, |
| "learning_rate": 0.0005389183197199533, |
| "loss": 3.4947, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.131195335276968, |
| "grad_norm": 0.32247328758239746, |
| "learning_rate": 0.0005387432905484247, |
| "loss": 3.479, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.145772594752186, |
| "grad_norm": 0.3052769601345062, |
| "learning_rate": 0.0005385682613768962, |
| "loss": 3.4999, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.160349854227405, |
| "grad_norm": 0.31978240609169006, |
| "learning_rate": 0.0005383932322053675, |
| "loss": 3.495, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.174927113702624, |
| "grad_norm": 0.3358873426914215, |
| "learning_rate": 0.0005382182030338389, |
| "loss": 3.5045, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.189504373177843, |
| "grad_norm": 0.3461468815803528, |
| "learning_rate": 0.0005380431738623104, |
| "loss": 3.4943, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.204081632653061, |
| "grad_norm": 0.3337807357311249, |
| "learning_rate": 0.0005378681446907818, |
| "loss": 3.5, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.21865889212828, |
| "grad_norm": 0.34245598316192627, |
| "learning_rate": 0.0005376931155192531, |
| "loss": 3.5111, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.233236151603498, |
| "grad_norm": 0.32653409242630005, |
| "learning_rate": 0.0005375180863477246, |
| "loss": 3.5028, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.247813411078718, |
| "grad_norm": 0.330037921667099, |
| "learning_rate": 0.000537343057176196, |
| "loss": 3.5143, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.247813411078718, |
| "eval_accuracy": 0.3593785609751963, |
| "eval_loss": 3.6401000022888184, |
| "eval_runtime": 182.7236, |
| "eval_samples_per_second": 90.979, |
| "eval_steps_per_second": 5.686, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.262390670553936, |
| "grad_norm": 0.30893272161483765, |
| "learning_rate": 0.0005371680280046674, |
| "loss": 3.5003, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.276967930029155, |
| "grad_norm": 0.3184139132499695, |
| "learning_rate": 0.0005369929988331387, |
| "loss": 3.5157, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.291545189504373, |
| "grad_norm": 0.3190644681453705, |
| "learning_rate": 0.0005368179696616102, |
| "loss": 3.5079, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.3061224489795915, |
| "grad_norm": 0.31882479786872864, |
| "learning_rate": 0.0005366429404900816, |
| "loss": 3.499, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.320699708454811, |
| "grad_norm": 0.31382501125335693, |
| "learning_rate": 0.0005364679113185531, |
| "loss": 3.5072, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.335276967930029, |
| "grad_norm": 0.3288552463054657, |
| "learning_rate": 0.0005362928821470244, |
| "loss": 3.504, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.349854227405248, |
| "grad_norm": 0.29884201288223267, |
| "learning_rate": 0.0005361178529754959, |
| "loss": 3.5116, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.364431486880466, |
| "grad_norm": 0.3015066087245941, |
| "learning_rate": 0.0005359428238039673, |
| "loss": 3.5072, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.3790087463556855, |
| "grad_norm": 0.334290474653244, |
| "learning_rate": 0.0005357677946324387, |
| "loss": 3.5136, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.393586005830904, |
| "grad_norm": 0.3181033730506897, |
| "learning_rate": 0.0005355927654609101, |
| "loss": 3.524, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.408163265306122, |
| "grad_norm": 0.33072495460510254, |
| "learning_rate": 0.0005354177362893815, |
| "loss": 3.5146, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.422740524781341, |
| "grad_norm": 0.33785611391067505, |
| "learning_rate": 0.0005352427071178529, |
| "loss": 3.5204, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.437317784256559, |
| "grad_norm": 0.31333351135253906, |
| "learning_rate": 0.0005350676779463244, |
| "loss": 3.5243, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.451895043731779, |
| "grad_norm": 0.32065486907958984, |
| "learning_rate": 0.0005348926487747957, |
| "loss": 3.5175, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.466472303206997, |
| "grad_norm": 0.3304148018360138, |
| "learning_rate": 0.0005347176196032671, |
| "loss": 3.528, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.481049562682216, |
| "grad_norm": 0.31169185042381287, |
| "learning_rate": 0.0005345425904317386, |
| "loss": 3.5387, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.495626822157434, |
| "grad_norm": 0.3135135769844055, |
| "learning_rate": 0.00053436756126021, |
| "loss": 3.5116, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.510204081632653, |
| "grad_norm": 0.30703073740005493, |
| "learning_rate": 0.0005341925320886814, |
| "loss": 3.5188, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.524781341107872, |
| "grad_norm": 0.302874892950058, |
| "learning_rate": 0.0005340175029171528, |
| "loss": 3.5403, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.539358600583091, |
| "grad_norm": 0.32535436749458313, |
| "learning_rate": 0.0005338424737456243, |
| "loss": 3.5218, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.539358600583091, |
| "eval_accuracy": 0.36021494870312226, |
| "eval_loss": 3.628293752670288, |
| "eval_runtime": 182.4386, |
| "eval_samples_per_second": 91.121, |
| "eval_steps_per_second": 5.695, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.553935860058309, |
| "grad_norm": 0.3248957395553589, |
| "learning_rate": 0.0005336674445740957, |
| "loss": 3.5256, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.568513119533527, |
| "grad_norm": 0.31554681062698364, |
| "learning_rate": 0.000533492415402567, |
| "loss": 3.5352, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.5830903790087465, |
| "grad_norm": 0.3147410452365875, |
| "learning_rate": 0.0005333173862310384, |
| "loss": 3.5234, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.597667638483965, |
| "grad_norm": 0.3296838402748108, |
| "learning_rate": 0.0005331423570595099, |
| "loss": 3.5218, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.612244897959184, |
| "grad_norm": 0.34120234847068787, |
| "learning_rate": 0.0005329673278879813, |
| "loss": 3.5284, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.626822157434402, |
| "grad_norm": 0.3368380665779114, |
| "learning_rate": 0.0005327922987164526, |
| "loss": 3.5254, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.641399416909621, |
| "grad_norm": 0.31956803798675537, |
| "learning_rate": 0.0005326172695449241, |
| "loss": 3.5199, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.65597667638484, |
| "grad_norm": 0.3001638352870941, |
| "learning_rate": 0.0005324422403733955, |
| "loss": 3.5285, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.670553935860058, |
| "grad_norm": 0.3221629559993744, |
| "learning_rate": 0.000532267211201867, |
| "loss": 3.5179, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.685131195335277, |
| "grad_norm": 0.30188408493995667, |
| "learning_rate": 0.0005320921820303384, |
| "loss": 3.5368, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.699708454810495, |
| "grad_norm": 0.3237511217594147, |
| "learning_rate": 0.0005319171528588098, |
| "loss": 3.5325, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 0.30857592821121216, |
| "learning_rate": 0.0005317421236872812, |
| "loss": 3.5249, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.728862973760933, |
| "grad_norm": 0.3192301094532013, |
| "learning_rate": 0.0005315670945157526, |
| "loss": 3.5469, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.743440233236152, |
| "grad_norm": 0.29896432161331177, |
| "learning_rate": 0.000531392065344224, |
| "loss": 3.526, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.75801749271137, |
| "grad_norm": 0.3077335059642792, |
| "learning_rate": 0.0005312170361726954, |
| "loss": 3.535, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.772594752186589, |
| "grad_norm": 0.3408810496330261, |
| "learning_rate": 0.0005310420070011668, |
| "loss": 3.5384, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.7871720116618075, |
| "grad_norm": 0.32832178473472595, |
| "learning_rate": 0.0005308669778296382, |
| "loss": 3.5298, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.801749271137027, |
| "grad_norm": 0.3311011493206024, |
| "learning_rate": 0.0005306919486581097, |
| "loss": 3.5337, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.816326530612245, |
| "grad_norm": 0.3176785707473755, |
| "learning_rate": 0.000530516919486581, |
| "loss": 3.5446, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.830903790087463, |
| "grad_norm": 0.321762353181839, |
| "learning_rate": 0.0005303418903150524, |
| "loss": 3.528, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.830903790087463, |
| "eval_accuracy": 0.3612652303792033, |
| "eval_loss": 3.6179208755493164, |
| "eval_runtime": 182.0938, |
| "eval_samples_per_second": 91.294, |
| "eval_steps_per_second": 5.706, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.845481049562682, |
| "grad_norm": 0.30524224042892456, |
| "learning_rate": 0.0005301668611435239, |
| "loss": 3.5328, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.860058309037901, |
| "grad_norm": 0.333676278591156, |
| "learning_rate": 0.0005299918319719953, |
| "loss": 3.5335, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.87463556851312, |
| "grad_norm": 0.30587267875671387, |
| "learning_rate": 0.0005298168028004667, |
| "loss": 3.528, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.889212827988338, |
| "grad_norm": 0.309893399477005, |
| "learning_rate": 0.0005296417736289381, |
| "loss": 3.5382, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.903790087463557, |
| "grad_norm": 0.3243180215358734, |
| "learning_rate": 0.0005294667444574096, |
| "loss": 3.5356, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.918367346938775, |
| "grad_norm": 0.3317978084087372, |
| "learning_rate": 0.000529291715285881, |
| "loss": 3.5329, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.932944606413994, |
| "grad_norm": 0.3489520847797394, |
| "learning_rate": 0.0005291166861143523, |
| "loss": 3.5281, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.947521865889213, |
| "grad_norm": 0.3286189138889313, |
| "learning_rate": 0.0005289416569428237, |
| "loss": 3.53, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.962099125364432, |
| "grad_norm": 0.32896900177001953, |
| "learning_rate": 0.0005287666277712952, |
| "loss": 3.5365, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.97667638483965, |
| "grad_norm": 0.3125670254230499, |
| "learning_rate": 0.0005285915985997666, |
| "loss": 3.5392, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.9912536443148685, |
| "grad_norm": 0.3274867534637451, |
| "learning_rate": 0.0005284165694282379, |
| "loss": 3.5389, |
| "step": 20550 |
| }, |
| { |
| "epoch": 6.005830903790088, |
| "grad_norm": 0.3022937774658203, |
| "learning_rate": 0.0005282415402567094, |
| "loss": 3.4769, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.020408163265306, |
| "grad_norm": 0.319007009267807, |
| "learning_rate": 0.0005280665110851808, |
| "loss": 3.423, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.034985422740525, |
| "grad_norm": 0.33166149258613586, |
| "learning_rate": 0.0005278914819136523, |
| "loss": 3.4239, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.049562682215743, |
| "grad_norm": 0.31625524163246155, |
| "learning_rate": 0.0005277164527421236, |
| "loss": 3.4286, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.0641399416909625, |
| "grad_norm": 0.32785487174987793, |
| "learning_rate": 0.0005275414235705951, |
| "loss": 3.4299, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.078717201166181, |
| "grad_norm": 0.32926735281944275, |
| "learning_rate": 0.0005273663943990665, |
| "loss": 3.4416, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.093294460641399, |
| "grad_norm": 0.316875696182251, |
| "learning_rate": 0.0005271913652275379, |
| "loss": 3.4401, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.107871720116618, |
| "grad_norm": 0.3242247402667999, |
| "learning_rate": 0.0005270163360560092, |
| "loss": 3.4508, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.122448979591836, |
| "grad_norm": 0.3237857222557068, |
| "learning_rate": 0.0005268413068844807, |
| "loss": 3.446, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.122448979591836, |
| "eval_accuracy": 0.36163439461773605, |
| "eval_loss": 3.6209189891815186, |
| "eval_runtime": 182.3861, |
| "eval_samples_per_second": 91.147, |
| "eval_steps_per_second": 5.697, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.137026239067056, |
| "grad_norm": 0.34047290682792664, |
| "learning_rate": 0.0005266662777129521, |
| "loss": 3.4436, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.151603498542274, |
| "grad_norm": 0.31786465644836426, |
| "learning_rate": 0.0005264912485414235, |
| "loss": 3.4569, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.166180758017493, |
| "grad_norm": 0.3193897604942322, |
| "learning_rate": 0.000526316219369895, |
| "loss": 3.4581, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.180758017492711, |
| "grad_norm": 0.32340869307518005, |
| "learning_rate": 0.0005261411901983663, |
| "loss": 3.4661, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.19533527696793, |
| "grad_norm": 0.3188331723213196, |
| "learning_rate": 0.0005259661610268378, |
| "loss": 3.4469, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.209912536443149, |
| "grad_norm": 0.3093229830265045, |
| "learning_rate": 0.0005257911318553092, |
| "loss": 3.4518, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.224489795918367, |
| "grad_norm": 0.3273768424987793, |
| "learning_rate": 0.0005256161026837807, |
| "loss": 3.4675, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.239067055393586, |
| "grad_norm": 0.3344630300998688, |
| "learning_rate": 0.000525441073512252, |
| "loss": 3.4561, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.253644314868804, |
| "grad_norm": 0.3129141628742218, |
| "learning_rate": 0.0005252660443407234, |
| "loss": 3.4563, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.2682215743440235, |
| "grad_norm": 0.32025453448295593, |
| "learning_rate": 0.0005250910151691949, |
| "loss": 3.4571, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.282798833819242, |
| "grad_norm": 0.3402857482433319, |
| "learning_rate": 0.0005249159859976663, |
| "loss": 3.4777, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.297376093294461, |
| "grad_norm": 0.32615235447883606, |
| "learning_rate": 0.0005247409568261376, |
| "loss": 3.4708, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.311953352769679, |
| "grad_norm": 0.33531254529953003, |
| "learning_rate": 0.000524565927654609, |
| "loss": 3.4549, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.326530612244898, |
| "grad_norm": 0.33214399218559265, |
| "learning_rate": 0.0005243908984830805, |
| "loss": 3.4745, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.341107871720117, |
| "grad_norm": 0.34065625071525574, |
| "learning_rate": 0.0005242158693115519, |
| "loss": 3.4731, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.355685131195335, |
| "grad_norm": 0.35216084122657776, |
| "learning_rate": 0.0005240408401400232, |
| "loss": 3.4692, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.370262390670554, |
| "grad_norm": 0.337293803691864, |
| "learning_rate": 0.0005238658109684947, |
| "loss": 3.4719, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.384839650145772, |
| "grad_norm": 0.3386717140674591, |
| "learning_rate": 0.0005236907817969661, |
| "loss": 3.4759, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.399416909620991, |
| "grad_norm": 0.3118914067745209, |
| "learning_rate": 0.0005235157526254376, |
| "loss": 3.4722, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.41399416909621, |
| "grad_norm": 0.31701526045799255, |
| "learning_rate": 0.0005233407234539089, |
| "loss": 3.479, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.41399416909621, |
| "eval_accuracy": 0.36226359833424054, |
| "eval_loss": 3.6132073402404785, |
| "eval_runtime": 182.1237, |
| "eval_samples_per_second": 91.279, |
| "eval_steps_per_second": 5.705, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 0.33094874024391174, |
| "learning_rate": 0.0005231656942823804, |
| "loss": 3.4769, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.443148688046647, |
| "grad_norm": 0.31896036863327026, |
| "learning_rate": 0.0005229906651108518, |
| "loss": 3.4843, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.457725947521866, |
| "grad_norm": 0.33175987005233765, |
| "learning_rate": 0.0005228156359393232, |
| "loss": 3.4735, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.4723032069970845, |
| "grad_norm": 0.3212224543094635, |
| "learning_rate": 0.0005226406067677945, |
| "loss": 3.4855, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.486880466472304, |
| "grad_norm": 0.3497124910354614, |
| "learning_rate": 0.000522465577596266, |
| "loss": 3.4818, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.501457725947522, |
| "grad_norm": 0.3271764814853668, |
| "learning_rate": 0.0005222905484247374, |
| "loss": 3.4805, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.51603498542274, |
| "grad_norm": 0.31307926774024963, |
| "learning_rate": 0.0005221155192532088, |
| "loss": 3.4754, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.530612244897959, |
| "grad_norm": 0.3359667956829071, |
| "learning_rate": 0.0005219404900816802, |
| "loss": 3.4805, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.545189504373178, |
| "grad_norm": 0.3326173722743988, |
| "learning_rate": 0.0005217654609101516, |
| "loss": 3.4845, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.559766763848397, |
| "grad_norm": 0.3412718176841736, |
| "learning_rate": 0.0005215904317386231, |
| "loss": 3.4866, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.574344023323615, |
| "grad_norm": 0.33276066184043884, |
| "learning_rate": 0.0005214154025670945, |
| "loss": 3.4816, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.588921282798834, |
| "grad_norm": 0.30933451652526855, |
| "learning_rate": 0.000521240373395566, |
| "loss": 3.4847, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.603498542274052, |
| "grad_norm": 0.38130906224250793, |
| "learning_rate": 0.0005210653442240373, |
| "loss": 3.4943, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.618075801749271, |
| "grad_norm": 0.3150744140148163, |
| "learning_rate": 0.0005208903150525087, |
| "loss": 3.4801, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.63265306122449, |
| "grad_norm": 0.31047096848487854, |
| "learning_rate": 0.0005207152858809802, |
| "loss": 3.4828, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.647230320699708, |
| "grad_norm": 0.32708585262298584, |
| "learning_rate": 0.0005205402567094515, |
| "loss": 3.4926, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.661807580174927, |
| "grad_norm": 0.36320850253105164, |
| "learning_rate": 0.0005203652275379229, |
| "loss": 3.4778, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.6763848396501455, |
| "grad_norm": 0.3161414861679077, |
| "learning_rate": 0.0005201901983663943, |
| "loss": 3.4908, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.690962099125365, |
| "grad_norm": 0.3203990161418915, |
| "learning_rate": 0.0005200151691948658, |
| "loss": 3.5021, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.705539358600583, |
| "grad_norm": 0.31184670329093933, |
| "learning_rate": 0.0005198401400233371, |
| "loss": 3.4919, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.705539358600583, |
| "eval_accuracy": 0.363203107195124, |
| "eval_loss": 3.604387044906616, |
| "eval_runtime": 183.0205, |
| "eval_samples_per_second": 90.831, |
| "eval_steps_per_second": 5.677, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.720116618075802, |
| "grad_norm": 0.33052515983581543, |
| "learning_rate": 0.0005196651108518086, |
| "loss": 3.4911, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.73469387755102, |
| "grad_norm": 0.35309287905693054, |
| "learning_rate": 0.00051949008168028, |
| "loss": 3.4878, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.7492711370262395, |
| "grad_norm": 0.3378291726112366, |
| "learning_rate": 0.0005193150525087515, |
| "loss": 3.4892, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.763848396501458, |
| "grad_norm": 0.3175658583641052, |
| "learning_rate": 0.0005191400233372229, |
| "loss": 3.4935, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.778425655976676, |
| "grad_norm": 0.31248053908348083, |
| "learning_rate": 0.0005189649941656942, |
| "loss": 3.5044, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.793002915451895, |
| "grad_norm": 0.30260851979255676, |
| "learning_rate": 0.0005187899649941657, |
| "loss": 3.505, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.807580174927113, |
| "grad_norm": 0.3121766149997711, |
| "learning_rate": 0.0005186149358226371, |
| "loss": 3.4833, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.822157434402333, |
| "grad_norm": 0.3289031982421875, |
| "learning_rate": 0.0005184399066511085, |
| "loss": 3.4957, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.836734693877551, |
| "grad_norm": 0.32009539008140564, |
| "learning_rate": 0.0005182648774795798, |
| "loss": 3.4924, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.85131195335277, |
| "grad_norm": 0.3235000669956207, |
| "learning_rate": 0.0005180898483080513, |
| "loss": 3.4927, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.865889212827988, |
| "grad_norm": 0.3122546970844269, |
| "learning_rate": 0.0005179148191365227, |
| "loss": 3.5006, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.8804664723032065, |
| "grad_norm": 0.3026493191719055, |
| "learning_rate": 0.000517739789964994, |
| "loss": 3.4974, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.895043731778426, |
| "grad_norm": 0.30712851881980896, |
| "learning_rate": 0.0005175647607934655, |
| "loss": 3.4842, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.909620991253644, |
| "grad_norm": 0.32958361506462097, |
| "learning_rate": 0.0005173897316219369, |
| "loss": 3.5019, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.924198250728863, |
| "grad_norm": 0.32281431555747986, |
| "learning_rate": 0.0005172147024504084, |
| "loss": 3.4878, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.938775510204081, |
| "grad_norm": 0.34471213817596436, |
| "learning_rate": 0.0005170396732788798, |
| "loss": 3.4986, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.9533527696793005, |
| "grad_norm": 0.31065723299980164, |
| "learning_rate": 0.0005168646441073512, |
| "loss": 3.4782, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.967930029154519, |
| "grad_norm": 0.3147946894168854, |
| "learning_rate": 0.0005166896149358226, |
| "loss": 3.5057, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.982507288629738, |
| "grad_norm": 0.3277629613876343, |
| "learning_rate": 0.000516514585764294, |
| "loss": 3.4993, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.997084548104956, |
| "grad_norm": 0.3514486849308014, |
| "learning_rate": 0.0005163395565927655, |
| "loss": 3.505, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.997084548104956, |
| "eval_accuracy": 0.3639886406657011, |
| "eval_loss": 3.593942642211914, |
| "eval_runtime": 187.1371, |
| "eval_samples_per_second": 88.833, |
| "eval_steps_per_second": 5.552, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.011661807580175, |
| "grad_norm": 0.3496927320957184, |
| "learning_rate": 0.0005161645274212368, |
| "loss": 3.3972, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.026239067055394, |
| "grad_norm": 0.3268931806087494, |
| "learning_rate": 0.0005159894982497082, |
| "loss": 3.3784, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.040816326530612, |
| "grad_norm": 0.34126970171928406, |
| "learning_rate": 0.0005158144690781796, |
| "loss": 3.3877, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.055393586005831, |
| "grad_norm": 0.3556489646434784, |
| "learning_rate": 0.0005156394399066511, |
| "loss": 3.3915, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.069970845481049, |
| "grad_norm": 0.341993123292923, |
| "learning_rate": 0.0005154644107351224, |
| "loss": 3.3872, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.084548104956268, |
| "grad_norm": 0.32109686732292175, |
| "learning_rate": 0.0005152893815635939, |
| "loss": 3.3911, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.099125364431487, |
| "grad_norm": 0.33900973200798035, |
| "learning_rate": 0.0005151143523920653, |
| "loss": 3.3986, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.113702623906706, |
| "grad_norm": 0.3317485749721527, |
| "learning_rate": 0.0005149393232205368, |
| "loss": 3.411, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.128279883381924, |
| "grad_norm": 0.31022968888282776, |
| "learning_rate": 0.0005147642940490081, |
| "loss": 3.4088, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 0.3452603220939636, |
| "learning_rate": 0.0005145892648774795, |
| "loss": 3.4057, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.1574344023323615, |
| "grad_norm": 0.3521261513233185, |
| "learning_rate": 0.000514414235705951, |
| "loss": 3.4058, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.17201166180758, |
| "grad_norm": 0.32531312108039856, |
| "learning_rate": 0.0005142392065344224, |
| "loss": 3.4137, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.186588921282799, |
| "grad_norm": 0.32090941071510315, |
| "learning_rate": 0.0005140641773628937, |
| "loss": 3.4222, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.201166180758017, |
| "grad_norm": 0.35693666338920593, |
| "learning_rate": 0.0005138891481913651, |
| "loss": 3.4178, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.215743440233236, |
| "grad_norm": 0.3247111141681671, |
| "learning_rate": 0.0005137141190198366, |
| "loss": 3.4282, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.230320699708455, |
| "grad_norm": 0.3346105217933655, |
| "learning_rate": 0.000513539089848308, |
| "loss": 3.4282, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.244897959183674, |
| "grad_norm": 0.328382670879364, |
| "learning_rate": 0.0005133640606767795, |
| "loss": 3.4372, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.259475218658892, |
| "grad_norm": 0.33675169944763184, |
| "learning_rate": 0.0005131890315052508, |
| "loss": 3.4343, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.274052478134111, |
| "grad_norm": 0.3539164960384369, |
| "learning_rate": 0.0005130140023337223, |
| "loss": 3.4344, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.288629737609329, |
| "grad_norm": 0.33521610498428345, |
| "learning_rate": 0.0005128389731621937, |
| "loss": 3.4421, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.288629737609329, |
| "eval_accuracy": 0.36366291443865373, |
| "eval_loss": 3.6036746501922607, |
| "eval_runtime": 183.2654, |
| "eval_samples_per_second": 90.71, |
| "eval_steps_per_second": 5.669, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.303206997084548, |
| "grad_norm": 0.3432515561580658, |
| "learning_rate": 0.000512663943990665, |
| "loss": 3.4345, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.317784256559767, |
| "grad_norm": 0.3308854103088379, |
| "learning_rate": 0.0005124889148191365, |
| "loss": 3.4333, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.332361516034985, |
| "grad_norm": 0.3173339068889618, |
| "learning_rate": 0.0005123138856476079, |
| "loss": 3.4399, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.346938775510204, |
| "grad_norm": 0.3661780059337616, |
| "learning_rate": 0.0005121388564760793, |
| "loss": 3.4486, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.3615160349854225, |
| "grad_norm": 0.3121057152748108, |
| "learning_rate": 0.0005119638273045508, |
| "loss": 3.4385, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.376093294460642, |
| "grad_norm": 0.33542507886886597, |
| "learning_rate": 0.0005117887981330221, |
| "loss": 3.4495, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.39067055393586, |
| "grad_norm": 0.3543615937232971, |
| "learning_rate": 0.0005116137689614935, |
| "loss": 3.443, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.405247813411079, |
| "grad_norm": 0.3314652144908905, |
| "learning_rate": 0.0005114387397899649, |
| "loss": 3.4458, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.419825072886297, |
| "grad_norm": 0.34093043208122253, |
| "learning_rate": 0.0005112637106184364, |
| "loss": 3.4395, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.4344023323615165, |
| "grad_norm": 0.32408225536346436, |
| "learning_rate": 0.0005110886814469077, |
| "loss": 3.4368, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.448979591836735, |
| "grad_norm": 0.3401145935058594, |
| "learning_rate": 0.0005109136522753792, |
| "loss": 3.4453, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.463556851311953, |
| "grad_norm": 0.3173987865447998, |
| "learning_rate": 0.0005107386231038506, |
| "loss": 3.4492, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.478134110787172, |
| "grad_norm": 0.3380630612373352, |
| "learning_rate": 0.0005105635939323221, |
| "loss": 3.4549, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.49271137026239, |
| "grad_norm": 0.32536083459854126, |
| "learning_rate": 0.0005103885647607934, |
| "loss": 3.4493, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.50728862973761, |
| "grad_norm": 0.32971838116645813, |
| "learning_rate": 0.0005102135355892648, |
| "loss": 3.4459, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.521865889212828, |
| "grad_norm": 0.3344475328922272, |
| "learning_rate": 0.0005100385064177363, |
| "loss": 3.4463, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.536443148688047, |
| "grad_norm": 0.32902637124061584, |
| "learning_rate": 0.0005098634772462077, |
| "loss": 3.4497, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.551020408163265, |
| "grad_norm": 0.314765602350235, |
| "learning_rate": 0.000509688448074679, |
| "loss": 3.439, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.5655976676384835, |
| "grad_norm": 0.3501645028591156, |
| "learning_rate": 0.0005095134189031504, |
| "loss": 3.4522, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.580174927113703, |
| "grad_norm": 0.3293103277683258, |
| "learning_rate": 0.0005093383897316219, |
| "loss": 3.4527, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.580174927113703, |
| "eval_accuracy": 0.36490955005283193, |
| "eval_loss": 3.5910089015960693, |
| "eval_runtime": 182.9055, |
| "eval_samples_per_second": 90.888, |
| "eval_steps_per_second": 5.681, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.594752186588921, |
| "grad_norm": 0.3306674659252167, |
| "learning_rate": 0.0005091633605600933, |
| "loss": 3.4567, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.60932944606414, |
| "grad_norm": 0.33794093132019043, |
| "learning_rate": 0.0005089883313885647, |
| "loss": 3.4417, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.623906705539358, |
| "grad_norm": 0.32369881868362427, |
| "learning_rate": 0.0005088133022170361, |
| "loss": 3.4365, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.6384839650145775, |
| "grad_norm": 0.33512216806411743, |
| "learning_rate": 0.0005086382730455076, |
| "loss": 3.4494, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.653061224489796, |
| "grad_norm": 0.32545676827430725, |
| "learning_rate": 0.000508463243873979, |
| "loss": 3.4495, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.667638483965015, |
| "grad_norm": 0.32898396253585815, |
| "learning_rate": 0.0005082882147024503, |
| "loss": 3.4623, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.682215743440233, |
| "grad_norm": 0.33407220244407654, |
| "learning_rate": 0.0005081131855309218, |
| "loss": 3.4538, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.696793002915452, |
| "grad_norm": 0.32770851254463196, |
| "learning_rate": 0.0005079381563593932, |
| "loss": 3.4612, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.711370262390671, |
| "grad_norm": 0.3490068018436432, |
| "learning_rate": 0.0005077631271878646, |
| "loss": 3.4593, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.725947521865889, |
| "grad_norm": 0.3532838523387909, |
| "learning_rate": 0.000507588098016336, |
| "loss": 3.4614, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.740524781341108, |
| "grad_norm": 0.3085959255695343, |
| "learning_rate": 0.0005074130688448074, |
| "loss": 3.4539, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.755102040816326, |
| "grad_norm": 0.32022443413734436, |
| "learning_rate": 0.0005072380396732788, |
| "loss": 3.4592, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.769679300291545, |
| "grad_norm": 0.32210254669189453, |
| "learning_rate": 0.0005070630105017503, |
| "loss": 3.4593, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.784256559766764, |
| "grad_norm": 0.33314940333366394, |
| "learning_rate": 0.0005068879813302216, |
| "loss": 3.4635, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.798833819241983, |
| "grad_norm": 0.3578518331050873, |
| "learning_rate": 0.0005067129521586931, |
| "loss": 3.461, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.813411078717201, |
| "grad_norm": 0.33715447783470154, |
| "learning_rate": 0.0005065379229871645, |
| "loss": 3.4692, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.827988338192419, |
| "grad_norm": 0.32819151878356934, |
| "learning_rate": 0.0005063628938156359, |
| "loss": 3.4603, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.8425655976676385, |
| "grad_norm": 0.3273194134235382, |
| "learning_rate": 0.0005061878646441074, |
| "loss": 3.4548, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 0.31844374537467957, |
| "learning_rate": 0.0005060128354725787, |
| "loss": 3.4487, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.871720116618076, |
| "grad_norm": 0.33120185136795044, |
| "learning_rate": 0.0005058378063010501, |
| "loss": 3.4547, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.871720116618076, |
| "eval_accuracy": 0.36534510734957026, |
| "eval_loss": 3.584082841873169, |
| "eval_runtime": 183.1945, |
| "eval_samples_per_second": 90.745, |
| "eval_steps_per_second": 5.672, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.886297376093294, |
| "grad_norm": 0.319904088973999, |
| "learning_rate": 0.0005056627771295216, |
| "loss": 3.4783, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.900874635568513, |
| "grad_norm": 0.34819895029067993, |
| "learning_rate": 0.000505487747957993, |
| "loss": 3.4657, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.915451895043732, |
| "grad_norm": 0.33924388885498047, |
| "learning_rate": 0.0005053127187864643, |
| "loss": 3.4583, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.930029154518951, |
| "grad_norm": 0.3199351727962494, |
| "learning_rate": 0.0005051376896149357, |
| "loss": 3.4614, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.944606413994169, |
| "grad_norm": 0.3343052268028259, |
| "learning_rate": 0.0005049626604434072, |
| "loss": 3.4726, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.959183673469388, |
| "grad_norm": 0.30949947237968445, |
| "learning_rate": 0.0005047876312718786, |
| "loss": 3.4688, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.973760932944606, |
| "grad_norm": 0.3280562162399292, |
| "learning_rate": 0.00050461260210035, |
| "loss": 3.4816, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.988338192419825, |
| "grad_norm": 0.3171611428260803, |
| "learning_rate": 0.0005044375729288214, |
| "loss": 3.4537, |
| "step": 27400 |
| }, |
| { |
| "epoch": 8.002915451895044, |
| "grad_norm": 0.3121030628681183, |
| "learning_rate": 0.0005042625437572929, |
| "loss": 3.4459, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.017492711370263, |
| "grad_norm": 0.3203401565551758, |
| "learning_rate": 0.0005040875145857643, |
| "loss": 3.3559, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.03206997084548, |
| "grad_norm": 0.333235502243042, |
| "learning_rate": 0.0005039124854142356, |
| "loss": 3.3655, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.0466472303207, |
| "grad_norm": 0.3198469579219818, |
| "learning_rate": 0.0005037374562427071, |
| "loss": 3.3627, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.061224489795919, |
| "grad_norm": 0.325804740190506, |
| "learning_rate": 0.0005035624270711785, |
| "loss": 3.3693, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.075801749271138, |
| "grad_norm": 0.32386887073516846, |
| "learning_rate": 0.0005033873978996499, |
| "loss": 3.3771, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.090379008746355, |
| "grad_norm": 0.34717753529548645, |
| "learning_rate": 0.0005032123687281213, |
| "loss": 3.3663, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.104956268221574, |
| "grad_norm": 0.32944604754447937, |
| "learning_rate": 0.0005030373395565927, |
| "loss": 3.373, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.119533527696793, |
| "grad_norm": 0.3473806083202362, |
| "learning_rate": 0.0005028623103850641, |
| "loss": 3.3747, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.13411078717201, |
| "grad_norm": 0.32745233178138733, |
| "learning_rate": 0.0005026872812135356, |
| "loss": 3.3825, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.14868804664723, |
| "grad_norm": 0.34420016407966614, |
| "learning_rate": 0.0005025122520420069, |
| "loss": 3.3733, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.16326530612245, |
| "grad_norm": 0.35828736424446106, |
| "learning_rate": 0.0005023372228704784, |
| "loss": 3.3729, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.16326530612245, |
| "eval_accuracy": 0.3654113826895875, |
| "eval_loss": 3.5947518348693848, |
| "eval_runtime": 183.5384, |
| "eval_samples_per_second": 90.575, |
| "eval_steps_per_second": 5.661, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.177842565597668, |
| "grad_norm": 0.3347868323326111, |
| "learning_rate": 0.0005021621936989498, |
| "loss": 3.3855, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.192419825072886, |
| "grad_norm": 0.3392854928970337, |
| "learning_rate": 0.0005019871645274213, |
| "loss": 3.3912, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.206997084548105, |
| "grad_norm": 0.3293224573135376, |
| "learning_rate": 0.0005018121353558926, |
| "loss": 3.3988, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.221574344023324, |
| "grad_norm": 0.34300434589385986, |
| "learning_rate": 0.000501637106184364, |
| "loss": 3.3908, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.236151603498543, |
| "grad_norm": 0.3153841197490692, |
| "learning_rate": 0.0005014620770128354, |
| "loss": 3.4038, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.25072886297376, |
| "grad_norm": 0.3276742994785309, |
| "learning_rate": 0.0005012870478413069, |
| "loss": 3.3991, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.26530612244898, |
| "grad_norm": 0.338752806186676, |
| "learning_rate": 0.0005011120186697782, |
| "loss": 3.403, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.279883381924199, |
| "grad_norm": 0.33559349179267883, |
| "learning_rate": 0.0005009369894982496, |
| "loss": 3.3886, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.294460641399416, |
| "grad_norm": 0.33547982573509216, |
| "learning_rate": 0.0005007619603267211, |
| "loss": 3.406, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.309037900874635, |
| "grad_norm": 0.38501545786857605, |
| "learning_rate": 0.0005005869311551925, |
| "loss": 3.3903, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.323615160349854, |
| "grad_norm": 0.305745393037796, |
| "learning_rate": 0.000500411901983664, |
| "loss": 3.4011, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.338192419825074, |
| "grad_norm": 0.3493766188621521, |
| "learning_rate": 0.0005002368728121353, |
| "loss": 3.4158, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.352769679300291, |
| "grad_norm": 0.3287374675273895, |
| "learning_rate": 0.0005000618436406068, |
| "loss": 3.4062, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.36734693877551, |
| "grad_norm": 0.3492163419723511, |
| "learning_rate": 0.0004998868144690782, |
| "loss": 3.4161, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.38192419825073, |
| "grad_norm": 0.3114756941795349, |
| "learning_rate": 0.0004997117852975495, |
| "loss": 3.3955, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.396501457725947, |
| "grad_norm": 0.3263934254646301, |
| "learning_rate": 0.0004995367561260209, |
| "loss": 3.3945, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.411078717201166, |
| "grad_norm": 0.3255505859851837, |
| "learning_rate": 0.0004993617269544924, |
| "loss": 3.4063, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.425655976676385, |
| "grad_norm": 0.3421190083026886, |
| "learning_rate": 0.0004991866977829638, |
| "loss": 3.4224, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.440233236151604, |
| "grad_norm": 0.34055987000465393, |
| "learning_rate": 0.0004990116686114351, |
| "loss": 3.4091, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.454810495626822, |
| "grad_norm": 0.30603712797164917, |
| "learning_rate": 0.0004988366394399066, |
| "loss": 3.4161, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.454810495626822, |
| "eval_accuracy": 0.36562303999216467, |
| "eval_loss": 3.589662790298462, |
| "eval_runtime": 183.1607, |
| "eval_samples_per_second": 90.762, |
| "eval_steps_per_second": 5.673, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.46938775510204, |
| "grad_norm": 0.32788148522377014, |
| "learning_rate": 0.000498661610268378, |
| "loss": 3.4013, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.48396501457726, |
| "grad_norm": 0.3365269899368286, |
| "learning_rate": 0.0004984865810968494, |
| "loss": 3.4032, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.498542274052479, |
| "grad_norm": 0.3457231819629669, |
| "learning_rate": 0.0004983115519253209, |
| "loss": 3.42, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.513119533527696, |
| "grad_norm": 0.32088690996170044, |
| "learning_rate": 0.0004981365227537922, |
| "loss": 3.4214, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.527696793002915, |
| "grad_norm": 0.3353572189807892, |
| "learning_rate": 0.0004979614935822637, |
| "loss": 3.43, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.542274052478135, |
| "grad_norm": 0.3345482349395752, |
| "learning_rate": 0.0004977864644107351, |
| "loss": 3.4276, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.556851311953352, |
| "grad_norm": 0.3418853282928467, |
| "learning_rate": 0.0004976114352392066, |
| "loss": 3.4262, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "grad_norm": 0.3535926938056946, |
| "learning_rate": 0.0004974364060676779, |
| "loss": 3.4243, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.58600583090379, |
| "grad_norm": 0.3353271186351776, |
| "learning_rate": 0.0004972613768961493, |
| "loss": 3.4257, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.60058309037901, |
| "grad_norm": 0.33887091279029846, |
| "learning_rate": 0.0004970863477246207, |
| "loss": 3.4173, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.615160349854227, |
| "grad_norm": 0.3468446433544159, |
| "learning_rate": 0.0004969113185530922, |
| "loss": 3.4244, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.629737609329446, |
| "grad_norm": 0.33011898398399353, |
| "learning_rate": 0.0004967362893815635, |
| "loss": 3.4207, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.644314868804665, |
| "grad_norm": 0.32781869173049927, |
| "learning_rate": 0.0004965612602100349, |
| "loss": 3.4287, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.658892128279884, |
| "grad_norm": 0.3354378342628479, |
| "learning_rate": 0.0004963862310385064, |
| "loss": 3.4278, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.673469387755102, |
| "grad_norm": 0.32178574800491333, |
| "learning_rate": 0.0004962112018669778, |
| "loss": 3.4176, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.68804664723032, |
| "grad_norm": 0.3298463523387909, |
| "learning_rate": 0.0004960361726954492, |
| "loss": 3.4381, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.70262390670554, |
| "grad_norm": 0.3558914065361023, |
| "learning_rate": 0.0004958611435239206, |
| "loss": 3.4415, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.717201166180757, |
| "grad_norm": 0.33775535225868225, |
| "learning_rate": 0.0004956861143523921, |
| "loss": 3.4368, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.731778425655977, |
| "grad_norm": 0.33008068799972534, |
| "learning_rate": 0.0004955110851808635, |
| "loss": 3.4347, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.746355685131196, |
| "grad_norm": 0.3202441930770874, |
| "learning_rate": 0.0004953360560093348, |
| "loss": 3.4295, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.746355685131196, |
| "eval_accuracy": 0.3664582505382076, |
| "eval_loss": 3.5774459838867188, |
| "eval_runtime": 183.0331, |
| "eval_samples_per_second": 90.825, |
| "eval_steps_per_second": 5.677, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.760932944606415, |
| "grad_norm": 0.33541247248649597, |
| "learning_rate": 0.0004951610268378062, |
| "loss": 3.4232, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.775510204081632, |
| "grad_norm": 0.3246397376060486, |
| "learning_rate": 0.0004949859976662777, |
| "loss": 3.4325, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.790087463556851, |
| "grad_norm": 0.3221427798271179, |
| "learning_rate": 0.0004948109684947491, |
| "loss": 3.4291, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.80466472303207, |
| "grad_norm": 0.3057732880115509, |
| "learning_rate": 0.0004946359393232204, |
| "loss": 3.4421, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.819241982507288, |
| "grad_norm": 0.32524847984313965, |
| "learning_rate": 0.0004944609101516919, |
| "loss": 3.4218, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.833819241982507, |
| "grad_norm": 0.33912190794944763, |
| "learning_rate": 0.0004942858809801633, |
| "loss": 3.4362, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.848396501457726, |
| "grad_norm": 0.33464741706848145, |
| "learning_rate": 0.0004941108518086348, |
| "loss": 3.4409, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.862973760932945, |
| "grad_norm": 0.3329154849052429, |
| "learning_rate": 0.0004939358226371061, |
| "loss": 3.4305, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.877551020408163, |
| "grad_norm": 0.34702932834625244, |
| "learning_rate": 0.0004937607934655776, |
| "loss": 3.4224, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.892128279883382, |
| "grad_norm": 0.33861494064331055, |
| "learning_rate": 0.000493585764294049, |
| "loss": 3.4326, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.906705539358601, |
| "grad_norm": 0.3515016436576843, |
| "learning_rate": 0.0004934107351225204, |
| "loss": 3.443, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.921282798833818, |
| "grad_norm": 0.32505327463150024, |
| "learning_rate": 0.0004932357059509919, |
| "loss": 3.4282, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.935860058309038, |
| "grad_norm": 0.3279268741607666, |
| "learning_rate": 0.0004930606767794632, |
| "loss": 3.4436, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.950437317784257, |
| "grad_norm": 0.32305216789245605, |
| "learning_rate": 0.0004928856476079346, |
| "loss": 3.4385, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.965014577259476, |
| "grad_norm": 0.32832610607147217, |
| "learning_rate": 0.000492710618436406, |
| "loss": 3.4272, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.979591836734693, |
| "grad_norm": 0.35155633091926575, |
| "learning_rate": 0.0004925355892648775, |
| "loss": 3.4509, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.994169096209912, |
| "grad_norm": 0.32218649983406067, |
| "learning_rate": 0.0004923605600933488, |
| "loss": 3.4463, |
| "step": 30850 |
| }, |
| { |
| "epoch": 9.008746355685131, |
| "grad_norm": 0.3552287817001343, |
| "learning_rate": 0.0004921855309218202, |
| "loss": 3.3747, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.02332361516035, |
| "grad_norm": 0.31643298268318176, |
| "learning_rate": 0.0004920105017502917, |
| "loss": 3.3176, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.037900874635568, |
| "grad_norm": 0.3509899079799652, |
| "learning_rate": 0.000491835472578763, |
| "loss": 3.3256, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.037900874635568, |
| "eval_accuracy": 0.3664020989623848, |
| "eval_loss": 3.5828676223754883, |
| "eval_runtime": 182.7157, |
| "eval_samples_per_second": 90.983, |
| "eval_steps_per_second": 5.686, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.052478134110787, |
| "grad_norm": 0.34235477447509766, |
| "learning_rate": 0.0004916604434072345, |
| "loss": 3.341, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.067055393586006, |
| "grad_norm": 0.35295766592025757, |
| "learning_rate": 0.0004914854142357059, |
| "loss": 3.3398, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.081632653061224, |
| "grad_norm": 0.3388746380805969, |
| "learning_rate": 0.0004913103850641774, |
| "loss": 3.3436, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.096209912536443, |
| "grad_norm": 0.37326183915138245, |
| "learning_rate": 0.0004911353558926488, |
| "loss": 3.3369, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.110787172011662, |
| "grad_norm": 0.33712223172187805, |
| "learning_rate": 0.0004909603267211201, |
| "loss": 3.3502, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.125364431486881, |
| "grad_norm": 0.32785558700561523, |
| "learning_rate": 0.0004907852975495915, |
| "loss": 3.3526, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.139941690962099, |
| "grad_norm": 0.34080231189727783, |
| "learning_rate": 0.000490610268378063, |
| "loss": 3.3697, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.154518950437318, |
| "grad_norm": 0.3284015357494354, |
| "learning_rate": 0.0004904352392065344, |
| "loss": 3.3574, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.169096209912537, |
| "grad_norm": 0.34510546922683716, |
| "learning_rate": 0.0004902602100350057, |
| "loss": 3.3552, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.183673469387756, |
| "grad_norm": 0.3248676657676697, |
| "learning_rate": 0.0004900851808634772, |
| "loss": 3.3581, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.198250728862973, |
| "grad_norm": 0.3320048749446869, |
| "learning_rate": 0.0004899101516919486, |
| "loss": 3.3597, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.212827988338192, |
| "grad_norm": 0.3347003757953644, |
| "learning_rate": 0.0004897351225204201, |
| "loss": 3.3533, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.227405247813412, |
| "grad_norm": 0.34604883193969727, |
| "learning_rate": 0.0004895600933488914, |
| "loss": 3.3575, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.241982507288629, |
| "grad_norm": 0.33016645908355713, |
| "learning_rate": 0.0004893850641773629, |
| "loss": 3.3726, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.256559766763848, |
| "grad_norm": 0.34527072310447693, |
| "learning_rate": 0.0004892100350058343, |
| "loss": 3.3707, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.271137026239067, |
| "grad_norm": 0.33581438660621643, |
| "learning_rate": 0.0004890350058343057, |
| "loss": 3.3656, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "grad_norm": 0.33454182744026184, |
| "learning_rate": 0.0004888599766627771, |
| "loss": 3.3743, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.300291545189504, |
| "grad_norm": 0.3457191586494446, |
| "learning_rate": 0.0004886849474912485, |
| "loss": 3.3819, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.314868804664723, |
| "grad_norm": 0.36336174607276917, |
| "learning_rate": 0.0004885099183197199, |
| "loss": 3.3725, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.329446064139942, |
| "grad_norm": 0.3674074113368988, |
| "learning_rate": 0.0004883348891481913, |
| "loss": 3.3733, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.329446064139942, |
| "eval_accuracy": 0.36651228318664075, |
| "eval_loss": 3.5811729431152344, |
| "eval_runtime": 182.8386, |
| "eval_samples_per_second": 90.922, |
| "eval_steps_per_second": 5.683, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.34402332361516, |
| "grad_norm": 0.3265478312969208, |
| "learning_rate": 0.00048815985997666274, |
| "loss": 3.3797, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.358600583090379, |
| "grad_norm": 0.36413219571113586, |
| "learning_rate": 0.00048798483080513417, |
| "loss": 3.3817, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.373177842565598, |
| "grad_norm": 0.36710673570632935, |
| "learning_rate": 0.00048780980163360554, |
| "loss": 3.3872, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.387755102040817, |
| "grad_norm": 0.3260069191455841, |
| "learning_rate": 0.0004876347724620769, |
| "loss": 3.3867, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.402332361516034, |
| "grad_norm": 0.34948107600212097, |
| "learning_rate": 0.0004874597432905484, |
| "loss": 3.388, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.416909620991254, |
| "grad_norm": 0.3187805712223053, |
| "learning_rate": 0.00048728471411901977, |
| "loss": 3.385, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.431486880466473, |
| "grad_norm": 0.3865429759025574, |
| "learning_rate": 0.0004871096849474912, |
| "loss": 3.3729, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.446064139941692, |
| "grad_norm": 0.3081527352333069, |
| "learning_rate": 0.0004869346557759626, |
| "loss": 3.3893, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.46064139941691, |
| "grad_norm": 0.33219993114471436, |
| "learning_rate": 0.00048675962660443405, |
| "loss": 3.3854, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.475218658892128, |
| "grad_norm": 0.3221242427825928, |
| "learning_rate": 0.0004865845974329054, |
| "loss": 3.4055, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.489795918367347, |
| "grad_norm": 0.33291497826576233, |
| "learning_rate": 0.00048640956826137685, |
| "loss": 3.3888, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.504373177842565, |
| "grad_norm": 0.35030466318130493, |
| "learning_rate": 0.0004862345390898483, |
| "loss": 3.4062, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.518950437317784, |
| "grad_norm": 0.33631065487861633, |
| "learning_rate": 0.0004860595099183197, |
| "loss": 3.3959, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.533527696793003, |
| "grad_norm": 0.3579002618789673, |
| "learning_rate": 0.0004858844807467911, |
| "loss": 3.3959, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.548104956268222, |
| "grad_norm": 0.3267410099506378, |
| "learning_rate": 0.00048570945157526256, |
| "loss": 3.4049, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.56268221574344, |
| "grad_norm": 0.329932302236557, |
| "learning_rate": 0.00048553442240373393, |
| "loss": 3.3952, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.577259475218659, |
| "grad_norm": 0.33361557126045227, |
| "learning_rate": 0.0004853593932322053, |
| "loss": 3.391, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.591836734693878, |
| "grad_norm": 0.37531715631484985, |
| "learning_rate": 0.00048518436406067673, |
| "loss": 3.4024, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.606413994169095, |
| "grad_norm": 0.3536641597747803, |
| "learning_rate": 0.00048500933488914816, |
| "loss": 3.4074, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.620991253644315, |
| "grad_norm": 0.33492809534072876, |
| "learning_rate": 0.0004848343057176196, |
| "loss": 3.4062, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.620991253644315, |
| "eval_accuracy": 0.36730181907562026, |
| "eval_loss": 3.5756776332855225, |
| "eval_runtime": 182.9487, |
| "eval_samples_per_second": 90.867, |
| "eval_steps_per_second": 5.679, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.635568513119534, |
| "grad_norm": 0.34546518325805664, |
| "learning_rate": 0.00048465927654609095, |
| "loss": 3.3993, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.650145772594753, |
| "grad_norm": 0.3619083762168884, |
| "learning_rate": 0.00048448424737456244, |
| "loss": 3.4156, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.66472303206997, |
| "grad_norm": 0.331717848777771, |
| "learning_rate": 0.0004843092182030338, |
| "loss": 3.4023, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.67930029154519, |
| "grad_norm": 0.3297898471355438, |
| "learning_rate": 0.0004841341890315052, |
| "loss": 3.4045, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.693877551020408, |
| "grad_norm": 0.30799806118011475, |
| "learning_rate": 0.0004839591598599766, |
| "loss": 3.3953, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.708454810495628, |
| "grad_norm": 0.34676632285118103, |
| "learning_rate": 0.00048378413068844803, |
| "loss": 3.4142, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.723032069970845, |
| "grad_norm": 0.32712289690971375, |
| "learning_rate": 0.00048360910151691946, |
| "loss": 3.411, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.737609329446064, |
| "grad_norm": 0.36372727155685425, |
| "learning_rate": 0.00048343407234539083, |
| "loss": 3.3969, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.752186588921283, |
| "grad_norm": 0.3348037898540497, |
| "learning_rate": 0.00048325904317386226, |
| "loss": 3.4057, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.7667638483965, |
| "grad_norm": 0.3365459442138672, |
| "learning_rate": 0.0004830840140023337, |
| "loss": 3.4077, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.78134110787172, |
| "grad_norm": 0.3372909426689148, |
| "learning_rate": 0.0004829089848308051, |
| "loss": 3.4113, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.795918367346939, |
| "grad_norm": 0.34129634499549866, |
| "learning_rate": 0.0004827339556592765, |
| "loss": 3.4069, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.810495626822158, |
| "grad_norm": 0.3516503870487213, |
| "learning_rate": 0.00048255892648774797, |
| "loss": 3.4067, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.825072886297376, |
| "grad_norm": 0.3345436453819275, |
| "learning_rate": 0.00048238389731621934, |
| "loss": 3.4148, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.839650145772595, |
| "grad_norm": 0.32488852739334106, |
| "learning_rate": 0.0004822088681446907, |
| "loss": 3.4156, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.854227405247814, |
| "grad_norm": 0.32768747210502625, |
| "learning_rate": 0.00048203383897316214, |
| "loss": 3.4029, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.868804664723033, |
| "grad_norm": 0.3539787828922272, |
| "learning_rate": 0.00048185880980163357, |
| "loss": 3.409, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.88338192419825, |
| "grad_norm": 0.3092935085296631, |
| "learning_rate": 0.000481683780630105, |
| "loss": 3.4199, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.89795918367347, |
| "grad_norm": 0.34841668605804443, |
| "learning_rate": 0.00048150875145857637, |
| "loss": 3.4128, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.912536443148689, |
| "grad_norm": 0.3587806522846222, |
| "learning_rate": 0.00048133372228704785, |
| "loss": 3.4021, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.912536443148689, |
| "eval_accuracy": 0.3678635702702245, |
| "eval_loss": 3.563772678375244, |
| "eval_runtime": 181.4798, |
| "eval_samples_per_second": 91.602, |
| "eval_steps_per_second": 5.725, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.927113702623906, |
| "grad_norm": 0.32694658637046814, |
| "learning_rate": 0.0004811586931155192, |
| "loss": 3.4201, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.941690962099125, |
| "grad_norm": 0.33355414867401123, |
| "learning_rate": 0.0004809836639439906, |
| "loss": 3.4142, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.956268221574344, |
| "grad_norm": 0.3344906270503998, |
| "learning_rate": 0.000480808634772462, |
| "loss": 3.4114, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.970845481049563, |
| "grad_norm": 0.31297606229782104, |
| "learning_rate": 0.00048063360560093345, |
| "loss": 3.4114, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.98542274052478, |
| "grad_norm": 0.34211552143096924, |
| "learning_rate": 0.00048045857642940487, |
| "loss": 3.4133, |
| "step": 34250 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.347125768661499, |
| "learning_rate": 0.00048028354725787625, |
| "loss": 3.4196, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.01457725947522, |
| "grad_norm": 0.3528343737125397, |
| "learning_rate": 0.0004801085180863477, |
| "loss": 3.2922, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.029154518950437, |
| "grad_norm": 0.3449252247810364, |
| "learning_rate": 0.0004799334889148191, |
| "loss": 3.3206, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.043731778425656, |
| "grad_norm": 0.3424980640411377, |
| "learning_rate": 0.0004797584597432905, |
| "loss": 3.3048, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.058309037900875, |
| "grad_norm": 0.348306804895401, |
| "learning_rate": 0.0004795834305717619, |
| "loss": 3.3034, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.072886297376094, |
| "grad_norm": 0.3183503746986389, |
| "learning_rate": 0.0004794084014002334, |
| "loss": 3.3098, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.087463556851311, |
| "grad_norm": 0.34545329213142395, |
| "learning_rate": 0.00047923337222870475, |
| "loss": 3.3213, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.10204081632653, |
| "grad_norm": 0.3453138470649719, |
| "learning_rate": 0.0004790583430571761, |
| "loss": 3.335, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.11661807580175, |
| "grad_norm": 0.32295459508895874, |
| "learning_rate": 0.00047888331388564755, |
| "loss": 3.329, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.131195335276967, |
| "grad_norm": 0.32609206438064575, |
| "learning_rate": 0.000478708284714119, |
| "loss": 3.3368, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.145772594752186, |
| "grad_norm": 0.32333531975746155, |
| "learning_rate": 0.0004785332555425904, |
| "loss": 3.3365, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.160349854227405, |
| "grad_norm": 0.31967732310295105, |
| "learning_rate": 0.0004783582263710618, |
| "loss": 3.3437, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.174927113702624, |
| "grad_norm": 0.3523292541503906, |
| "learning_rate": 0.00047818319719953326, |
| "loss": 3.3288, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.189504373177842, |
| "grad_norm": 0.3372192084789276, |
| "learning_rate": 0.00047800816802800463, |
| "loss": 3.3424, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.204081632653061, |
| "grad_norm": 0.3291529715061188, |
| "learning_rate": 0.000477833138856476, |
| "loss": 3.3341, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.204081632653061, |
| "eval_accuracy": 0.36714042743945047, |
| "eval_loss": 3.5801405906677246, |
| "eval_runtime": 188.68, |
| "eval_samples_per_second": 88.107, |
| "eval_steps_per_second": 5.507, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.21865889212828, |
| "grad_norm": 0.3474276661872864, |
| "learning_rate": 0.00047765810968494743, |
| "loss": 3.3483, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.2332361516035, |
| "grad_norm": 0.330358624458313, |
| "learning_rate": 0.00047748308051341886, |
| "loss": 3.3368, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.247813411078717, |
| "grad_norm": 0.38131415843963623, |
| "learning_rate": 0.0004773080513418903, |
| "loss": 3.3401, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.262390670553936, |
| "grad_norm": 0.33875641226768494, |
| "learning_rate": 0.00047713302217036166, |
| "loss": 3.3439, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.276967930029155, |
| "grad_norm": 0.34253689646720886, |
| "learning_rate": 0.00047695799299883314, |
| "loss": 3.3633, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.291545189504372, |
| "grad_norm": 0.3439371585845947, |
| "learning_rate": 0.0004767829638273045, |
| "loss": 3.3574, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.306122448979592, |
| "grad_norm": 0.33233463764190674, |
| "learning_rate": 0.00047660793465577594, |
| "loss": 3.3527, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.32069970845481, |
| "grad_norm": 0.3227967321872711, |
| "learning_rate": 0.0004764329054842473, |
| "loss": 3.3656, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.33527696793003, |
| "grad_norm": 0.3533599078655243, |
| "learning_rate": 0.0004762578763127188, |
| "loss": 3.3613, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.349854227405247, |
| "grad_norm": 0.36060336232185364, |
| "learning_rate": 0.00047608284714119016, |
| "loss": 3.3651, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.364431486880466, |
| "grad_norm": 0.33997446298599243, |
| "learning_rate": 0.00047590781796966154, |
| "loss": 3.3645, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.379008746355685, |
| "grad_norm": 0.3344101309776306, |
| "learning_rate": 0.000475732788798133, |
| "loss": 3.3598, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.393586005830905, |
| "grad_norm": 0.38466399908065796, |
| "learning_rate": 0.0004755577596266044, |
| "loss": 3.3571, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.408163265306122, |
| "grad_norm": 0.3226321041584015, |
| "learning_rate": 0.0004753827304550758, |
| "loss": 3.3667, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.422740524781341, |
| "grad_norm": 0.3315838575363159, |
| "learning_rate": 0.0004752077012835472, |
| "loss": 3.3521, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.43731778425656, |
| "grad_norm": 0.31704387068748474, |
| "learning_rate": 0.00047503267211201867, |
| "loss": 3.3738, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.451895043731778, |
| "grad_norm": 0.3453398048877716, |
| "learning_rate": 0.00047485764294049004, |
| "loss": 3.3588, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.466472303206997, |
| "grad_norm": 0.3315698802471161, |
| "learning_rate": 0.0004746826137689614, |
| "loss": 3.375, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.481049562682216, |
| "grad_norm": 0.31997567415237427, |
| "learning_rate": 0.00047450758459743284, |
| "loss": 3.3575, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.495626822157435, |
| "grad_norm": 0.3380381166934967, |
| "learning_rate": 0.00047433255542590427, |
| "loss": 3.3611, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.495626822157435, |
| "eval_accuracy": 0.36805368514434134, |
| "eval_loss": 3.5723817348480225, |
| "eval_runtime": 181.6702, |
| "eval_samples_per_second": 91.506, |
| "eval_steps_per_second": 5.719, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.510204081632653, |
| "grad_norm": 0.3401227295398712, |
| "learning_rate": 0.0004741575262543757, |
| "loss": 3.3722, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.524781341107872, |
| "grad_norm": 0.3410283625125885, |
| "learning_rate": 0.00047398249708284707, |
| "loss": 3.3738, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.53935860058309, |
| "grad_norm": 0.3501899540424347, |
| "learning_rate": 0.00047380746791131855, |
| "loss": 3.3763, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.55393586005831, |
| "grad_norm": 0.33715593814849854, |
| "learning_rate": 0.0004736324387397899, |
| "loss": 3.3773, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.568513119533527, |
| "grad_norm": 0.33864983916282654, |
| "learning_rate": 0.00047345740956826135, |
| "loss": 3.3685, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.583090379008746, |
| "grad_norm": 0.3323419690132141, |
| "learning_rate": 0.0004732823803967327, |
| "loss": 3.3754, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.597667638483966, |
| "grad_norm": 0.33451366424560547, |
| "learning_rate": 0.0004731073512252042, |
| "loss": 3.3812, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.612244897959183, |
| "grad_norm": 0.33620506525039673, |
| "learning_rate": 0.0004729323220536756, |
| "loss": 3.3679, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.626822157434402, |
| "grad_norm": 0.35062161087989807, |
| "learning_rate": 0.00047275729288214695, |
| "loss": 3.3713, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.641399416909621, |
| "grad_norm": 0.34864264726638794, |
| "learning_rate": 0.00047258226371061843, |
| "loss": 3.3778, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.65597667638484, |
| "grad_norm": 0.3500450849533081, |
| "learning_rate": 0.0004724072345390898, |
| "loss": 3.385, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.670553935860058, |
| "grad_norm": 0.3300856947898865, |
| "learning_rate": 0.00047223220536756123, |
| "loss": 3.3774, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.685131195335277, |
| "grad_norm": 0.3436342179775238, |
| "learning_rate": 0.0004720571761960326, |
| "loss": 3.3858, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.699708454810496, |
| "grad_norm": 0.3223649561405182, |
| "learning_rate": 0.0004718821470245041, |
| "loss": 3.39, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.714285714285714, |
| "grad_norm": 0.3425664007663727, |
| "learning_rate": 0.00047170711785297545, |
| "loss": 3.3773, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.728862973760933, |
| "grad_norm": 0.35593581199645996, |
| "learning_rate": 0.00047153208868144683, |
| "loss": 3.3936, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.743440233236152, |
| "grad_norm": 0.3750920295715332, |
| "learning_rate": 0.0004713570595099183, |
| "loss": 3.3842, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.758017492711371, |
| "grad_norm": 0.3294471800327301, |
| "learning_rate": 0.0004711820303383897, |
| "loss": 3.3889, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.772594752186588, |
| "grad_norm": 0.3309391140937805, |
| "learning_rate": 0.0004710070011668611, |
| "loss": 3.4021, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.787172011661808, |
| "grad_norm": 0.37308335304260254, |
| "learning_rate": 0.0004708319719953325, |
| "loss": 3.3845, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.787172011661808, |
| "eval_accuracy": 0.36864180521312645, |
| "eval_loss": 3.5626156330108643, |
| "eval_runtime": 182.1455, |
| "eval_samples_per_second": 91.268, |
| "eval_steps_per_second": 5.704, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.801749271137027, |
| "grad_norm": 0.34260591864585876, |
| "learning_rate": 0.00047065694282380396, |
| "loss": 3.3928, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.816326530612244, |
| "grad_norm": 0.3432435691356659, |
| "learning_rate": 0.00047048191365227533, |
| "loss": 3.3851, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.830903790087463, |
| "grad_norm": 0.3411959111690521, |
| "learning_rate": 0.00047030688448074676, |
| "loss": 3.3789, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.845481049562682, |
| "grad_norm": 0.3398871123790741, |
| "learning_rate": 0.00047013185530921813, |
| "loss": 3.3843, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.860058309037901, |
| "grad_norm": 0.33319368958473206, |
| "learning_rate": 0.0004699568261376896, |
| "loss": 3.3983, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.874635568513119, |
| "grad_norm": 0.48678529262542725, |
| "learning_rate": 0.000469781796966161, |
| "loss": 3.3963, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.889212827988338, |
| "grad_norm": 0.33169659972190857, |
| "learning_rate": 0.00046960676779463236, |
| "loss": 3.394, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.903790087463557, |
| "grad_norm": 0.3148620128631592, |
| "learning_rate": 0.00046943173862310384, |
| "loss": 3.3961, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.918367346938776, |
| "grad_norm": 0.3453501760959625, |
| "learning_rate": 0.0004692567094515752, |
| "loss": 3.4005, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.932944606413994, |
| "grad_norm": 0.36119741201400757, |
| "learning_rate": 0.00046908168028004664, |
| "loss": 3.3911, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.947521865889213, |
| "grad_norm": 0.3385680019855499, |
| "learning_rate": 0.000468906651108518, |
| "loss": 3.3792, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.962099125364432, |
| "grad_norm": 0.30895158648490906, |
| "learning_rate": 0.0004687316219369895, |
| "loss": 3.3978, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.97667638483965, |
| "grad_norm": 0.3564043939113617, |
| "learning_rate": 0.00046855659276546087, |
| "loss": 3.388, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.991253644314869, |
| "grad_norm": 0.3299904465675354, |
| "learning_rate": 0.00046838156359393224, |
| "loss": 3.3916, |
| "step": 37700 |
| }, |
| { |
| "epoch": 11.005830903790088, |
| "grad_norm": 0.3511546850204468, |
| "learning_rate": 0.0004682065344224037, |
| "loss": 3.3494, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.020408163265307, |
| "grad_norm": 0.37850990891456604, |
| "learning_rate": 0.0004680315052508751, |
| "loss": 3.289, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.034985422740524, |
| "grad_norm": 0.3183746933937073, |
| "learning_rate": 0.0004678564760793465, |
| "loss": 3.2999, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.049562682215743, |
| "grad_norm": 0.3424279987812042, |
| "learning_rate": 0.0004676814469078179, |
| "loss": 3.3011, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.064139941690962, |
| "grad_norm": 0.34383082389831543, |
| "learning_rate": 0.00046750641773628937, |
| "loss": 3.2857, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.078717201166182, |
| "grad_norm": 0.34979602694511414, |
| "learning_rate": 0.00046733138856476075, |
| "loss": 3.2814, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.078717201166182, |
| "eval_accuracy": 0.36815092036788344, |
| "eval_loss": 3.574191093444824, |
| "eval_runtime": 182.0074, |
| "eval_samples_per_second": 91.337, |
| "eval_steps_per_second": 5.709, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.093294460641399, |
| "grad_norm": 0.3627917170524597, |
| "learning_rate": 0.00046715635939323217, |
| "loss": 3.2924, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.107871720116618, |
| "grad_norm": 0.36935657262802124, |
| "learning_rate": 0.0004669813302217036, |
| "loss": 3.3067, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.122448979591837, |
| "grad_norm": 0.3449645936489105, |
| "learning_rate": 0.000466806301050175, |
| "loss": 3.3012, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.137026239067055, |
| "grad_norm": 0.3319040834903717, |
| "learning_rate": 0.0004666312718786464, |
| "loss": 3.3077, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.151603498542274, |
| "grad_norm": 0.3427700102329254, |
| "learning_rate": 0.00046645624270711777, |
| "loss": 3.3209, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.166180758017493, |
| "grad_norm": 0.3540455996990204, |
| "learning_rate": 0.00046628121353558925, |
| "loss": 3.3128, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.180758017492712, |
| "grad_norm": 0.34144529700279236, |
| "learning_rate": 0.0004661061843640606, |
| "loss": 3.3151, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.19533527696793, |
| "grad_norm": 0.3432770371437073, |
| "learning_rate": 0.00046593115519253205, |
| "loss": 3.3182, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.209912536443149, |
| "grad_norm": 0.36312398314476013, |
| "learning_rate": 0.0004657561260210034, |
| "loss": 3.3341, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.224489795918368, |
| "grad_norm": 0.3511596620082855, |
| "learning_rate": 0.0004655810968494749, |
| "loss": 3.3246, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.239067055393585, |
| "grad_norm": 0.3314606249332428, |
| "learning_rate": 0.0004654060676779463, |
| "loss": 3.3157, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.253644314868804, |
| "grad_norm": 0.32473742961883545, |
| "learning_rate": 0.00046523103850641765, |
| "loss": 3.3226, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.268221574344023, |
| "grad_norm": 0.3840942084789276, |
| "learning_rate": 0.00046505600933488913, |
| "loss": 3.3321, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.282798833819243, |
| "grad_norm": 0.33107447624206543, |
| "learning_rate": 0.0004648809801633605, |
| "loss": 3.3308, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.29737609329446, |
| "grad_norm": 0.37443867325782776, |
| "learning_rate": 0.00046470595099183193, |
| "loss": 3.3342, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.31195335276968, |
| "grad_norm": 0.3259379267692566, |
| "learning_rate": 0.0004645309218203033, |
| "loss": 3.3435, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.326530612244898, |
| "grad_norm": 0.34641292691230774, |
| "learning_rate": 0.0004643558926487748, |
| "loss": 3.3338, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.341107871720117, |
| "grad_norm": 0.36578723788261414, |
| "learning_rate": 0.00046418086347724616, |
| "loss": 3.3361, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.355685131195335, |
| "grad_norm": 0.37702852487564087, |
| "learning_rate": 0.0004640058343057176, |
| "loss": 3.3376, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.370262390670554, |
| "grad_norm": 0.3479772210121155, |
| "learning_rate": 0.000463830805134189, |
| "loss": 3.3361, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.370262390670554, |
| "eval_accuracy": 0.36859189270128395, |
| "eval_loss": 3.5683250427246094, |
| "eval_runtime": 182.349, |
| "eval_samples_per_second": 91.166, |
| "eval_steps_per_second": 5.698, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.384839650145773, |
| "grad_norm": 0.35327914357185364, |
| "learning_rate": 0.00046365577596266044, |
| "loss": 3.3319, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.39941690962099, |
| "grad_norm": 0.3379618227481842, |
| "learning_rate": 0.0004634807467911318, |
| "loss": 3.3439, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.41399416909621, |
| "grad_norm": 0.34011024236679077, |
| "learning_rate": 0.0004633057176196032, |
| "loss": 3.3306, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.428571428571429, |
| "grad_norm": 0.3477155566215515, |
| "learning_rate": 0.00046313068844807466, |
| "loss": 3.3477, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.443148688046648, |
| "grad_norm": 0.3348955512046814, |
| "learning_rate": 0.00046295565927654604, |
| "loss": 3.3553, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.457725947521865, |
| "grad_norm": 0.3500663936138153, |
| "learning_rate": 0.00046278063010501746, |
| "loss": 3.3497, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.472303206997085, |
| "grad_norm": 0.3484138250350952, |
| "learning_rate": 0.0004626056009334889, |
| "loss": 3.3546, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.486880466472304, |
| "grad_norm": 0.3597988784313202, |
| "learning_rate": 0.0004624305717619603, |
| "loss": 3.3544, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.501457725947521, |
| "grad_norm": 0.33883535861968994, |
| "learning_rate": 0.0004622555425904317, |
| "loss": 3.3627, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.51603498542274, |
| "grad_norm": 0.3259831964969635, |
| "learning_rate": 0.00046208051341890306, |
| "loss": 3.3501, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.53061224489796, |
| "grad_norm": 0.3629702925682068, |
| "learning_rate": 0.00046190548424737454, |
| "loss": 3.3493, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.545189504373178, |
| "grad_norm": 0.3257094919681549, |
| "learning_rate": 0.0004617304550758459, |
| "loss": 3.3555, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.559766763848396, |
| "grad_norm": 0.3216487169265747, |
| "learning_rate": 0.00046155542590431734, |
| "loss": 3.3588, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.574344023323615, |
| "grad_norm": 0.3487645089626312, |
| "learning_rate": 0.0004613803967327887, |
| "loss": 3.3672, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.588921282798834, |
| "grad_norm": 0.3463019132614136, |
| "learning_rate": 0.0004612053675612602, |
| "loss": 3.3684, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.603498542274053, |
| "grad_norm": 0.36498206853866577, |
| "learning_rate": 0.00046103033838973157, |
| "loss": 3.3629, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.61807580174927, |
| "grad_norm": 0.3331190347671509, |
| "learning_rate": 0.000460855309218203, |
| "loss": 3.363, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.63265306122449, |
| "grad_norm": 0.36024343967437744, |
| "learning_rate": 0.0004606802800466744, |
| "loss": 3.3487, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.647230320699709, |
| "grad_norm": 0.3634866774082184, |
| "learning_rate": 0.00046050525087514585, |
| "loss": 3.3566, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.661807580174926, |
| "grad_norm": 0.34901607036590576, |
| "learning_rate": 0.0004603302217036172, |
| "loss": 3.3702, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.661807580174926, |
| "eval_accuracy": 0.36894645988446667, |
| "eval_loss": 3.562626361846924, |
| "eval_runtime": 182.1151, |
| "eval_samples_per_second": 91.283, |
| "eval_steps_per_second": 5.705, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.676384839650146, |
| "grad_norm": 0.33021676540374756, |
| "learning_rate": 0.0004601551925320886, |
| "loss": 3.3612, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.690962099125365, |
| "grad_norm": 0.33711355924606323, |
| "learning_rate": 0.0004599801633605601, |
| "loss": 3.3578, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.705539358600584, |
| "grad_norm": 0.3925802409648895, |
| "learning_rate": 0.00045980513418903145, |
| "loss": 3.3587, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.720116618075801, |
| "grad_norm": 0.3565693199634552, |
| "learning_rate": 0.0004596301050175029, |
| "loss": 3.366, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.73469387755102, |
| "grad_norm": 0.33789220452308655, |
| "learning_rate": 0.0004594550758459743, |
| "loss": 3.3658, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.74927113702624, |
| "grad_norm": 0.326738178730011, |
| "learning_rate": 0.00045928004667444573, |
| "loss": 3.3584, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.763848396501459, |
| "grad_norm": 0.34021317958831787, |
| "learning_rate": 0.0004591050175029171, |
| "loss": 3.373, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.778425655976676, |
| "grad_norm": 0.3672606348991394, |
| "learning_rate": 0.0004589299883313885, |
| "loss": 3.3564, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.793002915451895, |
| "grad_norm": 0.34738481044769287, |
| "learning_rate": 0.00045875495915985995, |
| "loss": 3.3651, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.807580174927114, |
| "grad_norm": 0.33744242787361145, |
| "learning_rate": 0.00045857992998833133, |
| "loss": 3.3682, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.822157434402332, |
| "grad_norm": 0.35602742433547974, |
| "learning_rate": 0.00045840490081680275, |
| "loss": 3.3661, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.83673469387755, |
| "grad_norm": 0.3444158434867859, |
| "learning_rate": 0.0004582298716452742, |
| "loss": 3.364, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.85131195335277, |
| "grad_norm": 0.342560738325119, |
| "learning_rate": 0.0004580548424737456, |
| "loss": 3.3817, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.86588921282799, |
| "grad_norm": 0.364653617143631, |
| "learning_rate": 0.000457879813302217, |
| "loss": 3.3652, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.880466472303207, |
| "grad_norm": 0.34066253900527954, |
| "learning_rate": 0.0004577047841306884, |
| "loss": 3.3719, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.895043731778426, |
| "grad_norm": 0.3448898196220398, |
| "learning_rate": 0.00045752975495915983, |
| "loss": 3.3707, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.909620991253645, |
| "grad_norm": 0.31567615270614624, |
| "learning_rate": 0.00045735472578763126, |
| "loss": 3.3814, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.924198250728862, |
| "grad_norm": 0.3457653820514679, |
| "learning_rate": 0.00045717969661610263, |
| "loss": 3.3668, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.938775510204081, |
| "grad_norm": 0.34771329164505005, |
| "learning_rate": 0.000457004667444574, |
| "loss": 3.377, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.9533527696793, |
| "grad_norm": 0.3419136106967926, |
| "learning_rate": 0.0004568296382730455, |
| "loss": 3.3664, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.9533527696793, |
| "eval_accuracy": 0.36959120240182775, |
| "eval_loss": 3.554091691970825, |
| "eval_runtime": 181.934, |
| "eval_samples_per_second": 91.374, |
| "eval_steps_per_second": 5.711, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.96793002915452, |
| "grad_norm": 0.3192198872566223, |
| "learning_rate": 0.00045665460910151686, |
| "loss": 3.3775, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.982507288629737, |
| "grad_norm": 0.36953675746917725, |
| "learning_rate": 0.0004564795799299883, |
| "loss": 3.3717, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.997084548104956, |
| "grad_norm": 0.3306615948677063, |
| "learning_rate": 0.0004563045507584597, |
| "loss": 3.377, |
| "step": 41150 |
| }, |
| { |
| "epoch": 12.011661807580175, |
| "grad_norm": 0.3414677083492279, |
| "learning_rate": 0.00045612952158693114, |
| "loss": 3.2798, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.026239067055394, |
| "grad_norm": 0.3555249869823456, |
| "learning_rate": 0.0004559544924154025, |
| "loss": 3.2622, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.040816326530612, |
| "grad_norm": 0.34148281812667847, |
| "learning_rate": 0.0004557794632438739, |
| "loss": 3.2654, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.055393586005831, |
| "grad_norm": 0.340695858001709, |
| "learning_rate": 0.00045560443407234537, |
| "loss": 3.28, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.06997084548105, |
| "grad_norm": 0.342260479927063, |
| "learning_rate": 0.00045542940490081674, |
| "loss": 3.2805, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.084548104956268, |
| "grad_norm": 0.3923518657684326, |
| "learning_rate": 0.00045525437572928817, |
| "loss": 3.2852, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.099125364431487, |
| "grad_norm": 0.331145703792572, |
| "learning_rate": 0.0004550793465577596, |
| "loss": 3.262, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.113702623906706, |
| "grad_norm": 0.35945871472358704, |
| "learning_rate": 0.000454904317386231, |
| "loss": 3.2814, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.128279883381925, |
| "grad_norm": 0.34431734681129456, |
| "learning_rate": 0.0004547292882147024, |
| "loss": 3.3014, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.142857142857142, |
| "grad_norm": 0.3369593322277069, |
| "learning_rate": 0.0004545542590431738, |
| "loss": 3.2962, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.157434402332362, |
| "grad_norm": 0.34433311223983765, |
| "learning_rate": 0.00045437922987164525, |
| "loss": 3.2765, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.17201166180758, |
| "grad_norm": 0.3524353802204132, |
| "learning_rate": 0.00045420420070011667, |
| "loss": 3.2916, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.186588921282798, |
| "grad_norm": 0.3536403477191925, |
| "learning_rate": 0.00045402917152858804, |
| "loss": 3.2998, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.201166180758017, |
| "grad_norm": 0.3326958119869232, |
| "learning_rate": 0.0004538541423570595, |
| "loss": 3.3098, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.215743440233236, |
| "grad_norm": 0.34114643931388855, |
| "learning_rate": 0.0004536791131855309, |
| "loss": 3.301, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.230320699708455, |
| "grad_norm": 0.37027707695961, |
| "learning_rate": 0.00045350408401400227, |
| "loss": 3.2952, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.244897959183673, |
| "grad_norm": 0.346375435590744, |
| "learning_rate": 0.0004533290548424737, |
| "loss": 3.3073, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.244897959183673, |
| "eval_accuracy": 0.3688454576788987, |
| "eval_loss": 3.5696043968200684, |
| "eval_runtime": 181.8129, |
| "eval_samples_per_second": 91.435, |
| "eval_steps_per_second": 5.715, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.259475218658892, |
| "grad_norm": 0.3549197316169739, |
| "learning_rate": 0.0004531540256709451, |
| "loss": 3.3238, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.274052478134111, |
| "grad_norm": 0.3610092103481293, |
| "learning_rate": 0.00045297899649941655, |
| "loss": 3.3197, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.28862973760933, |
| "grad_norm": 0.3515133857727051, |
| "learning_rate": 0.0004528039673278879, |
| "loss": 3.316, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.303206997084548, |
| "grad_norm": 0.3595082461833954, |
| "learning_rate": 0.0004526289381563593, |
| "loss": 3.3178, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.317784256559767, |
| "grad_norm": 0.34504249691963196, |
| "learning_rate": 0.0004524539089848308, |
| "loss": 3.3047, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.332361516034986, |
| "grad_norm": 0.33892661333084106, |
| "learning_rate": 0.00045227887981330215, |
| "loss": 3.3231, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.346938775510203, |
| "grad_norm": 0.33925503492355347, |
| "learning_rate": 0.0004521038506417736, |
| "loss": 3.3194, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.361516034985423, |
| "grad_norm": 0.3226408064365387, |
| "learning_rate": 0.000451928821470245, |
| "loss": 3.3263, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.376093294460642, |
| "grad_norm": 0.33827292919158936, |
| "learning_rate": 0.00045175379229871643, |
| "loss": 3.3174, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.39067055393586, |
| "grad_norm": 0.3600287437438965, |
| "learning_rate": 0.0004515787631271878, |
| "loss": 3.3228, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.405247813411078, |
| "grad_norm": 0.353061318397522, |
| "learning_rate": 0.00045140373395565923, |
| "loss": 3.3152, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.419825072886297, |
| "grad_norm": 0.36559176445007324, |
| "learning_rate": 0.00045122870478413066, |
| "loss": 3.3145, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.434402332361516, |
| "grad_norm": 0.34186482429504395, |
| "learning_rate": 0.0004510536756126021, |
| "loss": 3.341, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.448979591836734, |
| "grad_norm": 0.3551715016365051, |
| "learning_rate": 0.00045087864644107346, |
| "loss": 3.3345, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.463556851311953, |
| "grad_norm": 0.3601856529712677, |
| "learning_rate": 0.00045070361726954494, |
| "loss": 3.3279, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.478134110787172, |
| "grad_norm": 0.33436763286590576, |
| "learning_rate": 0.0004505285880980163, |
| "loss": 3.3244, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.492711370262391, |
| "grad_norm": 0.33347827196121216, |
| "learning_rate": 0.0004503535589264877, |
| "loss": 3.3318, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.507288629737609, |
| "grad_norm": 0.3343319892883301, |
| "learning_rate": 0.0004501785297549591, |
| "loss": 3.3318, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.521865889212828, |
| "grad_norm": 0.3466685712337494, |
| "learning_rate": 0.00045000350058343054, |
| "loss": 3.3361, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.536443148688047, |
| "grad_norm": 0.3229898512363434, |
| "learning_rate": 0.00044982847141190196, |
| "loss": 3.3408, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.536443148688047, |
| "eval_accuracy": 0.36942121733791145, |
| "eval_loss": 3.55837345123291, |
| "eval_runtime": 181.7545, |
| "eval_samples_per_second": 91.464, |
| "eval_steps_per_second": 5.717, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.551020408163264, |
| "grad_norm": 0.34018051624298096, |
| "learning_rate": 0.00044965344224037334, |
| "loss": 3.329, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.565597667638484, |
| "grad_norm": 0.36611273884773254, |
| "learning_rate": 0.0004494784130688448, |
| "loss": 3.337, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.580174927113703, |
| "grad_norm": 0.3752545714378357, |
| "learning_rate": 0.0004493033838973162, |
| "loss": 3.3445, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.594752186588922, |
| "grad_norm": 0.3641834259033203, |
| "learning_rate": 0.00044912835472578756, |
| "loss": 3.3453, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.60932944606414, |
| "grad_norm": 0.36704397201538086, |
| "learning_rate": 0.000448953325554259, |
| "loss": 3.341, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.623906705539358, |
| "grad_norm": 0.33914434909820557, |
| "learning_rate": 0.0004487782963827304, |
| "loss": 3.3488, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.638483965014577, |
| "grad_norm": 0.3278019428253174, |
| "learning_rate": 0.00044860326721120184, |
| "loss": 3.3478, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.653061224489797, |
| "grad_norm": 0.35495081543922424, |
| "learning_rate": 0.0004484282380396732, |
| "loss": 3.3384, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.667638483965014, |
| "grad_norm": 0.35847678780555725, |
| "learning_rate": 0.00044825320886814464, |
| "loss": 3.3517, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.682215743440233, |
| "grad_norm": 0.3521868884563446, |
| "learning_rate": 0.00044807817969661607, |
| "loss": 3.3402, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.696793002915452, |
| "grad_norm": 0.36590296030044556, |
| "learning_rate": 0.0004479031505250875, |
| "loss": 3.3511, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.71137026239067, |
| "grad_norm": 0.3575020432472229, |
| "learning_rate": 0.00044772812135355887, |
| "loss": 3.3492, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.725947521865889, |
| "grad_norm": 0.31945955753326416, |
| "learning_rate": 0.00044755309218203035, |
| "loss": 3.347, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.740524781341108, |
| "grad_norm": 0.3386148512363434, |
| "learning_rate": 0.0004473780630105017, |
| "loss": 3.3531, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.755102040816327, |
| "grad_norm": 0.3749040365219116, |
| "learning_rate": 0.0004472030338389731, |
| "loss": 3.3309, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.769679300291545, |
| "grad_norm": 0.35152819752693176, |
| "learning_rate": 0.0004470280046674445, |
| "loss": 3.353, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.784256559766764, |
| "grad_norm": 0.3418697416782379, |
| "learning_rate": 0.00044685297549591595, |
| "loss": 3.3484, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.798833819241983, |
| "grad_norm": 0.33918634057044983, |
| "learning_rate": 0.0004466779463243874, |
| "loss": 3.3562, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.813411078717202, |
| "grad_norm": 0.355820894241333, |
| "learning_rate": 0.00044650291715285875, |
| "loss": 3.3487, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.82798833819242, |
| "grad_norm": 0.339217871427536, |
| "learning_rate": 0.00044632788798133023, |
| "loss": 3.3521, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.82798833819242, |
| "eval_accuracy": 0.37001840170719624, |
| "eval_loss": 3.549464464187622, |
| "eval_runtime": 181.7706, |
| "eval_samples_per_second": 91.456, |
| "eval_steps_per_second": 5.716, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.842565597667638, |
| "grad_norm": 0.35124990344047546, |
| "learning_rate": 0.0004461528588098016, |
| "loss": 3.3483, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.857142857142858, |
| "grad_norm": 0.3328862190246582, |
| "learning_rate": 0.000445977829638273, |
| "loss": 3.3474, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.871720116618075, |
| "grad_norm": 0.330714613199234, |
| "learning_rate": 0.0004458028004667444, |
| "loss": 3.3629, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.886297376093294, |
| "grad_norm": 0.362414687871933, |
| "learning_rate": 0.00044562777129521583, |
| "loss": 3.3562, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.900874635568513, |
| "grad_norm": 0.3345976769924164, |
| "learning_rate": 0.00044545274212368725, |
| "loss": 3.3585, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.915451895043732, |
| "grad_norm": 0.40451279282569885, |
| "learning_rate": 0.0004452777129521586, |
| "loss": 3.3541, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.93002915451895, |
| "grad_norm": 0.3470134139060974, |
| "learning_rate": 0.0004451026837806301, |
| "loss": 3.3578, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.944606413994169, |
| "grad_norm": 0.3472599983215332, |
| "learning_rate": 0.0004449276546091015, |
| "loss": 3.3493, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.959183673469388, |
| "grad_norm": 0.3843700885772705, |
| "learning_rate": 0.0004447526254375729, |
| "loss": 3.3645, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.973760932944607, |
| "grad_norm": 0.3511066436767578, |
| "learning_rate": 0.0004445775962660443, |
| "loss": 3.3645, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.988338192419825, |
| "grad_norm": 0.3752990961074829, |
| "learning_rate": 0.00044440256709451576, |
| "loss": 3.3557, |
| "step": 44550 |
| }, |
| { |
| "epoch": 13.002915451895044, |
| "grad_norm": 0.35009706020355225, |
| "learning_rate": 0.00044422753792298713, |
| "loss": 3.3437, |
| "step": 44600 |
| }, |
| { |
| "epoch": 13.017492711370263, |
| "grad_norm": 0.36041513085365295, |
| "learning_rate": 0.0004440525087514585, |
| "loss": 3.25, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.03206997084548, |
| "grad_norm": 0.373749703168869, |
| "learning_rate": 0.00044387747957992993, |
| "loss": 3.2508, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.0466472303207, |
| "grad_norm": 0.34117427468299866, |
| "learning_rate": 0.00044370245040840136, |
| "loss": 3.2498, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.061224489795919, |
| "grad_norm": 0.3592318296432495, |
| "learning_rate": 0.0004435274212368728, |
| "loss": 3.2561, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.075801749271138, |
| "grad_norm": 0.3574276864528656, |
| "learning_rate": 0.00044335239206534416, |
| "loss": 3.2736, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.090379008746355, |
| "grad_norm": 0.35315731167793274, |
| "learning_rate": 0.00044317736289381564, |
| "loss": 3.267, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.104956268221574, |
| "grad_norm": 0.35178348422050476, |
| "learning_rate": 0.000443002333722287, |
| "loss": 3.2747, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.119533527696793, |
| "grad_norm": 0.4218001365661621, |
| "learning_rate": 0.0004428273045507584, |
| "loss": 3.2711, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.119533527696793, |
| "eval_accuracy": 0.36915187812306355, |
| "eval_loss": 3.5657756328582764, |
| "eval_runtime": 181.8618, |
| "eval_samples_per_second": 91.41, |
| "eval_steps_per_second": 5.713, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.13411078717201, |
| "grad_norm": 0.37173452973365784, |
| "learning_rate": 0.0004426522753792298, |
| "loss": 3.277, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.14868804664723, |
| "grad_norm": 0.34601107239723206, |
| "learning_rate": 0.00044247724620770124, |
| "loss": 3.2698, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.16326530612245, |
| "grad_norm": 0.37380507588386536, |
| "learning_rate": 0.00044230221703617267, |
| "loss": 3.2775, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.177842565597668, |
| "grad_norm": 0.34601885080337524, |
| "learning_rate": 0.00044212718786464404, |
| "loss": 3.2846, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.192419825072886, |
| "grad_norm": 0.343649685382843, |
| "learning_rate": 0.0004419521586931155, |
| "loss": 3.2878, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.206997084548105, |
| "grad_norm": 0.3760887086391449, |
| "learning_rate": 0.0004417771295215869, |
| "loss": 3.2891, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.221574344023324, |
| "grad_norm": 0.34684184193611145, |
| "learning_rate": 0.0004416021003500583, |
| "loss": 3.2816, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.236151603498543, |
| "grad_norm": 0.37741604447364807, |
| "learning_rate": 0.0004414270711785297, |
| "loss": 3.2905, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.25072886297376, |
| "grad_norm": 0.3436417281627655, |
| "learning_rate": 0.00044125204200700117, |
| "loss": 3.2854, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.26530612244898, |
| "grad_norm": 0.3912341892719269, |
| "learning_rate": 0.00044107701283547254, |
| "loss": 3.304, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.279883381924199, |
| "grad_norm": 0.34772416949272156, |
| "learning_rate": 0.0004409019836639439, |
| "loss": 3.2981, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.294460641399416, |
| "grad_norm": 0.35889309644699097, |
| "learning_rate": 0.0004407269544924154, |
| "loss": 3.299, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.309037900874635, |
| "grad_norm": 0.3548545241355896, |
| "learning_rate": 0.00044055192532088677, |
| "loss": 3.2913, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.323615160349854, |
| "grad_norm": 0.3576371669769287, |
| "learning_rate": 0.0004403768961493582, |
| "loss": 3.2963, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.338192419825074, |
| "grad_norm": 0.35445261001586914, |
| "learning_rate": 0.00044020186697782957, |
| "loss": 3.3088, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.352769679300291, |
| "grad_norm": 0.3462010622024536, |
| "learning_rate": 0.00044002683780630105, |
| "loss": 3.3021, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.36734693877551, |
| "grad_norm": 0.36478662490844727, |
| "learning_rate": 0.0004398518086347724, |
| "loss": 3.3029, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.38192419825073, |
| "grad_norm": 0.36026138067245483, |
| "learning_rate": 0.0004396767794632438, |
| "loss": 3.3103, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.396501457725947, |
| "grad_norm": 0.34346291422843933, |
| "learning_rate": 0.0004395017502917152, |
| "loss": 3.3082, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.411078717201166, |
| "grad_norm": 0.38728317618370056, |
| "learning_rate": 0.00043932672112018665, |
| "loss": 3.303, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.411078717201166, |
| "eval_accuracy": 0.3698803182723114, |
| "eval_loss": 3.5604703426361084, |
| "eval_runtime": 181.7583, |
| "eval_samples_per_second": 91.462, |
| "eval_steps_per_second": 5.716, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.425655976676385, |
| "grad_norm": 0.37464943528175354, |
| "learning_rate": 0.0004391516919486581, |
| "loss": 3.3086, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.440233236151604, |
| "grad_norm": 0.3361836373806, |
| "learning_rate": 0.00043897666277712945, |
| "loss": 3.3045, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.454810495626822, |
| "grad_norm": 0.37110447883605957, |
| "learning_rate": 0.00043880163360560093, |
| "loss": 3.3114, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.46938775510204, |
| "grad_norm": 0.3804593086242676, |
| "learning_rate": 0.0004386266044340723, |
| "loss": 3.3161, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.48396501457726, |
| "grad_norm": 0.3696232736110687, |
| "learning_rate": 0.00043845157526254373, |
| "loss": 3.3119, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.498542274052479, |
| "grad_norm": 0.3362153172492981, |
| "learning_rate": 0.0004382765460910151, |
| "loss": 3.3166, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.513119533527696, |
| "grad_norm": 0.355120986700058, |
| "learning_rate": 0.0004381015169194866, |
| "loss": 3.3145, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.527696793002915, |
| "grad_norm": 0.3447381854057312, |
| "learning_rate": 0.00043792648774795796, |
| "loss": 3.3336, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.542274052478135, |
| "grad_norm": 0.3290923535823822, |
| "learning_rate": 0.00043775145857642933, |
| "loss": 3.3119, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.556851311953352, |
| "grad_norm": 0.3647269010543823, |
| "learning_rate": 0.0004375764294049008, |
| "loss": 3.314, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.571428571428571, |
| "grad_norm": 0.34759560227394104, |
| "learning_rate": 0.0004374014002333722, |
| "loss": 3.3183, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.58600583090379, |
| "grad_norm": 0.35087084770202637, |
| "learning_rate": 0.0004372263710618436, |
| "loss": 3.314, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.60058309037901, |
| "grad_norm": 0.3470548987388611, |
| "learning_rate": 0.000437051341890315, |
| "loss": 3.3172, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.615160349854227, |
| "grad_norm": 0.34616619348526, |
| "learning_rate": 0.00043687631271878646, |
| "loss": 3.3246, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.629737609329446, |
| "grad_norm": 0.3519126772880554, |
| "learning_rate": 0.00043670128354725784, |
| "loss": 3.3239, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.644314868804665, |
| "grad_norm": 0.3683834969997406, |
| "learning_rate": 0.0004365262543757292, |
| "loss": 3.32, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.658892128279884, |
| "grad_norm": 0.37900853157043457, |
| "learning_rate": 0.0004363512252042007, |
| "loss": 3.3224, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.673469387755102, |
| "grad_norm": 0.3418913781642914, |
| "learning_rate": 0.00043617619603267206, |
| "loss": 3.3192, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.68804664723032, |
| "grad_norm": 0.33223602175712585, |
| "learning_rate": 0.0004360011668611435, |
| "loss": 3.3318, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.70262390670554, |
| "grad_norm": 0.3742575943470001, |
| "learning_rate": 0.00043582613768961486, |
| "loss": 3.3211, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.70262390670554, |
| "eval_accuracy": 0.37053812750857457, |
| "eval_loss": 3.552196741104126, |
| "eval_runtime": 181.8675, |
| "eval_samples_per_second": 91.407, |
| "eval_steps_per_second": 5.713, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.717201166180757, |
| "grad_norm": 0.3494127690792084, |
| "learning_rate": 0.00043565110851808634, |
| "loss": 3.3215, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.731778425655977, |
| "grad_norm": 0.35944312810897827, |
| "learning_rate": 0.0004354760793465577, |
| "loss": 3.3346, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.746355685131196, |
| "grad_norm": 0.34206297993659973, |
| "learning_rate": 0.00043530105017502914, |
| "loss": 3.3435, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.760932944606415, |
| "grad_norm": 0.3618668019771576, |
| "learning_rate": 0.0004351260210035005, |
| "loss": 3.3161, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.775510204081632, |
| "grad_norm": 0.34841158986091614, |
| "learning_rate": 0.000434950991831972, |
| "loss": 3.3357, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.790087463556851, |
| "grad_norm": 0.3624778091907501, |
| "learning_rate": 0.00043477596266044337, |
| "loss": 3.3493, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.80466472303207, |
| "grad_norm": 0.37030959129333496, |
| "learning_rate": 0.00043460093348891474, |
| "loss": 3.3391, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.819241982507288, |
| "grad_norm": 0.37484002113342285, |
| "learning_rate": 0.0004344259043173862, |
| "loss": 3.3422, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.833819241982507, |
| "grad_norm": 0.3513255715370178, |
| "learning_rate": 0.0004342508751458576, |
| "loss": 3.3304, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.848396501457726, |
| "grad_norm": 0.33904463052749634, |
| "learning_rate": 0.000434075845974329, |
| "loss": 3.3344, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.862973760932945, |
| "grad_norm": 0.3457725942134857, |
| "learning_rate": 0.0004339008168028004, |
| "loss": 3.34, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.877551020408163, |
| "grad_norm": 0.324558287858963, |
| "learning_rate": 0.0004337257876312719, |
| "loss": 3.3388, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.892128279883382, |
| "grad_norm": 0.34971484541893005, |
| "learning_rate": 0.00043355075845974325, |
| "loss": 3.3423, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.906705539358601, |
| "grad_norm": 0.3688257038593292, |
| "learning_rate": 0.0004333757292882146, |
| "loss": 3.3374, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.921282798833818, |
| "grad_norm": 0.39048126339912415, |
| "learning_rate": 0.0004332007001166861, |
| "loss": 3.3488, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.935860058309038, |
| "grad_norm": 0.35178861021995544, |
| "learning_rate": 0.0004330256709451575, |
| "loss": 3.34, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.950437317784257, |
| "grad_norm": 0.31503215432167053, |
| "learning_rate": 0.0004328506417736289, |
| "loss": 3.3474, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.965014577259476, |
| "grad_norm": 0.40469351410865784, |
| "learning_rate": 0.0004326756126021003, |
| "loss": 3.3482, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.979591836734693, |
| "grad_norm": 0.3635316491127014, |
| "learning_rate": 0.00043250058343057175, |
| "loss": 3.3434, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.994169096209912, |
| "grad_norm": 0.3199983835220337, |
| "learning_rate": 0.0004323255542590431, |
| "loss": 3.3581, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.994169096209912, |
| "eval_accuracy": 0.3712570324845695, |
| "eval_loss": 3.541492223739624, |
| "eval_runtime": 181.829, |
| "eval_samples_per_second": 91.427, |
| "eval_steps_per_second": 5.714, |
| "step": 48000 |
| }, |
| { |
| "epoch": 14.008746355685131, |
| "grad_norm": 0.3561180531978607, |
| "learning_rate": 0.00043215052508751455, |
| "loss": 3.2609, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.02332361516035, |
| "grad_norm": 0.34569546580314636, |
| "learning_rate": 0.000431975495915986, |
| "loss": 3.2238, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.037900874635568, |
| "grad_norm": 0.37043094635009766, |
| "learning_rate": 0.0004318004667444574, |
| "loss": 3.2315, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.052478134110787, |
| "grad_norm": 0.3671160936355591, |
| "learning_rate": 0.0004316254375729288, |
| "loss": 3.2376, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.067055393586006, |
| "grad_norm": 0.37406212091445923, |
| "learning_rate": 0.00043145040840140015, |
| "loss": 3.2348, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.081632653061224, |
| "grad_norm": 0.4101208448410034, |
| "learning_rate": 0.00043127537922987163, |
| "loss": 3.2382, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.096209912536443, |
| "grad_norm": 0.36454081535339355, |
| "learning_rate": 0.000431100350058343, |
| "loss": 3.2494, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.110787172011662, |
| "grad_norm": 0.3647298514842987, |
| "learning_rate": 0.00043092532088681443, |
| "loss": 3.2586, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.125364431486881, |
| "grad_norm": 0.3764522075653076, |
| "learning_rate": 0.0004307502917152858, |
| "loss": 3.2495, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.139941690962099, |
| "grad_norm": 0.3430420160293579, |
| "learning_rate": 0.0004305752625437573, |
| "loss": 3.2612, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.154518950437318, |
| "grad_norm": 0.3535497784614563, |
| "learning_rate": 0.00043040023337222866, |
| "loss": 3.2669, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.169096209912537, |
| "grad_norm": 0.3597559630870819, |
| "learning_rate": 0.00043022520420070003, |
| "loss": 3.26, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.183673469387756, |
| "grad_norm": 0.3598780930042267, |
| "learning_rate": 0.0004300501750291715, |
| "loss": 3.2653, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.198250728862973, |
| "grad_norm": 0.3604069650173187, |
| "learning_rate": 0.0004298751458576429, |
| "loss": 3.2714, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.212827988338192, |
| "grad_norm": 0.39230746030807495, |
| "learning_rate": 0.0004297001166861143, |
| "loss": 3.2626, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.227405247813412, |
| "grad_norm": 0.3730916678905487, |
| "learning_rate": 0.0004295250875145857, |
| "loss": 3.2742, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.241982507288629, |
| "grad_norm": 0.3509989380836487, |
| "learning_rate": 0.00042935005834305717, |
| "loss": 3.277, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.256559766763848, |
| "grad_norm": 0.36396312713623047, |
| "learning_rate": 0.00042917502917152854, |
| "loss": 3.2784, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.271137026239067, |
| "grad_norm": 0.3386274576187134, |
| "learning_rate": 0.00042899999999999997, |
| "loss": 3.2694, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "grad_norm": 0.37972214818000793, |
| "learning_rate": 0.0004288249708284714, |
| "loss": 3.2851, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "eval_accuracy": 0.37012011022189406, |
| "eval_loss": 3.5599772930145264, |
| "eval_runtime": 181.7492, |
| "eval_samples_per_second": 91.467, |
| "eval_steps_per_second": 5.717, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.300291545189504, |
| "grad_norm": 0.3713630437850952, |
| "learning_rate": 0.0004286499416569428, |
| "loss": 3.2855, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.314868804664723, |
| "grad_norm": 0.3884792625904083, |
| "learning_rate": 0.0004284749124854142, |
| "loss": 3.2723, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.329446064139942, |
| "grad_norm": 0.3575059771537781, |
| "learning_rate": 0.00042829988331388556, |
| "loss": 3.2844, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.34402332361516, |
| "grad_norm": 0.35480329394340515, |
| "learning_rate": 0.00042812485414235704, |
| "loss": 3.2873, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.358600583090379, |
| "grad_norm": 0.3646618127822876, |
| "learning_rate": 0.0004279498249708284, |
| "loss": 3.2825, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.373177842565598, |
| "grad_norm": 0.3766191303730011, |
| "learning_rate": 0.00042777479579929984, |
| "loss": 3.2883, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.387755102040817, |
| "grad_norm": 0.3955742418766022, |
| "learning_rate": 0.00042759976662777127, |
| "loss": 3.2963, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.402332361516034, |
| "grad_norm": 0.3547273874282837, |
| "learning_rate": 0.0004274247374562427, |
| "loss": 3.2965, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.416909620991254, |
| "grad_norm": 0.34324541687965393, |
| "learning_rate": 0.00042724970828471407, |
| "loss": 3.2945, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.431486880466473, |
| "grad_norm": 0.36863452196121216, |
| "learning_rate": 0.00042707467911318544, |
| "loss": 3.2996, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.446064139941692, |
| "grad_norm": 0.37455543875694275, |
| "learning_rate": 0.0004268996499416569, |
| "loss": 3.2931, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.46064139941691, |
| "grad_norm": 0.32714369893074036, |
| "learning_rate": 0.0004267246207701283, |
| "loss": 3.3118, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.475218658892128, |
| "grad_norm": 0.3829602003097534, |
| "learning_rate": 0.0004265495915985997, |
| "loss": 3.3034, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.489795918367347, |
| "grad_norm": 0.37818360328674316, |
| "learning_rate": 0.0004263745624270711, |
| "loss": 3.3091, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.504373177842565, |
| "grad_norm": 0.36534905433654785, |
| "learning_rate": 0.0004261995332555426, |
| "loss": 3.3183, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.518950437317784, |
| "grad_norm": 0.3587852716445923, |
| "learning_rate": 0.00042602450408401395, |
| "loss": 3.3065, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.533527696793003, |
| "grad_norm": 0.3622785210609436, |
| "learning_rate": 0.0004258494749124854, |
| "loss": 3.3013, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.548104956268222, |
| "grad_norm": 0.38269826769828796, |
| "learning_rate": 0.0004256744457409568, |
| "loss": 3.3107, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.56268221574344, |
| "grad_norm": 0.3362688720226288, |
| "learning_rate": 0.00042549941656942823, |
| "loss": 3.3043, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.577259475218659, |
| "grad_norm": 0.3540298044681549, |
| "learning_rate": 0.0004253243873978996, |
| "loss": 3.3056, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.577259475218659, |
| "eval_accuracy": 0.3706933977989524, |
| "eval_loss": 3.552320718765259, |
| "eval_runtime": 181.7913, |
| "eval_samples_per_second": 91.446, |
| "eval_steps_per_second": 5.715, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.591836734693878, |
| "grad_norm": 0.35744428634643555, |
| "learning_rate": 0.000425149358226371, |
| "loss": 3.3006, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.606413994169095, |
| "grad_norm": 0.35320979356765747, |
| "learning_rate": 0.00042497432905484246, |
| "loss": 3.3105, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.620991253644315, |
| "grad_norm": 0.3753153085708618, |
| "learning_rate": 0.00042479929988331383, |
| "loss": 3.3255, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.635568513119534, |
| "grad_norm": 0.3381422162055969, |
| "learning_rate": 0.00042462427071178526, |
| "loss": 3.3203, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.650145772594753, |
| "grad_norm": 0.37024828791618347, |
| "learning_rate": 0.0004244492415402567, |
| "loss": 3.3146, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.66472303206997, |
| "grad_norm": 0.3409423828125, |
| "learning_rate": 0.0004242742123687281, |
| "loss": 3.2963, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.67930029154519, |
| "grad_norm": 0.33922427892684937, |
| "learning_rate": 0.0004240991831971995, |
| "loss": 3.3112, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.693877551020408, |
| "grad_norm": 0.38118499517440796, |
| "learning_rate": 0.00042392415402567085, |
| "loss": 3.3151, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.708454810495628, |
| "grad_norm": 0.34291163086891174, |
| "learning_rate": 0.00042374912485414234, |
| "loss": 3.3245, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.723032069970845, |
| "grad_norm": 0.3485682010650635, |
| "learning_rate": 0.0004235740956826137, |
| "loss": 3.3257, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.737609329446064, |
| "grad_norm": 0.3919681906700134, |
| "learning_rate": 0.00042339906651108514, |
| "loss": 3.3112, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.752186588921283, |
| "grad_norm": 0.36933663487434387, |
| "learning_rate": 0.00042322403733955656, |
| "loss": 3.3269, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.7667638483965, |
| "grad_norm": 0.37549716234207153, |
| "learning_rate": 0.000423049008168028, |
| "loss": 3.3143, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.78134110787172, |
| "grad_norm": 0.37795278429985046, |
| "learning_rate": 0.00042287397899649936, |
| "loss": 3.3174, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.795918367346939, |
| "grad_norm": 0.3924393951892853, |
| "learning_rate": 0.0004226989498249708, |
| "loss": 3.3155, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.810495626822158, |
| "grad_norm": 0.38116684556007385, |
| "learning_rate": 0.0004225239206534422, |
| "loss": 3.3216, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.825072886297376, |
| "grad_norm": 0.3596772253513336, |
| "learning_rate": 0.00042234889148191364, |
| "loss": 3.3287, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.839650145772595, |
| "grad_norm": 0.34857410192489624, |
| "learning_rate": 0.000422173862310385, |
| "loss": 3.3243, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.854227405247814, |
| "grad_norm": 0.33267682790756226, |
| "learning_rate": 0.0004219988331388564, |
| "loss": 3.318, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.868804664723033, |
| "grad_norm": 0.3842490613460541, |
| "learning_rate": 0.00042182380396732787, |
| "loss": 3.3251, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.868804664723033, |
| "eval_accuracy": 0.3712048833271492, |
| "eval_loss": 3.5443015098571777, |
| "eval_runtime": 181.8147, |
| "eval_samples_per_second": 91.434, |
| "eval_steps_per_second": 5.715, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.88338192419825, |
| "grad_norm": 0.3622199296951294, |
| "learning_rate": 0.00042164877479579924, |
| "loss": 3.3409, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.89795918367347, |
| "grad_norm": 0.35461923480033875, |
| "learning_rate": 0.00042147374562427067, |
| "loss": 3.3252, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.912536443148689, |
| "grad_norm": 0.3685915172100067, |
| "learning_rate": 0.0004212987164527421, |
| "loss": 3.3284, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.927113702623906, |
| "grad_norm": 0.3246322274208069, |
| "learning_rate": 0.0004211236872812135, |
| "loss": 3.3201, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.941690962099125, |
| "grad_norm": 0.36358892917633057, |
| "learning_rate": 0.0004209486581096849, |
| "loss": 3.322, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.956268221574344, |
| "grad_norm": 0.3402910828590393, |
| "learning_rate": 0.00042077362893815627, |
| "loss": 3.3405, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.970845481049563, |
| "grad_norm": 0.3693722188472748, |
| "learning_rate": 0.00042059859976662775, |
| "loss": 3.3269, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.98542274052478, |
| "grad_norm": 0.34414148330688477, |
| "learning_rate": 0.0004204235705950991, |
| "loss": 3.3193, |
| "step": 51400 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.35340461134910583, |
| "learning_rate": 0.00042024854142357055, |
| "loss": 3.3147, |
| "step": 51450 |
| }, |
| { |
| "epoch": 15.01457725947522, |
| "grad_norm": 0.3598097562789917, |
| "learning_rate": 0.000420073512252042, |
| "loss": 3.2213, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.029154518950437, |
| "grad_norm": 0.36583930253982544, |
| "learning_rate": 0.0004198984830805134, |
| "loss": 3.2128, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.043731778425656, |
| "grad_norm": 0.3556552231311798, |
| "learning_rate": 0.0004197234539089848, |
| "loss": 3.2216, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.058309037900875, |
| "grad_norm": 0.35826733708381653, |
| "learning_rate": 0.0004195484247374562, |
| "loss": 3.2234, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.072886297376094, |
| "grad_norm": 0.35535216331481934, |
| "learning_rate": 0.0004193733955659276, |
| "loss": 3.2361, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.087463556851311, |
| "grad_norm": 0.34739965200424194, |
| "learning_rate": 0.00041919836639439905, |
| "loss": 3.2338, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.10204081632653, |
| "grad_norm": 0.3416599631309509, |
| "learning_rate": 0.0004190233372228704, |
| "loss": 3.2461, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.11661807580175, |
| "grad_norm": 0.3742870092391968, |
| "learning_rate": 0.0004188483080513419, |
| "loss": 3.2419, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.131195335276967, |
| "grad_norm": 0.3667660057544708, |
| "learning_rate": 0.0004186732788798133, |
| "loss": 3.2593, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.145772594752186, |
| "grad_norm": 0.38009244203567505, |
| "learning_rate": 0.00041849824970828465, |
| "loss": 3.2449, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.160349854227405, |
| "grad_norm": 0.352700799703598, |
| "learning_rate": 0.0004183232205367561, |
| "loss": 3.2478, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.160349854227405, |
| "eval_accuracy": 0.3707503734020933, |
| "eval_loss": 3.559041976928711, |
| "eval_runtime": 181.6538, |
| "eval_samples_per_second": 91.515, |
| "eval_steps_per_second": 5.72, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.174927113702624, |
| "grad_norm": 0.38742712140083313, |
| "learning_rate": 0.0004181481913652275, |
| "loss": 3.2548, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.189504373177842, |
| "grad_norm": 0.39204293489456177, |
| "learning_rate": 0.00041797316219369893, |
| "loss": 3.2614, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.204081632653061, |
| "grad_norm": 0.3423581123352051, |
| "learning_rate": 0.0004177981330221703, |
| "loss": 3.2599, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.21865889212828, |
| "grad_norm": 0.3979237973690033, |
| "learning_rate": 0.0004176231038506417, |
| "loss": 3.2615, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.2332361516035, |
| "grad_norm": 0.37007424235343933, |
| "learning_rate": 0.00041744807467911316, |
| "loss": 3.2607, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.247813411078717, |
| "grad_norm": 0.34523463249206543, |
| "learning_rate": 0.00041727304550758453, |
| "loss": 3.2592, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.262390670553936, |
| "grad_norm": 0.3491571843624115, |
| "learning_rate": 0.00041709801633605596, |
| "loss": 3.2602, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.276967930029155, |
| "grad_norm": 0.3450250029563904, |
| "learning_rate": 0.0004169229871645274, |
| "loss": 3.2702, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.291545189504372, |
| "grad_norm": 0.3784157633781433, |
| "learning_rate": 0.0004167479579929988, |
| "loss": 3.2883, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.306122448979592, |
| "grad_norm": 0.3518495261669159, |
| "learning_rate": 0.0004165729288214702, |
| "loss": 3.2777, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.32069970845481, |
| "grad_norm": 0.37740063667297363, |
| "learning_rate": 0.0004163978996499416, |
| "loss": 3.2679, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.33527696793003, |
| "grad_norm": 0.3901689648628235, |
| "learning_rate": 0.00041622287047841304, |
| "loss": 3.2735, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.349854227405247, |
| "grad_norm": 0.38509419560432434, |
| "learning_rate": 0.00041604784130688447, |
| "loss": 3.2671, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.364431486880466, |
| "grad_norm": 0.35500845313072205, |
| "learning_rate": 0.00041587281213535584, |
| "loss": 3.2684, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.379008746355685, |
| "grad_norm": 0.37637239694595337, |
| "learning_rate": 0.0004156977829638273, |
| "loss": 3.27, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.393586005830905, |
| "grad_norm": 0.3534194827079773, |
| "learning_rate": 0.0004155227537922987, |
| "loss": 3.2818, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.408163265306122, |
| "grad_norm": 0.37404513359069824, |
| "learning_rate": 0.00041534772462077006, |
| "loss": 3.2755, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.422740524781341, |
| "grad_norm": 0.33428794145584106, |
| "learning_rate": 0.0004151726954492415, |
| "loss": 3.2799, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.43731778425656, |
| "grad_norm": 0.37358441948890686, |
| "learning_rate": 0.0004149976662777129, |
| "loss": 3.2847, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.451895043731778, |
| "grad_norm": 0.3440437912940979, |
| "learning_rate": 0.00041482263710618434, |
| "loss": 3.2882, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.451895043731778, |
| "eval_accuracy": 0.3708990514739259, |
| "eval_loss": 3.5549874305725098, |
| "eval_runtime": 181.6144, |
| "eval_samples_per_second": 91.535, |
| "eval_steps_per_second": 5.721, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.466472303206997, |
| "grad_norm": 0.41005825996398926, |
| "learning_rate": 0.0004146476079346557, |
| "loss": 3.2757, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.481049562682216, |
| "grad_norm": 0.36211097240448, |
| "learning_rate": 0.0004144725787631272, |
| "loss": 3.2745, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.495626822157435, |
| "grad_norm": 0.37981894612312317, |
| "learning_rate": 0.00041429754959159857, |
| "loss": 3.2811, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.510204081632653, |
| "grad_norm": 0.35121893882751465, |
| "learning_rate": 0.00041412252042006994, |
| "loss": 3.2869, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.524781341107872, |
| "grad_norm": 0.38234373927116394, |
| "learning_rate": 0.00041394749124854137, |
| "loss": 3.3016, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.53935860058309, |
| "grad_norm": 0.3564067482948303, |
| "learning_rate": 0.0004137724620770128, |
| "loss": 3.2852, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.55393586005831, |
| "grad_norm": 0.37256014347076416, |
| "learning_rate": 0.0004135974329054842, |
| "loss": 3.2858, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.568513119533527, |
| "grad_norm": 0.35131582617759705, |
| "learning_rate": 0.0004134224037339556, |
| "loss": 3.2981, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.583090379008746, |
| "grad_norm": 0.3604174852371216, |
| "learning_rate": 0.000413247374562427, |
| "loss": 3.2876, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.597667638483966, |
| "grad_norm": 0.3844100534915924, |
| "learning_rate": 0.00041307234539089845, |
| "loss": 3.2905, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.612244897959183, |
| "grad_norm": 0.33866187930107117, |
| "learning_rate": 0.0004128973162193699, |
| "loss": 3.2859, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.626822157434402, |
| "grad_norm": 0.37496766448020935, |
| "learning_rate": 0.00041272228704784125, |
| "loss": 3.2932, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.641399416909621, |
| "grad_norm": 0.34779778122901917, |
| "learning_rate": 0.00041254725787631273, |
| "loss": 3.2996, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.65597667638484, |
| "grad_norm": 0.3477199673652649, |
| "learning_rate": 0.0004123722287047841, |
| "loss": 3.3018, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.670553935860058, |
| "grad_norm": 0.3835027515888214, |
| "learning_rate": 0.0004121971995332555, |
| "loss": 3.3079, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.685131195335277, |
| "grad_norm": 0.35675570368766785, |
| "learning_rate": 0.0004120221703617269, |
| "loss": 3.3029, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.699708454810496, |
| "grad_norm": 0.3724403977394104, |
| "learning_rate": 0.00041184714119019833, |
| "loss": 3.296, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.714285714285714, |
| "grad_norm": 0.38267725706100464, |
| "learning_rate": 0.00041167211201866976, |
| "loss": 3.3099, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.728862973760933, |
| "grad_norm": 0.3567516803741455, |
| "learning_rate": 0.00041149708284714113, |
| "loss": 3.2896, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.743440233236152, |
| "grad_norm": 0.39038965106010437, |
| "learning_rate": 0.0004113220536756126, |
| "loss": 3.3035, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.743440233236152, |
| "eval_accuracy": 0.3714218379482002, |
| "eval_loss": 3.546982526779175, |
| "eval_runtime": 181.7086, |
| "eval_samples_per_second": 91.487, |
| "eval_steps_per_second": 5.718, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.758017492711371, |
| "grad_norm": 0.3851892948150635, |
| "learning_rate": 0.000411147024504084, |
| "loss": 3.3165, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.772594752186588, |
| "grad_norm": 0.36965999007225037, |
| "learning_rate": 0.00041097199533255535, |
| "loss": 3.3061, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.787172011661808, |
| "grad_norm": 0.3408694863319397, |
| "learning_rate": 0.0004107969661610268, |
| "loss": 3.3097, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.801749271137027, |
| "grad_norm": 0.33933934569358826, |
| "learning_rate": 0.0004106219369894982, |
| "loss": 3.2994, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.816326530612244, |
| "grad_norm": 0.5129480957984924, |
| "learning_rate": 0.00041044690781796964, |
| "loss": 3.299, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.830903790087463, |
| "grad_norm": 0.37039506435394287, |
| "learning_rate": 0.000410271878646441, |
| "loss": 3.3087, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.845481049562682, |
| "grad_norm": 0.3602822721004486, |
| "learning_rate": 0.0004100968494749125, |
| "loss": 3.3002, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.860058309037901, |
| "grad_norm": 0.3565203547477722, |
| "learning_rate": 0.00040992182030338386, |
| "loss": 3.3034, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.874635568513119, |
| "grad_norm": 0.3605969548225403, |
| "learning_rate": 0.0004097467911318553, |
| "loss": 3.3129, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.889212827988338, |
| "grad_norm": 0.3564240038394928, |
| "learning_rate": 0.00040957176196032666, |
| "loss": 3.308, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.903790087463557, |
| "grad_norm": 0.3524434268474579, |
| "learning_rate": 0.00040939673278879814, |
| "loss": 3.3183, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.918367346938776, |
| "grad_norm": 0.34038567543029785, |
| "learning_rate": 0.0004092217036172695, |
| "loss": 3.3084, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.932944606413994, |
| "grad_norm": 0.3590739965438843, |
| "learning_rate": 0.0004090466744457409, |
| "loss": 3.3132, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.947521865889213, |
| "grad_norm": 0.3675514757633209, |
| "learning_rate": 0.0004088716452742123, |
| "loss": 3.3113, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.962099125364432, |
| "grad_norm": 0.3558247983455658, |
| "learning_rate": 0.00040869661610268374, |
| "loss": 3.3172, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.97667638483965, |
| "grad_norm": 0.35684889554977417, |
| "learning_rate": 0.00040852158693115517, |
| "loss": 3.3083, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.991253644314869, |
| "grad_norm": 0.3759230375289917, |
| "learning_rate": 0.00040834655775962654, |
| "loss": 3.3054, |
| "step": 54850 |
| }, |
| { |
| "epoch": 16.005830903790088, |
| "grad_norm": 0.37320634722709656, |
| "learning_rate": 0.000408171528588098, |
| "loss": 3.2709, |
| "step": 54900 |
| }, |
| { |
| "epoch": 16.020408163265305, |
| "grad_norm": 0.3853946626186371, |
| "learning_rate": 0.0004079964994165694, |
| "loss": 3.2058, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.034985422740526, |
| "grad_norm": 0.40219175815582275, |
| "learning_rate": 0.00040782147024504077, |
| "loss": 3.1955, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.034985422740526, |
| "eval_accuracy": 0.37107209721073814, |
| "eval_loss": 3.5545384883880615, |
| "eval_runtime": 181.9202, |
| "eval_samples_per_second": 91.381, |
| "eval_steps_per_second": 5.711, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.049562682215743, |
| "grad_norm": 0.38274168968200684, |
| "learning_rate": 0.0004076464410735122, |
| "loss": 3.2219, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.06413994169096, |
| "grad_norm": 0.3740251660346985, |
| "learning_rate": 0.0004074714119019836, |
| "loss": 3.2159, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.07871720116618, |
| "grad_norm": 0.37743881344795227, |
| "learning_rate": 0.00040729638273045505, |
| "loss": 3.2209, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.0932944606414, |
| "grad_norm": 0.40411314368247986, |
| "learning_rate": 0.0004071213535589264, |
| "loss": 3.2181, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.107871720116616, |
| "grad_norm": 0.3726896345615387, |
| "learning_rate": 0.0004069463243873979, |
| "loss": 3.2248, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.122448979591837, |
| "grad_norm": 0.3628726899623871, |
| "learning_rate": 0.0004067712952158693, |
| "loss": 3.2261, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.137026239067055, |
| "grad_norm": 0.35304582118988037, |
| "learning_rate": 0.0004065962660443407, |
| "loss": 3.2226, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.151603498542276, |
| "grad_norm": 0.34734106063842773, |
| "learning_rate": 0.00040642123687281207, |
| "loss": 3.2242, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.166180758017493, |
| "grad_norm": 0.38421186804771423, |
| "learning_rate": 0.00040624620770128355, |
| "loss": 3.2378, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.18075801749271, |
| "grad_norm": 0.3777143955230713, |
| "learning_rate": 0.0004060711785297549, |
| "loss": 3.2481, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.19533527696793, |
| "grad_norm": 0.3493998944759369, |
| "learning_rate": 0.0004058961493582263, |
| "loss": 3.239, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.20991253644315, |
| "grad_norm": 0.39447450637817383, |
| "learning_rate": 0.0004057211201866978, |
| "loss": 3.2466, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.224489795918366, |
| "grad_norm": 0.3562599718570709, |
| "learning_rate": 0.00040554609101516915, |
| "loss": 3.237, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.239067055393587, |
| "grad_norm": 0.38712114095687866, |
| "learning_rate": 0.0004053710618436406, |
| "loss": 3.2464, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.253644314868804, |
| "grad_norm": 0.34968268871307373, |
| "learning_rate": 0.00040519603267211195, |
| "loss": 3.2474, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.26822157434402, |
| "grad_norm": 0.3648940920829773, |
| "learning_rate": 0.00040502100350058343, |
| "loss": 3.2563, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.282798833819243, |
| "grad_norm": 0.378246933221817, |
| "learning_rate": 0.0004048459743290548, |
| "loss": 3.2449, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.29737609329446, |
| "grad_norm": 0.3541260063648224, |
| "learning_rate": 0.0004046709451575262, |
| "loss": 3.2452, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.31195335276968, |
| "grad_norm": 0.3614341914653778, |
| "learning_rate": 0.0004044959159859976, |
| "loss": 3.2649, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.3265306122449, |
| "grad_norm": 0.35265859961509705, |
| "learning_rate": 0.00040432088681446903, |
| "loss": 3.2497, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.3265306122449, |
| "eval_accuracy": 0.37080417061414994, |
| "eval_loss": 3.555210590362549, |
| "eval_runtime": 182.0266, |
| "eval_samples_per_second": 91.327, |
| "eval_steps_per_second": 5.708, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.341107871720116, |
| "grad_norm": 0.37136510014533997, |
| "learning_rate": 0.00040414585764294046, |
| "loss": 3.2598, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.355685131195337, |
| "grad_norm": 0.3642927408218384, |
| "learning_rate": 0.00040397082847141183, |
| "loss": 3.2527, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.370262390670554, |
| "grad_norm": 0.41137054562568665, |
| "learning_rate": 0.0004037957992998833, |
| "loss": 3.2589, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.38483965014577, |
| "grad_norm": 0.3674943447113037, |
| "learning_rate": 0.0004036207701283547, |
| "loss": 3.2586, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.399416909620992, |
| "grad_norm": 0.3654170334339142, |
| "learning_rate": 0.0004034457409568261, |
| "loss": 3.2585, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.41399416909621, |
| "grad_norm": 0.4025888741016388, |
| "learning_rate": 0.0004032707117852975, |
| "loss": 3.2654, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.428571428571427, |
| "grad_norm": 0.395749568939209, |
| "learning_rate": 0.00040309568261376897, |
| "loss": 3.2618, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.443148688046648, |
| "grad_norm": 0.37377673387527466, |
| "learning_rate": 0.00040292065344224034, |
| "loss": 3.2805, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.457725947521865, |
| "grad_norm": 0.34729164838790894, |
| "learning_rate": 0.0004027456242707117, |
| "loss": 3.2746, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.472303206997086, |
| "grad_norm": 0.39409855008125305, |
| "learning_rate": 0.0004025705950991832, |
| "loss": 3.2704, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.486880466472304, |
| "grad_norm": 0.38538074493408203, |
| "learning_rate": 0.00040239556592765456, |
| "loss": 3.2669, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.50145772594752, |
| "grad_norm": 0.34272196888923645, |
| "learning_rate": 0.000402220536756126, |
| "loss": 3.2761, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.516034985422742, |
| "grad_norm": 0.34145087003707886, |
| "learning_rate": 0.00040204550758459736, |
| "loss": 3.2916, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.53061224489796, |
| "grad_norm": 0.3740696907043457, |
| "learning_rate": 0.00040187047841306884, |
| "loss": 3.2784, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.545189504373177, |
| "grad_norm": 0.34579378366470337, |
| "learning_rate": 0.0004016954492415402, |
| "loss": 3.2782, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.559766763848398, |
| "grad_norm": 0.3800579011440277, |
| "learning_rate": 0.00040152042007001164, |
| "loss": 3.2874, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.574344023323615, |
| "grad_norm": 0.3693027198314667, |
| "learning_rate": 0.00040134539089848307, |
| "loss": 3.2819, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.588921282798832, |
| "grad_norm": 0.3319258987903595, |
| "learning_rate": 0.00040117036172695444, |
| "loss": 3.2907, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.603498542274053, |
| "grad_norm": 0.3513771593570709, |
| "learning_rate": 0.00040099533255542587, |
| "loss": 3.2793, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.61807580174927, |
| "grad_norm": 0.37214353680610657, |
| "learning_rate": 0.00040082030338389724, |
| "loss": 3.2795, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.61807580174927, |
| "eval_accuracy": 0.37194886227725366, |
| "eval_loss": 3.544847249984741, |
| "eval_runtime": 181.8259, |
| "eval_samples_per_second": 91.428, |
| "eval_steps_per_second": 5.714, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.632653061224488, |
| "grad_norm": 0.37557870149612427, |
| "learning_rate": 0.0004006452742123687, |
| "loss": 3.2815, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.64723032069971, |
| "grad_norm": 0.3824288547039032, |
| "learning_rate": 0.0004004702450408401, |
| "loss": 3.2786, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.661807580174926, |
| "grad_norm": 0.4045317769050598, |
| "learning_rate": 0.0004002952158693115, |
| "loss": 3.2844, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.676384839650147, |
| "grad_norm": 0.36842742562294006, |
| "learning_rate": 0.0004001201866977829, |
| "loss": 3.2794, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.690962099125365, |
| "grad_norm": 0.35082441568374634, |
| "learning_rate": 0.0003999451575262544, |
| "loss": 3.2956, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.705539358600582, |
| "grad_norm": 0.3607030510902405, |
| "learning_rate": 0.00039977012835472575, |
| "loss": 3.2963, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.720116618075803, |
| "grad_norm": 0.3500318229198456, |
| "learning_rate": 0.0003995950991831971, |
| "loss": 3.2832, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.73469387755102, |
| "grad_norm": 0.3622967004776001, |
| "learning_rate": 0.0003994200700116686, |
| "loss": 3.2963, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.749271137026238, |
| "grad_norm": 0.3940633237361908, |
| "learning_rate": 0.00039924504084014, |
| "loss": 3.2942, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.76384839650146, |
| "grad_norm": 0.3878549635410309, |
| "learning_rate": 0.0003990700116686114, |
| "loss": 3.2843, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.778425655976676, |
| "grad_norm": 0.34530991315841675, |
| "learning_rate": 0.0003988949824970828, |
| "loss": 3.299, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.793002915451893, |
| "grad_norm": 0.3695226013660431, |
| "learning_rate": 0.00039871995332555426, |
| "loss": 3.2941, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.807580174927114, |
| "grad_norm": 0.3550872802734375, |
| "learning_rate": 0.00039854492415402563, |
| "loss": 3.2898, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.82215743440233, |
| "grad_norm": 0.3620699346065521, |
| "learning_rate": 0.00039836989498249706, |
| "loss": 3.3027, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.836734693877553, |
| "grad_norm": 0.36660054326057434, |
| "learning_rate": 0.0003981948658109685, |
| "loss": 3.3013, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.85131195335277, |
| "grad_norm": 0.37367725372314453, |
| "learning_rate": 0.00039801983663943985, |
| "loss": 3.2955, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.865889212827987, |
| "grad_norm": 0.35791313648223877, |
| "learning_rate": 0.0003978448074679113, |
| "loss": 3.3011, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.88046647230321, |
| "grad_norm": 0.36037060618400574, |
| "learning_rate": 0.00039766977829638265, |
| "loss": 3.309, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.895043731778426, |
| "grad_norm": 0.35850104689598083, |
| "learning_rate": 0.00039749474912485414, |
| "loss": 3.295, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.909620991253643, |
| "grad_norm": 0.3768969178199768, |
| "learning_rate": 0.0003973197199533255, |
| "loss": 3.2888, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.909620991253643, |
| "eval_accuracy": 0.37238277151935567, |
| "eval_loss": 3.539978265762329, |
| "eval_runtime": 181.464, |
| "eval_samples_per_second": 91.61, |
| "eval_steps_per_second": 5.726, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.924198250728864, |
| "grad_norm": 0.3688867688179016, |
| "learning_rate": 0.00039714469078179693, |
| "loss": 3.3078, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.93877551020408, |
| "grad_norm": 0.4198916554450989, |
| "learning_rate": 0.00039696966161026836, |
| "loss": 3.3041, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.9533527696793, |
| "grad_norm": 0.3668155372142792, |
| "learning_rate": 0.0003967946324387398, |
| "loss": 3.3077, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.96793002915452, |
| "grad_norm": 0.359727680683136, |
| "learning_rate": 0.00039661960326721116, |
| "loss": 3.3038, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.982507288629737, |
| "grad_norm": 0.34617719054222107, |
| "learning_rate": 0.00039644457409568253, |
| "loss": 3.2923, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.997084548104958, |
| "grad_norm": 0.34785595536231995, |
| "learning_rate": 0.000396269544924154, |
| "loss": 3.3034, |
| "step": 58300 |
| }, |
| { |
| "epoch": 17.011661807580175, |
| "grad_norm": 0.3586324453353882, |
| "learning_rate": 0.0003960945157526254, |
| "loss": 3.202, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.026239067055393, |
| "grad_norm": 0.4087458848953247, |
| "learning_rate": 0.0003959194865810968, |
| "loss": 3.2051, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.040816326530614, |
| "grad_norm": 0.3719448745250702, |
| "learning_rate": 0.0003957444574095682, |
| "loss": 3.1944, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.05539358600583, |
| "grad_norm": 0.3907237648963928, |
| "learning_rate": 0.00039556942823803967, |
| "loss": 3.2072, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.06997084548105, |
| "grad_norm": 0.3574305772781372, |
| "learning_rate": 0.00039539439906651104, |
| "loss": 3.216, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.08454810495627, |
| "grad_norm": 0.3930377960205078, |
| "learning_rate": 0.00039521936989498247, |
| "loss": 3.195, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.099125364431487, |
| "grad_norm": 0.38136032223701477, |
| "learning_rate": 0.0003950443407234539, |
| "loss": 3.2201, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.113702623906704, |
| "grad_norm": 0.42671507596969604, |
| "learning_rate": 0.0003948693115519253, |
| "loss": 3.2237, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.128279883381925, |
| "grad_norm": 0.3523384928703308, |
| "learning_rate": 0.0003946942823803967, |
| "loss": 3.2151, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.142857142857142, |
| "grad_norm": 0.38585031032562256, |
| "learning_rate": 0.00039451925320886807, |
| "loss": 3.2146, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.157434402332363, |
| "grad_norm": 0.35892701148986816, |
| "learning_rate": 0.00039434422403733955, |
| "loss": 3.2247, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.17201166180758, |
| "grad_norm": 0.3723486363887787, |
| "learning_rate": 0.0003941691948658109, |
| "loss": 3.2318, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.186588921282798, |
| "grad_norm": 0.3572362959384918, |
| "learning_rate": 0.00039399416569428235, |
| "loss": 3.2172, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.20116618075802, |
| "grad_norm": 0.37814998626708984, |
| "learning_rate": 0.0003938191365227538, |
| "loss": 3.2231, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.20116618075802, |
| "eval_accuracy": 0.3713283697066839, |
| "eval_loss": 3.558633804321289, |
| "eval_runtime": 181.6477, |
| "eval_samples_per_second": 91.518, |
| "eval_steps_per_second": 5.72, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.215743440233236, |
| "grad_norm": 0.3887748122215271, |
| "learning_rate": 0.0003936441073512252, |
| "loss": 3.24, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.230320699708454, |
| "grad_norm": 0.39903128147125244, |
| "learning_rate": 0.00039346907817969657, |
| "loss": 3.2365, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.244897959183675, |
| "grad_norm": 0.3754963278770447, |
| "learning_rate": 0.00039329404900816795, |
| "loss": 3.234, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.259475218658892, |
| "grad_norm": 0.37241125106811523, |
| "learning_rate": 0.0003931190198366394, |
| "loss": 3.2384, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.27405247813411, |
| "grad_norm": 0.36812448501586914, |
| "learning_rate": 0.0003929439906651108, |
| "loss": 3.2294, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.28862973760933, |
| "grad_norm": 0.3592478632926941, |
| "learning_rate": 0.0003927689614935822, |
| "loss": 3.2397, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.303206997084548, |
| "grad_norm": 0.367058664560318, |
| "learning_rate": 0.00039259393232205365, |
| "loss": 3.2444, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.317784256559765, |
| "grad_norm": 0.3607527017593384, |
| "learning_rate": 0.0003924189031505251, |
| "loss": 3.2395, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.332361516034986, |
| "grad_norm": 0.3962002694606781, |
| "learning_rate": 0.00039224387397899645, |
| "loss": 3.2428, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.346938775510203, |
| "grad_norm": 0.37258464097976685, |
| "learning_rate": 0.0003920688448074679, |
| "loss": 3.2478, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.361516034985424, |
| "grad_norm": 0.4007439613342285, |
| "learning_rate": 0.0003918938156359393, |
| "loss": 3.2549, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.37609329446064, |
| "grad_norm": 0.40145668387413025, |
| "learning_rate": 0.00039171878646441073, |
| "loss": 3.2573, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.39067055393586, |
| "grad_norm": 0.3657708764076233, |
| "learning_rate": 0.0003915437572928821, |
| "loss": 3.2602, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.40524781341108, |
| "grad_norm": 0.34860551357269287, |
| "learning_rate": 0.0003913687281213535, |
| "loss": 3.2522, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.419825072886297, |
| "grad_norm": 0.36920854449272156, |
| "learning_rate": 0.00039119369894982496, |
| "loss": 3.2609, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.434402332361515, |
| "grad_norm": 0.33922135829925537, |
| "learning_rate": 0.00039101866977829633, |
| "loss": 3.2541, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.448979591836736, |
| "grad_norm": 0.37371137738227844, |
| "learning_rate": 0.00039084364060676776, |
| "loss": 3.2537, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.463556851311953, |
| "grad_norm": 0.37898826599121094, |
| "learning_rate": 0.0003906686114352392, |
| "loss": 3.2568, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.47813411078717, |
| "grad_norm": 0.37351587414741516, |
| "learning_rate": 0.0003904935822637106, |
| "loss": 3.2739, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.49271137026239, |
| "grad_norm": 0.3553948402404785, |
| "learning_rate": 0.000390318553092182, |
| "loss": 3.2615, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.49271137026239, |
| "eval_accuracy": 0.3715660427288771, |
| "eval_loss": 3.5488474369049072, |
| "eval_runtime": 181.4626, |
| "eval_samples_per_second": 91.611, |
| "eval_steps_per_second": 5.726, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.50728862973761, |
| "grad_norm": 0.3778980076313019, |
| "learning_rate": 0.00039014352392065336, |
| "loss": 3.2639, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.52186588921283, |
| "grad_norm": 0.3963741958141327, |
| "learning_rate": 0.00038996849474912484, |
| "loss": 3.2614, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.536443148688047, |
| "grad_norm": 0.3608973026275635, |
| "learning_rate": 0.0003897934655775962, |
| "loss": 3.2699, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.551020408163264, |
| "grad_norm": 0.3675440549850464, |
| "learning_rate": 0.00038961843640606764, |
| "loss": 3.2676, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.565597667638485, |
| "grad_norm": 0.36255237460136414, |
| "learning_rate": 0.00038944340723453906, |
| "loss": 3.2672, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.580174927113703, |
| "grad_norm": 0.38902583718299866, |
| "learning_rate": 0.0003892683780630105, |
| "loss": 3.2571, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.59475218658892, |
| "grad_norm": 0.38929134607315063, |
| "learning_rate": 0.00038909334889148186, |
| "loss": 3.2739, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.60932944606414, |
| "grad_norm": 0.3460721969604492, |
| "learning_rate": 0.0003889183197199533, |
| "loss": 3.2634, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.62390670553936, |
| "grad_norm": 0.3623656928539276, |
| "learning_rate": 0.0003887432905484247, |
| "loss": 3.2716, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.638483965014576, |
| "grad_norm": 0.38560980558395386, |
| "learning_rate": 0.00038856826137689614, |
| "loss": 3.2748, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.653061224489797, |
| "grad_norm": 0.39393141865730286, |
| "learning_rate": 0.0003883932322053675, |
| "loss": 3.2795, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.667638483965014, |
| "grad_norm": 0.3850944936275482, |
| "learning_rate": 0.00038821820303383894, |
| "loss": 3.2676, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.682215743440235, |
| "grad_norm": 0.3653624653816223, |
| "learning_rate": 0.00038804317386231037, |
| "loss": 3.2848, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.696793002915452, |
| "grad_norm": 0.35563892126083374, |
| "learning_rate": 0.00038786814469078174, |
| "loss": 3.2879, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.71137026239067, |
| "grad_norm": 0.38104385137557983, |
| "learning_rate": 0.00038769311551925317, |
| "loss": 3.2737, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.72594752186589, |
| "grad_norm": 0.3962397873401642, |
| "learning_rate": 0.0003875180863477246, |
| "loss": 3.2767, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.740524781341108, |
| "grad_norm": 0.3726269006729126, |
| "learning_rate": 0.000387343057176196, |
| "loss": 3.2919, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.755102040816325, |
| "grad_norm": 0.3684369623661041, |
| "learning_rate": 0.0003871680280046674, |
| "loss": 3.2791, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.769679300291546, |
| "grad_norm": 0.39243683218955994, |
| "learning_rate": 0.00038699299883313877, |
| "loss": 3.2804, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.784256559766764, |
| "grad_norm": 0.349883109331131, |
| "learning_rate": 0.00038681796966161025, |
| "loss": 3.272, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.784256559766764, |
| "eval_accuracy": 0.37223009102912064, |
| "eval_loss": 3.541855812072754, |
| "eval_runtime": 181.727, |
| "eval_samples_per_second": 91.478, |
| "eval_steps_per_second": 5.717, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.79883381924198, |
| "grad_norm": 0.35422345995903015, |
| "learning_rate": 0.0003866429404900816, |
| "loss": 3.2824, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.813411078717202, |
| "grad_norm": 0.37883347272872925, |
| "learning_rate": 0.00038646791131855305, |
| "loss": 3.2653, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.82798833819242, |
| "grad_norm": 0.3556387722492218, |
| "learning_rate": 0.0003862928821470245, |
| "loss": 3.2756, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.842565597667637, |
| "grad_norm": 0.37966683506965637, |
| "learning_rate": 0.0003861178529754959, |
| "loss": 3.2827, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.857142857142858, |
| "grad_norm": 0.3499722480773926, |
| "learning_rate": 0.0003859428238039673, |
| "loss": 3.2842, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.871720116618075, |
| "grad_norm": 0.36272355914115906, |
| "learning_rate": 0.0003857677946324387, |
| "loss": 3.271, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.886297376093296, |
| "grad_norm": 0.3606753647327423, |
| "learning_rate": 0.00038559276546091013, |
| "loss": 3.2813, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.900874635568513, |
| "grad_norm": 0.37976881861686707, |
| "learning_rate": 0.00038541773628938156, |
| "loss": 3.2749, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.91545189504373, |
| "grad_norm": 0.34956464171409607, |
| "learning_rate": 0.00038524270711785293, |
| "loss": 3.2873, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.93002915451895, |
| "grad_norm": 0.38344627618789673, |
| "learning_rate": 0.0003850676779463244, |
| "loss": 3.2827, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.94460641399417, |
| "grad_norm": 0.3596233129501343, |
| "learning_rate": 0.0003848926487747958, |
| "loss": 3.2865, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.959183673469386, |
| "grad_norm": 0.367553174495697, |
| "learning_rate": 0.00038471761960326715, |
| "loss": 3.2868, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.973760932944607, |
| "grad_norm": 0.35922616720199585, |
| "learning_rate": 0.0003845425904317386, |
| "loss": 3.2856, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.988338192419825, |
| "grad_norm": 0.370110422372818, |
| "learning_rate": 0.00038436756126021, |
| "loss": 3.291, |
| "step": 61700 |
| }, |
| { |
| "epoch": 18.002915451895042, |
| "grad_norm": 0.6661171913146973, |
| "learning_rate": 0.00038419253208868143, |
| "loss": 3.2759, |
| "step": 61750 |
| }, |
| { |
| "epoch": 18.017492711370263, |
| "grad_norm": 0.3718918561935425, |
| "learning_rate": 0.0003840175029171528, |
| "loss": 3.1828, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.03206997084548, |
| "grad_norm": 0.39824941754341125, |
| "learning_rate": 0.0003838424737456243, |
| "loss": 3.1886, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.0466472303207, |
| "grad_norm": 0.389596164226532, |
| "learning_rate": 0.00038366744457409566, |
| "loss": 3.1964, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.06122448979592, |
| "grad_norm": 0.3624783158302307, |
| "learning_rate": 0.00038349241540256703, |
| "loss": 3.1964, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.075801749271136, |
| "grad_norm": 0.3595573902130127, |
| "learning_rate": 0.00038331738623103846, |
| "loss": 3.203, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.075801749271136, |
| "eval_accuracy": 0.3718941233196906, |
| "eval_loss": 3.557060480117798, |
| "eval_runtime": 181.5132, |
| "eval_samples_per_second": 91.586, |
| "eval_steps_per_second": 5.724, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.090379008746357, |
| "grad_norm": 0.3661964535713196, |
| "learning_rate": 0.0003831423570595099, |
| "loss": 3.1916, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.104956268221574, |
| "grad_norm": 0.43816515803337097, |
| "learning_rate": 0.0003829673278879813, |
| "loss": 3.1928, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.11953352769679, |
| "grad_norm": 0.38191908597946167, |
| "learning_rate": 0.0003827922987164527, |
| "loss": 3.213, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.134110787172013, |
| "grad_norm": 0.3885975778102875, |
| "learning_rate": 0.0003826172695449241, |
| "loss": 3.2124, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.14868804664723, |
| "grad_norm": 0.4196251928806305, |
| "learning_rate": 0.00038244224037339554, |
| "loss": 3.2082, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.163265306122447, |
| "grad_norm": 0.38539016246795654, |
| "learning_rate": 0.00038226721120186697, |
| "loss": 3.2086, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.17784256559767, |
| "grad_norm": 0.38454657793045044, |
| "learning_rate": 0.00038209218203033834, |
| "loss": 3.2176, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.192419825072886, |
| "grad_norm": 0.4121674597263336, |
| "learning_rate": 0.0003819171528588098, |
| "loss": 3.2164, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.206997084548107, |
| "grad_norm": 0.3765960931777954, |
| "learning_rate": 0.0003817421236872812, |
| "loss": 3.2222, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.221574344023324, |
| "grad_norm": 0.3664701581001282, |
| "learning_rate": 0.00038156709451575257, |
| "loss": 3.2252, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.23615160349854, |
| "grad_norm": 0.36911651492118835, |
| "learning_rate": 0.000381392065344224, |
| "loss": 3.2341, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.250728862973762, |
| "grad_norm": 0.393379807472229, |
| "learning_rate": 0.0003812170361726954, |
| "loss": 3.2249, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.26530612244898, |
| "grad_norm": 0.3650977611541748, |
| "learning_rate": 0.00038104200700116685, |
| "loss": 3.2255, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.279883381924197, |
| "grad_norm": 0.3733424246311188, |
| "learning_rate": 0.0003808669778296382, |
| "loss": 3.2305, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.294460641399418, |
| "grad_norm": 0.3693566620349884, |
| "learning_rate": 0.0003806919486581097, |
| "loss": 3.2214, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.309037900874635, |
| "grad_norm": 0.3484310805797577, |
| "learning_rate": 0.00038051691948658107, |
| "loss": 3.2257, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.323615160349853, |
| "grad_norm": 0.3896331191062927, |
| "learning_rate": 0.00038034189031505245, |
| "loss": 3.2346, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.338192419825074, |
| "grad_norm": 0.3532673716545105, |
| "learning_rate": 0.00038016686114352387, |
| "loss": 3.2408, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.35276967930029, |
| "grad_norm": 0.3979147672653198, |
| "learning_rate": 0.0003799918319719953, |
| "loss": 3.2279, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.367346938775512, |
| "grad_norm": 0.3705503046512604, |
| "learning_rate": 0.0003798168028004667, |
| "loss": 3.2376, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.367346938775512, |
| "eval_accuracy": 0.37182149119750474, |
| "eval_loss": 3.552273750305176, |
| "eval_runtime": 181.406, |
| "eval_samples_per_second": 91.64, |
| "eval_steps_per_second": 5.727, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.38192419825073, |
| "grad_norm": 0.38964420557022095, |
| "learning_rate": 0.0003796417736289381, |
| "loss": 3.228, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.396501457725947, |
| "grad_norm": 0.37392449378967285, |
| "learning_rate": 0.0003794667444574096, |
| "loss": 3.2566, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.411078717201168, |
| "grad_norm": 0.3898674547672272, |
| "learning_rate": 0.00037929171528588095, |
| "loss": 3.2463, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.425655976676385, |
| "grad_norm": 0.37367600202560425, |
| "learning_rate": 0.0003791166861143524, |
| "loss": 3.2472, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.440233236151602, |
| "grad_norm": 0.3786541819572449, |
| "learning_rate": 0.00037894165694282375, |
| "loss": 3.243, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.454810495626823, |
| "grad_norm": 0.37856510281562805, |
| "learning_rate": 0.00037876662777129523, |
| "loss": 3.2575, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.46938775510204, |
| "grad_norm": 0.3948310911655426, |
| "learning_rate": 0.0003785915985997666, |
| "loss": 3.2487, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.483965014577258, |
| "grad_norm": 0.3751145303249359, |
| "learning_rate": 0.000378416569428238, |
| "loss": 3.2465, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.49854227405248, |
| "grad_norm": 0.36819157004356384, |
| "learning_rate": 0.0003782415402567094, |
| "loss": 3.2507, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.513119533527696, |
| "grad_norm": 0.35814929008483887, |
| "learning_rate": 0.00037806651108518083, |
| "loss": 3.26, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.527696793002917, |
| "grad_norm": 0.3531430959701538, |
| "learning_rate": 0.00037789148191365226, |
| "loss": 3.2505, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.542274052478135, |
| "grad_norm": 0.37971770763397217, |
| "learning_rate": 0.00037771645274212363, |
| "loss": 3.246, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.556851311953352, |
| "grad_norm": 0.37397658824920654, |
| "learning_rate": 0.0003775414235705951, |
| "loss": 3.2506, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.571428571428573, |
| "grad_norm": 0.35488125681877136, |
| "learning_rate": 0.0003773663943990665, |
| "loss": 3.266, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.58600583090379, |
| "grad_norm": 0.3492448925971985, |
| "learning_rate": 0.00037719136522753786, |
| "loss": 3.2565, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.600583090379008, |
| "grad_norm": 0.37872013449668884, |
| "learning_rate": 0.0003770163360560093, |
| "loss": 3.2577, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.61516034985423, |
| "grad_norm": 0.35813814401626587, |
| "learning_rate": 0.0003768413068844807, |
| "loss": 3.2374, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.629737609329446, |
| "grad_norm": 0.37753552198410034, |
| "learning_rate": 0.00037666627771295214, |
| "loss": 3.2499, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.644314868804663, |
| "grad_norm": 0.35299044847488403, |
| "learning_rate": 0.0003764912485414235, |
| "loss": 3.2535, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.658892128279884, |
| "grad_norm": 0.3655620217323303, |
| "learning_rate": 0.000376316219369895, |
| "loss": 3.2659, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.658892128279884, |
| "eval_accuracy": 0.3724969581620141, |
| "eval_loss": 3.543386936187744, |
| "eval_runtime": 181.6734, |
| "eval_samples_per_second": 91.505, |
| "eval_steps_per_second": 5.719, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.6734693877551, |
| "grad_norm": 0.358867883682251, |
| "learning_rate": 0.00037614119019836636, |
| "loss": 3.2715, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.68804664723032, |
| "grad_norm": 0.3794868588447571, |
| "learning_rate": 0.0003759661610268378, |
| "loss": 3.2644, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.70262390670554, |
| "grad_norm": 0.42781543731689453, |
| "learning_rate": 0.00037579113185530916, |
| "loss": 3.2612, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.717201166180757, |
| "grad_norm": 0.40130743384361267, |
| "learning_rate": 0.00037561610268378064, |
| "loss": 3.2794, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.73177842565598, |
| "grad_norm": 0.38567736744880676, |
| "learning_rate": 0.000375441073512252, |
| "loss": 3.2603, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.746355685131196, |
| "grad_norm": 0.3566703498363495, |
| "learning_rate": 0.0003752660443407234, |
| "loss": 3.2642, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.760932944606413, |
| "grad_norm": 0.3746073544025421, |
| "learning_rate": 0.00037509101516919487, |
| "loss": 3.264, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.775510204081634, |
| "grad_norm": 0.37497463822364807, |
| "learning_rate": 0.00037491598599766624, |
| "loss": 3.2568, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.79008746355685, |
| "grad_norm": 0.37049469351768494, |
| "learning_rate": 0.00037474095682613767, |
| "loss": 3.2802, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.80466472303207, |
| "grad_norm": 0.3565693795681, |
| "learning_rate": 0.00037456592765460904, |
| "loss": 3.2713, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.81924198250729, |
| "grad_norm": 0.36381709575653076, |
| "learning_rate": 0.0003743908984830805, |
| "loss": 3.2586, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.833819241982507, |
| "grad_norm": 0.34240925312042236, |
| "learning_rate": 0.0003742158693115519, |
| "loss": 3.2755, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.848396501457724, |
| "grad_norm": 0.3438854515552521, |
| "learning_rate": 0.00037404084014002327, |
| "loss": 3.2651, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.862973760932945, |
| "grad_norm": 0.3836745023727417, |
| "learning_rate": 0.0003738658109684947, |
| "loss": 3.259, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.877551020408163, |
| "grad_norm": 0.3804767429828644, |
| "learning_rate": 0.0003736907817969661, |
| "loss": 3.2655, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.892128279883384, |
| "grad_norm": 0.36783847212791443, |
| "learning_rate": 0.00037351575262543755, |
| "loss": 3.277, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.9067055393586, |
| "grad_norm": 0.3522402346134186, |
| "learning_rate": 0.0003733407234539089, |
| "loss": 3.2743, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.92128279883382, |
| "grad_norm": 0.3842095732688904, |
| "learning_rate": 0.0003731656942823804, |
| "loss": 3.2749, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.93586005830904, |
| "grad_norm": 0.34390994906425476, |
| "learning_rate": 0.0003729906651108518, |
| "loss": 3.2834, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.950437317784257, |
| "grad_norm": 0.352525919675827, |
| "learning_rate": 0.0003728156359393232, |
| "loss": 3.2758, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.950437317784257, |
| "eval_accuracy": 0.37309014011289643, |
| "eval_loss": 3.5348432064056396, |
| "eval_runtime": 181.6813, |
| "eval_samples_per_second": 91.501, |
| "eval_steps_per_second": 5.719, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.965014577259474, |
| "grad_norm": 0.36129817366600037, |
| "learning_rate": 0.0003726406067677946, |
| "loss": 3.2741, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.979591836734695, |
| "grad_norm": 0.36104297637939453, |
| "learning_rate": 0.00037246557759626606, |
| "loss": 3.2763, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.994169096209912, |
| "grad_norm": 0.3394487202167511, |
| "learning_rate": 0.00037229054842473743, |
| "loss": 3.2746, |
| "step": 65150 |
| }, |
| { |
| "epoch": 19.00874635568513, |
| "grad_norm": 0.3638574779033661, |
| "learning_rate": 0.0003721155192532088, |
| "loss": 3.2164, |
| "step": 65200 |
| }, |
| { |
| "epoch": 19.02332361516035, |
| "grad_norm": 0.41334033012390137, |
| "learning_rate": 0.0003719404900816803, |
| "loss": 3.1806, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.037900874635568, |
| "grad_norm": 0.34988388419151306, |
| "learning_rate": 0.00037176546091015165, |
| "loss": 3.1682, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.05247813411079, |
| "grad_norm": 0.3725140690803528, |
| "learning_rate": 0.0003715904317386231, |
| "loss": 3.1849, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.067055393586006, |
| "grad_norm": 0.37143728137016296, |
| "learning_rate": 0.00037141540256709445, |
| "loss": 3.1845, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.081632653061224, |
| "grad_norm": 0.42141345143318176, |
| "learning_rate": 0.00037124037339556593, |
| "loss": 3.1886, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.096209912536445, |
| "grad_norm": 0.3842184245586395, |
| "learning_rate": 0.0003710653442240373, |
| "loss": 3.1938, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.110787172011662, |
| "grad_norm": 0.3778875172138214, |
| "learning_rate": 0.0003708903150525087, |
| "loss": 3.1953, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.12536443148688, |
| "grad_norm": 0.39668551087379456, |
| "learning_rate": 0.00037071528588098016, |
| "loss": 3.1955, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.1399416909621, |
| "grad_norm": 0.4067428708076477, |
| "learning_rate": 0.00037054025670945153, |
| "loss": 3.1883, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.154518950437318, |
| "grad_norm": 0.3798586428165436, |
| "learning_rate": 0.00037036522753792296, |
| "loss": 3.1912, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.169096209912535, |
| "grad_norm": 0.37418290972709656, |
| "learning_rate": 0.00037019019836639433, |
| "loss": 3.2043, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.183673469387756, |
| "grad_norm": 0.36611396074295044, |
| "learning_rate": 0.0003700151691948658, |
| "loss": 3.189, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.198250728862973, |
| "grad_norm": 0.39174631237983704, |
| "learning_rate": 0.0003698401400233372, |
| "loss": 3.2102, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.21282798833819, |
| "grad_norm": 0.37841376662254333, |
| "learning_rate": 0.0003696651108518086, |
| "loss": 3.2186, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.22740524781341, |
| "grad_norm": 0.4026353657245636, |
| "learning_rate": 0.00036949008168028, |
| "loss": 3.1962, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.24198250728863, |
| "grad_norm": 0.36771753430366516, |
| "learning_rate": 0.00036931505250875147, |
| "loss": 3.2112, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.24198250728863, |
| "eval_accuracy": 0.3721091944497287, |
| "eval_loss": 3.553576707839966, |
| "eval_runtime": 181.571, |
| "eval_samples_per_second": 91.556, |
| "eval_steps_per_second": 5.722, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.25655976676385, |
| "grad_norm": 0.36355555057525635, |
| "learning_rate": 0.00036914002333722284, |
| "loss": 3.2189, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.271137026239067, |
| "grad_norm": 0.36505192518234253, |
| "learning_rate": 0.0003689649941656942, |
| "loss": 3.2245, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.285714285714285, |
| "grad_norm": 0.37705734372138977, |
| "learning_rate": 0.0003687899649941657, |
| "loss": 3.2141, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.300291545189506, |
| "grad_norm": 0.399637907743454, |
| "learning_rate": 0.00036861493582263707, |
| "loss": 3.2206, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.314868804664723, |
| "grad_norm": 0.3929356634616852, |
| "learning_rate": 0.0003684399066511085, |
| "loss": 3.2324, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.32944606413994, |
| "grad_norm": 0.37350767850875854, |
| "learning_rate": 0.00036826487747957987, |
| "loss": 3.2241, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.34402332361516, |
| "grad_norm": 0.36935678124427795, |
| "learning_rate": 0.00036808984830805135, |
| "loss": 3.2237, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.35860058309038, |
| "grad_norm": 0.38563114404678345, |
| "learning_rate": 0.0003679148191365227, |
| "loss": 3.2205, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.373177842565596, |
| "grad_norm": 0.36945411562919617, |
| "learning_rate": 0.0003677397899649941, |
| "loss": 3.2197, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.387755102040817, |
| "grad_norm": 0.3770820200443268, |
| "learning_rate": 0.00036756476079346557, |
| "loss": 3.2432, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.402332361516034, |
| "grad_norm": 0.3747752010822296, |
| "learning_rate": 0.00036738973162193695, |
| "loss": 3.2299, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.416909620991255, |
| "grad_norm": 0.38895103335380554, |
| "learning_rate": 0.00036721470245040837, |
| "loss": 3.2374, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.431486880466473, |
| "grad_norm": 0.44169071316719055, |
| "learning_rate": 0.00036703967327887974, |
| "loss": 3.2261, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.44606413994169, |
| "grad_norm": 0.4019821286201477, |
| "learning_rate": 0.0003668646441073512, |
| "loss": 3.242, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.46064139941691, |
| "grad_norm": 0.40160903334617615, |
| "learning_rate": 0.0003666896149358226, |
| "loss": 3.2431, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.47521865889213, |
| "grad_norm": 0.3881227970123291, |
| "learning_rate": 0.000366514585764294, |
| "loss": 3.229, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.489795918367346, |
| "grad_norm": 0.38421711325645447, |
| "learning_rate": 0.00036633955659276545, |
| "loss": 3.2255, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.504373177842567, |
| "grad_norm": 0.39073434472084045, |
| "learning_rate": 0.0003661645274212369, |
| "loss": 3.2359, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.518950437317784, |
| "grad_norm": 0.3665856122970581, |
| "learning_rate": 0.00036598949824970825, |
| "loss": 3.2408, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.533527696793, |
| "grad_norm": 0.40631458163261414, |
| "learning_rate": 0.0003658144690781796, |
| "loss": 3.2374, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.533527696793, |
| "eval_accuracy": 0.3723500458630062, |
| "eval_loss": 3.5477726459503174, |
| "eval_runtime": 181.6063, |
| "eval_samples_per_second": 91.539, |
| "eval_steps_per_second": 5.721, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.548104956268222, |
| "grad_norm": 0.382445365190506, |
| "learning_rate": 0.0003656394399066511, |
| "loss": 3.2436, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.56268221574344, |
| "grad_norm": 0.3691641390323639, |
| "learning_rate": 0.0003654644107351225, |
| "loss": 3.2403, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.57725947521866, |
| "grad_norm": 0.36600974202156067, |
| "learning_rate": 0.0003652893815635939, |
| "loss": 3.2469, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.591836734693878, |
| "grad_norm": 0.39847928285598755, |
| "learning_rate": 0.0003651143523920653, |
| "loss": 3.2514, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.606413994169095, |
| "grad_norm": 0.3576008975505829, |
| "learning_rate": 0.00036493932322053676, |
| "loss": 3.2444, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.620991253644316, |
| "grad_norm": 0.3533256947994232, |
| "learning_rate": 0.00036476429404900813, |
| "loss": 3.2448, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.635568513119534, |
| "grad_norm": 0.36868008971214294, |
| "learning_rate": 0.0003645892648774795, |
| "loss": 3.2503, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.65014577259475, |
| "grad_norm": 0.36897534132003784, |
| "learning_rate": 0.000364414235705951, |
| "loss": 3.2405, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.664723032069972, |
| "grad_norm": 0.4246874749660492, |
| "learning_rate": 0.00036423920653442236, |
| "loss": 3.2477, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.67930029154519, |
| "grad_norm": 0.38615843653678894, |
| "learning_rate": 0.0003640641773628938, |
| "loss": 3.2607, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.693877551020407, |
| "grad_norm": 0.3596295118331909, |
| "learning_rate": 0.00036388914819136516, |
| "loss": 3.2486, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.708454810495628, |
| "grad_norm": 0.4084990918636322, |
| "learning_rate": 0.00036371411901983664, |
| "loss": 3.2494, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.723032069970845, |
| "grad_norm": 0.41460683941841125, |
| "learning_rate": 0.000363539089848308, |
| "loss": 3.261, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.737609329446066, |
| "grad_norm": 0.372470498085022, |
| "learning_rate": 0.00036336406067677944, |
| "loss": 3.2581, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.752186588921283, |
| "grad_norm": 0.3677177131175995, |
| "learning_rate": 0.00036318903150525086, |
| "loss": 3.2554, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.7667638483965, |
| "grad_norm": 0.36630979180336, |
| "learning_rate": 0.0003630140023337223, |
| "loss": 3.2522, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.78134110787172, |
| "grad_norm": 0.3879360258579254, |
| "learning_rate": 0.00036283897316219366, |
| "loss": 3.2672, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.79591836734694, |
| "grad_norm": 0.39751380681991577, |
| "learning_rate": 0.00036266394399066504, |
| "loss": 3.2533, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.810495626822156, |
| "grad_norm": 0.38645872473716736, |
| "learning_rate": 0.0003624889148191365, |
| "loss": 3.2545, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.825072886297377, |
| "grad_norm": 0.3810601830482483, |
| "learning_rate": 0.0003623138856476079, |
| "loss": 3.2682, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.825072886297377, |
| "eval_accuracy": 0.3729859595162442, |
| "eval_loss": 3.536771059036255, |
| "eval_runtime": 181.5477, |
| "eval_samples_per_second": 91.568, |
| "eval_steps_per_second": 5.723, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.839650145772595, |
| "grad_norm": 0.36461886763572693, |
| "learning_rate": 0.0003621388564760793, |
| "loss": 3.2574, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.854227405247812, |
| "grad_norm": 0.3770570456981659, |
| "learning_rate": 0.00036196382730455074, |
| "loss": 3.2661, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.868804664723033, |
| "grad_norm": 0.3571155071258545, |
| "learning_rate": 0.00036178879813302217, |
| "loss": 3.2521, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.88338192419825, |
| "grad_norm": 0.4131433665752411, |
| "learning_rate": 0.00036161376896149354, |
| "loss": 3.2681, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.897959183673468, |
| "grad_norm": 0.385781466960907, |
| "learning_rate": 0.0003614387397899649, |
| "loss": 3.2668, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.91253644314869, |
| "grad_norm": 0.386671245098114, |
| "learning_rate": 0.0003612637106184364, |
| "loss": 3.2753, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.927113702623906, |
| "grad_norm": 0.3846534490585327, |
| "learning_rate": 0.00036108868144690777, |
| "loss": 3.2616, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.941690962099127, |
| "grad_norm": 0.38834303617477417, |
| "learning_rate": 0.0003609136522753792, |
| "loss": 3.2649, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.956268221574344, |
| "grad_norm": 0.3715721070766449, |
| "learning_rate": 0.00036073862310385057, |
| "loss": 3.2694, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.97084548104956, |
| "grad_norm": 0.35815873742103577, |
| "learning_rate": 0.00036056359393232205, |
| "loss": 3.2493, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.985422740524783, |
| "grad_norm": 0.41704729199409485, |
| "learning_rate": 0.0003603885647607934, |
| "loss": 3.2713, |
| "step": 68550 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.39834514260292053, |
| "learning_rate": 0.00036021353558926485, |
| "loss": 3.274, |
| "step": 68600 |
| }, |
| { |
| "epoch": 20.014577259475217, |
| "grad_norm": 0.40443840622901917, |
| "learning_rate": 0.0003600385064177363, |
| "loss": 3.1637, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.02915451895044, |
| "grad_norm": 0.3989391624927521, |
| "learning_rate": 0.0003598634772462077, |
| "loss": 3.161, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.043731778425656, |
| "grad_norm": 0.3869283199310303, |
| "learning_rate": 0.0003596884480746791, |
| "loss": 3.1616, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.058309037900873, |
| "grad_norm": 0.3743650019168854, |
| "learning_rate": 0.00035951341890315045, |
| "loss": 3.1794, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.072886297376094, |
| "grad_norm": 0.3653443157672882, |
| "learning_rate": 0.00035933838973162193, |
| "loss": 3.1577, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.08746355685131, |
| "grad_norm": 0.36639904975891113, |
| "learning_rate": 0.0003591633605600933, |
| "loss": 3.1708, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.102040816326532, |
| "grad_norm": 0.3923637568950653, |
| "learning_rate": 0.00035898833138856473, |
| "loss": 3.1798, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.11661807580175, |
| "grad_norm": 0.3729211091995239, |
| "learning_rate": 0.00035881330221703615, |
| "loss": 3.1777, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.11661807580175, |
| "eval_accuracy": 0.3722634052764117, |
| "eval_loss": 3.555449962615967, |
| "eval_runtime": 181.5409, |
| "eval_samples_per_second": 91.572, |
| "eval_steps_per_second": 5.723, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.131195335276967, |
| "grad_norm": 0.3827017545700073, |
| "learning_rate": 0.0003586382730455076, |
| "loss": 3.1923, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.145772594752188, |
| "grad_norm": 0.39228540658950806, |
| "learning_rate": 0.00035846324387397895, |
| "loss": 3.1933, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.160349854227405, |
| "grad_norm": 0.4029063880443573, |
| "learning_rate": 0.0003582882147024503, |
| "loss": 3.1833, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.174927113702623, |
| "grad_norm": 0.39909449219703674, |
| "learning_rate": 0.0003581131855309218, |
| "loss": 3.1918, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.189504373177844, |
| "grad_norm": 0.3867656886577606, |
| "learning_rate": 0.0003579381563593932, |
| "loss": 3.1897, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.20408163265306, |
| "grad_norm": 0.3900681734085083, |
| "learning_rate": 0.0003577631271878646, |
| "loss": 3.2048, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.21865889212828, |
| "grad_norm": 0.3752710521221161, |
| "learning_rate": 0.00035758809801633603, |
| "loss": 3.2034, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.2332361516035, |
| "grad_norm": 0.3584527373313904, |
| "learning_rate": 0.00035741306884480746, |
| "loss": 3.2036, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.247813411078717, |
| "grad_norm": 0.41200047731399536, |
| "learning_rate": 0.00035723803967327883, |
| "loss": 3.1984, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.262390670553934, |
| "grad_norm": 0.3794945478439331, |
| "learning_rate": 0.00035706301050175026, |
| "loss": 3.2055, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.276967930029155, |
| "grad_norm": 0.3582080602645874, |
| "learning_rate": 0.0003568879813302217, |
| "loss": 3.1918, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.291545189504372, |
| "grad_norm": 0.3873555064201355, |
| "learning_rate": 0.0003567129521586931, |
| "loss": 3.2128, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.306122448979593, |
| "grad_norm": 0.3871956169605255, |
| "learning_rate": 0.0003565379229871645, |
| "loss": 3.2295, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.32069970845481, |
| "grad_norm": 0.39271074533462524, |
| "learning_rate": 0.00035636289381563586, |
| "loss": 3.2124, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.335276967930028, |
| "grad_norm": 0.37518835067749023, |
| "learning_rate": 0.00035618786464410734, |
| "loss": 3.2265, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.34985422740525, |
| "grad_norm": 0.41981539130210876, |
| "learning_rate": 0.0003560128354725787, |
| "loss": 3.2192, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.364431486880466, |
| "grad_norm": 0.3705388903617859, |
| "learning_rate": 0.00035583780630105014, |
| "loss": 3.2053, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.379008746355684, |
| "grad_norm": 0.3765951693058014, |
| "learning_rate": 0.00035566277712952157, |
| "loss": 3.2039, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.393586005830905, |
| "grad_norm": 0.3718447983264923, |
| "learning_rate": 0.000355487747957993, |
| "loss": 3.2242, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.408163265306122, |
| "grad_norm": 0.3854503929615021, |
| "learning_rate": 0.00035531271878646437, |
| "loss": 3.227, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.408163265306122, |
| "eval_accuracy": 0.3724764751972486, |
| "eval_loss": 3.549376964569092, |
| "eval_runtime": 181.7953, |
| "eval_samples_per_second": 91.444, |
| "eval_steps_per_second": 5.715, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.42274052478134, |
| "grad_norm": 0.4070553183555603, |
| "learning_rate": 0.00035513768961493574, |
| "loss": 3.2329, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.43731778425656, |
| "grad_norm": 0.3540206849575043, |
| "learning_rate": 0.0003549626604434072, |
| "loss": 3.2228, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.451895043731778, |
| "grad_norm": 0.36275020241737366, |
| "learning_rate": 0.0003547876312718786, |
| "loss": 3.2215, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.466472303207, |
| "grad_norm": 0.3675984740257263, |
| "learning_rate": 0.00035461260210035, |
| "loss": 3.2286, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.481049562682216, |
| "grad_norm": 0.39644601941108704, |
| "learning_rate": 0.00035443757292882145, |
| "loss": 3.2337, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.495626822157433, |
| "grad_norm": 0.3630872070789337, |
| "learning_rate": 0.00035426254375729287, |
| "loss": 3.2301, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.510204081632654, |
| "grad_norm": 0.3842460811138153, |
| "learning_rate": 0.00035408751458576424, |
| "loss": 3.2319, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.52478134110787, |
| "grad_norm": 0.4168015122413635, |
| "learning_rate": 0.00035391248541423567, |
| "loss": 3.2215, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.53935860058309, |
| "grad_norm": 0.3930748999118805, |
| "learning_rate": 0.0003537374562427071, |
| "loss": 3.2295, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.55393586005831, |
| "grad_norm": 0.4178406894207001, |
| "learning_rate": 0.0003535624270711785, |
| "loss": 3.242, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.568513119533527, |
| "grad_norm": 0.3626347482204437, |
| "learning_rate": 0.0003533873978996499, |
| "loss": 3.2405, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.583090379008745, |
| "grad_norm": 0.3929009437561035, |
| "learning_rate": 0.0003532123687281214, |
| "loss": 3.2233, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.597667638483966, |
| "grad_norm": 0.40020325779914856, |
| "learning_rate": 0.00035303733955659275, |
| "loss": 3.2492, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.612244897959183, |
| "grad_norm": 0.39611926674842834, |
| "learning_rate": 0.0003528623103850641, |
| "loss": 3.234, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.626822157434404, |
| "grad_norm": 0.386678546667099, |
| "learning_rate": 0.00035268728121353555, |
| "loss": 3.2254, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.64139941690962, |
| "grad_norm": 0.4055517911911011, |
| "learning_rate": 0.000352512252042007, |
| "loss": 3.2339, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.65597667638484, |
| "grad_norm": 0.365493506193161, |
| "learning_rate": 0.0003523372228704784, |
| "loss": 3.2382, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.67055393586006, |
| "grad_norm": 0.3866759240627289, |
| "learning_rate": 0.0003521621936989498, |
| "loss": 3.2367, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.685131195335277, |
| "grad_norm": 0.37846946716308594, |
| "learning_rate": 0.00035198716452742126, |
| "loss": 3.2349, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.699708454810494, |
| "grad_norm": 0.3800721764564514, |
| "learning_rate": 0.00035181213535589263, |
| "loss": 3.2344, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.699708454810494, |
| "eval_accuracy": 0.37278112986858886, |
| "eval_loss": 3.5447607040405273, |
| "eval_runtime": 181.6259, |
| "eval_samples_per_second": 91.529, |
| "eval_steps_per_second": 5.721, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.714285714285715, |
| "grad_norm": 0.39008185267448425, |
| "learning_rate": 0.000351637106184364, |
| "loss": 3.2458, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.728862973760933, |
| "grad_norm": 0.36584028601646423, |
| "learning_rate": 0.00035146207701283543, |
| "loss": 3.2476, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.74344023323615, |
| "grad_norm": 0.3683364689350128, |
| "learning_rate": 0.00035128704784130686, |
| "loss": 3.2401, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.75801749271137, |
| "grad_norm": 0.36139705777168274, |
| "learning_rate": 0.0003511120186697783, |
| "loss": 3.2551, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.77259475218659, |
| "grad_norm": 0.37175899744033813, |
| "learning_rate": 0.00035093698949824966, |
| "loss": 3.249, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.78717201166181, |
| "grad_norm": 0.3636099100112915, |
| "learning_rate": 0.0003507619603267211, |
| "loss": 3.2516, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.801749271137027, |
| "grad_norm": 0.3795507252216339, |
| "learning_rate": 0.0003505869311551925, |
| "loss": 3.2419, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.816326530612244, |
| "grad_norm": 0.3986833095550537, |
| "learning_rate": 0.00035041190198366394, |
| "loss": 3.2476, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.830903790087465, |
| "grad_norm": 0.38115549087524414, |
| "learning_rate": 0.0003502368728121353, |
| "loss": 3.2498, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.845481049562682, |
| "grad_norm": 0.38317617774009705, |
| "learning_rate": 0.0003500618436406068, |
| "loss": 3.242, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.8600583090379, |
| "grad_norm": 0.3691622018814087, |
| "learning_rate": 0.00034988681446907816, |
| "loss": 3.2586, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.87463556851312, |
| "grad_norm": 0.3669494092464447, |
| "learning_rate": 0.00034971178529754954, |
| "loss": 3.2447, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.889212827988338, |
| "grad_norm": 0.37029528617858887, |
| "learning_rate": 0.00034953675612602096, |
| "loss": 3.2453, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.903790087463555, |
| "grad_norm": 0.3869434893131256, |
| "learning_rate": 0.0003493617269544924, |
| "loss": 3.2529, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.918367346938776, |
| "grad_norm": 0.3559544086456299, |
| "learning_rate": 0.0003491866977829638, |
| "loss": 3.2472, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.932944606413994, |
| "grad_norm": 0.3823868930339813, |
| "learning_rate": 0.0003490116686114352, |
| "loss": 3.2558, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.947521865889215, |
| "grad_norm": 0.35983943939208984, |
| "learning_rate": 0.00034883663943990667, |
| "loss": 3.2524, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.962099125364432, |
| "grad_norm": 0.37556958198547363, |
| "learning_rate": 0.00034866161026837804, |
| "loss": 3.253, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.97667638483965, |
| "grad_norm": 0.3773977756500244, |
| "learning_rate": 0.0003484865810968494, |
| "loss": 3.2518, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.99125364431487, |
| "grad_norm": 0.37112659215927124, |
| "learning_rate": 0.00034831155192532084, |
| "loss": 3.2637, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.99125364431487, |
| "eval_accuracy": 0.3733703094010687, |
| "eval_loss": 3.5348708629608154, |
| "eval_runtime": 181.5452, |
| "eval_samples_per_second": 91.569, |
| "eval_steps_per_second": 5.723, |
| "step": 72000 |
| }, |
| { |
| "epoch": 21.005830903790088, |
| "grad_norm": 0.3866562843322754, |
| "learning_rate": 0.00034813652275379227, |
| "loss": 3.2154, |
| "step": 72050 |
| }, |
| { |
| "epoch": 21.020408163265305, |
| "grad_norm": 0.392566055059433, |
| "learning_rate": 0.0003479614935822637, |
| "loss": 3.1381, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.034985422740526, |
| "grad_norm": 0.378366619348526, |
| "learning_rate": 0.00034778646441073507, |
| "loss": 3.1497, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.049562682215743, |
| "grad_norm": 0.40309053659439087, |
| "learning_rate": 0.00034761143523920655, |
| "loss": 3.1555, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.06413994169096, |
| "grad_norm": 0.3924875557422638, |
| "learning_rate": 0.0003474364060676779, |
| "loss": 3.1664, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.07871720116618, |
| "grad_norm": 0.4179859757423401, |
| "learning_rate": 0.00034726137689614935, |
| "loss": 3.1594, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.0932944606414, |
| "grad_norm": 0.3732832670211792, |
| "learning_rate": 0.0003470863477246207, |
| "loss": 3.1762, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.107871720116616, |
| "grad_norm": 0.364841103553772, |
| "learning_rate": 0.0003469113185530922, |
| "loss": 3.1754, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.122448979591837, |
| "grad_norm": 0.3762074112892151, |
| "learning_rate": 0.0003467362893815636, |
| "loss": 3.1719, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.137026239067055, |
| "grad_norm": 0.3844176232814789, |
| "learning_rate": 0.00034656126021003495, |
| "loss": 3.1781, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.151603498542276, |
| "grad_norm": 0.4166364073753357, |
| "learning_rate": 0.0003463862310385064, |
| "loss": 3.1648, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.166180758017493, |
| "grad_norm": 0.42058807611465454, |
| "learning_rate": 0.0003462112018669778, |
| "loss": 3.1758, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.18075801749271, |
| "grad_norm": 0.427311509847641, |
| "learning_rate": 0.00034603617269544923, |
| "loss": 3.186, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.19533527696793, |
| "grad_norm": 0.405745267868042, |
| "learning_rate": 0.0003458611435239206, |
| "loss": 3.2023, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.20991253644315, |
| "grad_norm": 0.39251014590263367, |
| "learning_rate": 0.0003456861143523921, |
| "loss": 3.1813, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.224489795918366, |
| "grad_norm": 0.39821499586105347, |
| "learning_rate": 0.00034551108518086345, |
| "loss": 3.1856, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.239067055393587, |
| "grad_norm": 0.38475996255874634, |
| "learning_rate": 0.0003453360560093348, |
| "loss": 3.1818, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.253644314868804, |
| "grad_norm": 0.41487428545951843, |
| "learning_rate": 0.00034516102683780625, |
| "loss": 3.2021, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.26822157434402, |
| "grad_norm": 0.37080344557762146, |
| "learning_rate": 0.0003449859976662777, |
| "loss": 3.1959, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.282798833819243, |
| "grad_norm": 0.40031132102012634, |
| "learning_rate": 0.0003448109684947491, |
| "loss": 3.206, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.282798833819243, |
| "eval_accuracy": 0.3722320922383219, |
| "eval_loss": 3.5537631511688232, |
| "eval_runtime": 181.5434, |
| "eval_samples_per_second": 91.57, |
| "eval_steps_per_second": 5.723, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.29737609329446, |
| "grad_norm": 0.41430559754371643, |
| "learning_rate": 0.0003446359393232205, |
| "loss": 3.2039, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.31195335276968, |
| "grad_norm": 0.4002816677093506, |
| "learning_rate": 0.00034446091015169196, |
| "loss": 3.1988, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.3265306122449, |
| "grad_norm": 0.3660390079021454, |
| "learning_rate": 0.00034428588098016333, |
| "loss": 3.1918, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.341107871720116, |
| "grad_norm": 0.38535523414611816, |
| "learning_rate": 0.00034411085180863476, |
| "loss": 3.2009, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.355685131195337, |
| "grad_norm": 0.3930363655090332, |
| "learning_rate": 0.00034393582263710613, |
| "loss": 3.2043, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.370262390670554, |
| "grad_norm": 0.3775531053543091, |
| "learning_rate": 0.0003437607934655776, |
| "loss": 3.2063, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.38483965014577, |
| "grad_norm": 0.40686893463134766, |
| "learning_rate": 0.000343585764294049, |
| "loss": 3.1895, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.399416909620992, |
| "grad_norm": 0.4103091359138489, |
| "learning_rate": 0.00034341073512252036, |
| "loss": 3.209, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.41399416909621, |
| "grad_norm": 0.36624330282211304, |
| "learning_rate": 0.00034323570595099184, |
| "loss": 3.211, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.428571428571427, |
| "grad_norm": 0.43008551001548767, |
| "learning_rate": 0.0003430606767794632, |
| "loss": 3.217, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.443148688046648, |
| "grad_norm": 0.37908637523651123, |
| "learning_rate": 0.00034288564760793464, |
| "loss": 3.221, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.457725947521865, |
| "grad_norm": 0.37392768263816833, |
| "learning_rate": 0.000342710618436406, |
| "loss": 3.212, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.472303206997086, |
| "grad_norm": 0.40657246112823486, |
| "learning_rate": 0.0003425355892648775, |
| "loss": 3.213, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.486880466472304, |
| "grad_norm": 0.3826930820941925, |
| "learning_rate": 0.00034236056009334887, |
| "loss": 3.2103, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.50145772594752, |
| "grad_norm": 0.41526952385902405, |
| "learning_rate": 0.00034218553092182024, |
| "loss": 3.2188, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.516034985422742, |
| "grad_norm": 0.3791206479072571, |
| "learning_rate": 0.00034201050175029166, |
| "loss": 3.2171, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.53061224489796, |
| "grad_norm": 0.36035192012786865, |
| "learning_rate": 0.0003418354725787631, |
| "loss": 3.2221, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.545189504373177, |
| "grad_norm": 0.4223874509334564, |
| "learning_rate": 0.0003416604434072345, |
| "loss": 3.2262, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.559766763848398, |
| "grad_norm": 0.3832871615886688, |
| "learning_rate": 0.0003414854142357059, |
| "loss": 3.22, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.574344023323615, |
| "grad_norm": 0.39104190468788147, |
| "learning_rate": 0.00034131038506417737, |
| "loss": 3.227, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.574344023323615, |
| "eval_accuracy": 0.3729764243429913, |
| "eval_loss": 3.546137809753418, |
| "eval_runtime": 182.4945, |
| "eval_samples_per_second": 91.093, |
| "eval_steps_per_second": 5.693, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.588921282798832, |
| "grad_norm": 0.4001672863960266, |
| "learning_rate": 0.00034113535589264874, |
| "loss": 3.2296, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.603498542274053, |
| "grad_norm": 0.40520548820495605, |
| "learning_rate": 0.00034096032672112017, |
| "loss": 3.2265, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.61807580174927, |
| "grad_norm": 0.36908841133117676, |
| "learning_rate": 0.00034078529754959154, |
| "loss": 3.2176, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.632653061224488, |
| "grad_norm": 0.4079746603965759, |
| "learning_rate": 0.000340610268378063, |
| "loss": 3.2411, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.64723032069971, |
| "grad_norm": 0.39124277234077454, |
| "learning_rate": 0.0003404352392065344, |
| "loss": 3.2292, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.661807580174926, |
| "grad_norm": 0.39536523818969727, |
| "learning_rate": 0.00034026021003500577, |
| "loss": 3.2268, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.676384839650147, |
| "grad_norm": 0.3833157420158386, |
| "learning_rate": 0.00034008518086347725, |
| "loss": 3.2262, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.690962099125365, |
| "grad_norm": 0.37845858931541443, |
| "learning_rate": 0.0003399101516919486, |
| "loss": 3.2272, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.705539358600582, |
| "grad_norm": 0.41482678055763245, |
| "learning_rate": 0.00033973512252042005, |
| "loss": 3.2352, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.720116618075803, |
| "grad_norm": 0.3896600306034088, |
| "learning_rate": 0.0003395600933488914, |
| "loss": 3.2399, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.73469387755102, |
| "grad_norm": 0.36860349774360657, |
| "learning_rate": 0.0003393850641773629, |
| "loss": 3.2342, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.749271137026238, |
| "grad_norm": 0.380167692899704, |
| "learning_rate": 0.0003392100350058343, |
| "loss": 3.2331, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.76384839650146, |
| "grad_norm": 0.3892786204814911, |
| "learning_rate": 0.00033903500583430565, |
| "loss": 3.2339, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.778425655976676, |
| "grad_norm": 0.3820129334926605, |
| "learning_rate": 0.00033885997666277713, |
| "loss": 3.237, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.793002915451893, |
| "grad_norm": 0.41555044054985046, |
| "learning_rate": 0.0003386849474912485, |
| "loss": 3.2339, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.807580174927114, |
| "grad_norm": 0.41377243399620056, |
| "learning_rate": 0.00033850991831971993, |
| "loss": 3.2387, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.82215743440233, |
| "grad_norm": 0.38593509793281555, |
| "learning_rate": 0.0003383348891481913, |
| "loss": 3.2413, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.836734693877553, |
| "grad_norm": 0.37622013688087463, |
| "learning_rate": 0.0003381598599766628, |
| "loss": 3.244, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.85131195335277, |
| "grad_norm": 0.38674411177635193, |
| "learning_rate": 0.00033798483080513416, |
| "loss": 3.244, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.865889212827987, |
| "grad_norm": 0.3747655749320984, |
| "learning_rate": 0.0003378098016336056, |
| "loss": 3.2301, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.865889212827987, |
| "eval_accuracy": 0.373476020334169, |
| "eval_loss": 3.5379791259765625, |
| "eval_runtime": 181.5105, |
| "eval_samples_per_second": 91.587, |
| "eval_steps_per_second": 5.724, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.88046647230321, |
| "grad_norm": 0.4235680103302002, |
| "learning_rate": 0.00033763477246207696, |
| "loss": 3.2357, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.895043731778426, |
| "grad_norm": 0.3823429048061371, |
| "learning_rate": 0.00033745974329054844, |
| "loss": 3.2449, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.909620991253643, |
| "grad_norm": 0.41767048835754395, |
| "learning_rate": 0.0003372847141190198, |
| "loss": 3.2541, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.924198250728864, |
| "grad_norm": 0.39289626479148865, |
| "learning_rate": 0.0003371096849474912, |
| "loss": 3.2522, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.93877551020408, |
| "grad_norm": 0.4047614336013794, |
| "learning_rate": 0.00033693465577596266, |
| "loss": 3.2426, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.9533527696793, |
| "grad_norm": 0.3937784731388092, |
| "learning_rate": 0.00033675962660443404, |
| "loss": 3.2358, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.96793002915452, |
| "grad_norm": 0.4089590609073639, |
| "learning_rate": 0.00033658459743290546, |
| "loss": 3.241, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.982507288629737, |
| "grad_norm": 0.4090530574321747, |
| "learning_rate": 0.00033640956826137684, |
| "loss": 3.2541, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.997084548104958, |
| "grad_norm": 0.38579609990119934, |
| "learning_rate": 0.0003362345390898483, |
| "loss": 3.2452, |
| "step": 75450 |
| }, |
| { |
| "epoch": 22.011661807580175, |
| "grad_norm": 0.3704177439212799, |
| "learning_rate": 0.0003360595099183197, |
| "loss": 3.168, |
| "step": 75500 |
| }, |
| { |
| "epoch": 22.026239067055393, |
| "grad_norm": 0.3894430100917816, |
| "learning_rate": 0.00033588448074679106, |
| "loss": 3.1481, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.040816326530614, |
| "grad_norm": 0.42577895522117615, |
| "learning_rate": 0.00033570945157526254, |
| "loss": 3.1475, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.05539358600583, |
| "grad_norm": 0.3754096031188965, |
| "learning_rate": 0.0003355344224037339, |
| "loss": 3.1548, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.06997084548105, |
| "grad_norm": 0.3804377615451813, |
| "learning_rate": 0.00033535939323220534, |
| "loss": 3.1528, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.08454810495627, |
| "grad_norm": 0.4568529427051544, |
| "learning_rate": 0.0003351843640606767, |
| "loss": 3.1464, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.099125364431487, |
| "grad_norm": 0.40900859236717224, |
| "learning_rate": 0.0003350093348891482, |
| "loss": 3.1626, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.113702623906704, |
| "grad_norm": 0.4063328802585602, |
| "learning_rate": 0.00033483430571761957, |
| "loss": 3.1597, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.128279883381925, |
| "grad_norm": 0.40163469314575195, |
| "learning_rate": 0.000334659276546091, |
| "loss": 3.1652, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.142857142857142, |
| "grad_norm": 0.40903109312057495, |
| "learning_rate": 0.0003344842473745624, |
| "loss": 3.168, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.157434402332363, |
| "grad_norm": 0.3817201256752014, |
| "learning_rate": 0.00033430921820303385, |
| "loss": 3.1727, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.157434402332363, |
| "eval_accuracy": 0.3725627626292781, |
| "eval_loss": 3.5528297424316406, |
| "eval_runtime": 181.4979, |
| "eval_samples_per_second": 91.593, |
| "eval_steps_per_second": 5.725, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.17201166180758, |
| "grad_norm": 0.3955742120742798, |
| "learning_rate": 0.0003341341890315052, |
| "loss": 3.1637, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.186588921282798, |
| "grad_norm": 0.4320707321166992, |
| "learning_rate": 0.0003339591598599766, |
| "loss": 3.1777, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.20116618075802, |
| "grad_norm": 0.3852517902851105, |
| "learning_rate": 0.0003337841306884481, |
| "loss": 3.1858, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.215743440233236, |
| "grad_norm": 0.4392651319503784, |
| "learning_rate": 0.00033360910151691945, |
| "loss": 3.1815, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.230320699708454, |
| "grad_norm": 0.39310264587402344, |
| "learning_rate": 0.0003334340723453909, |
| "loss": 3.1695, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.244897959183675, |
| "grad_norm": 0.388776570558548, |
| "learning_rate": 0.00033325904317386225, |
| "loss": 3.1865, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.259475218658892, |
| "grad_norm": 0.3850609064102173, |
| "learning_rate": 0.00033308401400233373, |
| "loss": 3.1881, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.27405247813411, |
| "grad_norm": 0.3893771171569824, |
| "learning_rate": 0.0003329089848308051, |
| "loss": 3.1883, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.28862973760933, |
| "grad_norm": 0.4042390286922455, |
| "learning_rate": 0.0003327339556592765, |
| "loss": 3.1814, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.303206997084548, |
| "grad_norm": 0.37913334369659424, |
| "learning_rate": 0.00033255892648774795, |
| "loss": 3.185, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.317784256559765, |
| "grad_norm": 0.43061673641204834, |
| "learning_rate": 0.0003323838973162193, |
| "loss": 3.2021, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.332361516034986, |
| "grad_norm": 0.47006747126579285, |
| "learning_rate": 0.00033220886814469075, |
| "loss": 3.1932, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.346938775510203, |
| "grad_norm": 0.40174809098243713, |
| "learning_rate": 0.0003320338389731621, |
| "loss": 3.2008, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.361516034985424, |
| "grad_norm": 0.3931922912597656, |
| "learning_rate": 0.0003318588098016336, |
| "loss": 3.1946, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.37609329446064, |
| "grad_norm": 0.3940417468547821, |
| "learning_rate": 0.000331683780630105, |
| "loss": 3.1994, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.39067055393586, |
| "grad_norm": 0.37044382095336914, |
| "learning_rate": 0.0003315087514585764, |
| "loss": 3.198, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.40524781341108, |
| "grad_norm": 0.41557997465133667, |
| "learning_rate": 0.00033133372228704783, |
| "loss": 3.1991, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.419825072886297, |
| "grad_norm": 0.41059786081314087, |
| "learning_rate": 0.00033115869311551926, |
| "loss": 3.1885, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.434402332361515, |
| "grad_norm": 0.38646191358566284, |
| "learning_rate": 0.00033098366394399063, |
| "loss": 3.2002, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.448979591836736, |
| "grad_norm": 0.3769567608833313, |
| "learning_rate": 0.000330808634772462, |
| "loss": 3.2089, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.448979591836736, |
| "eval_accuracy": 0.37277524395917344, |
| "eval_loss": 3.5467875003814697, |
| "eval_runtime": 181.4854, |
| "eval_samples_per_second": 91.6, |
| "eval_steps_per_second": 5.725, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.463556851311953, |
| "grad_norm": 0.3794631063938141, |
| "learning_rate": 0.0003306336056009335, |
| "loss": 3.2089, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.47813411078717, |
| "grad_norm": 0.4073309600353241, |
| "learning_rate": 0.00033045857642940486, |
| "loss": 3.1998, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.49271137026239, |
| "grad_norm": 0.40729716420173645, |
| "learning_rate": 0.0003302835472578763, |
| "loss": 3.2111, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.50728862973761, |
| "grad_norm": 0.4218553602695465, |
| "learning_rate": 0.0003301085180863477, |
| "loss": 3.2033, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.52186588921283, |
| "grad_norm": 0.410984605550766, |
| "learning_rate": 0.00032993348891481914, |
| "loss": 3.1964, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.536443148688047, |
| "grad_norm": 0.3754239082336426, |
| "learning_rate": 0.0003297584597432905, |
| "loss": 3.2193, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.551020408163264, |
| "grad_norm": 0.40258893370628357, |
| "learning_rate": 0.0003295834305717619, |
| "loss": 3.2121, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.565597667638485, |
| "grad_norm": 0.4331571161746979, |
| "learning_rate": 0.00032940840140023337, |
| "loss": 3.2096, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.580174927113703, |
| "grad_norm": 0.39986467361450195, |
| "learning_rate": 0.00032923337222870474, |
| "loss": 3.2039, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.59475218658892, |
| "grad_norm": 0.39954131841659546, |
| "learning_rate": 0.00032905834305717616, |
| "loss": 3.22, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.60932944606414, |
| "grad_norm": 0.3756577968597412, |
| "learning_rate": 0.00032888331388564754, |
| "loss": 3.2104, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.62390670553936, |
| "grad_norm": 0.43170756101608276, |
| "learning_rate": 0.000328708284714119, |
| "loss": 3.2222, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.638483965014576, |
| "grad_norm": 0.4028014838695526, |
| "learning_rate": 0.0003285332555425904, |
| "loss": 3.2124, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.653061224489797, |
| "grad_norm": 0.4260345995426178, |
| "learning_rate": 0.0003283582263710618, |
| "loss": 3.223, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.667638483965014, |
| "grad_norm": 0.40206658840179443, |
| "learning_rate": 0.00032818319719953324, |
| "loss": 3.2277, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.682215743440235, |
| "grad_norm": 0.4037342369556427, |
| "learning_rate": 0.00032800816802800467, |
| "loss": 3.2276, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.696793002915452, |
| "grad_norm": 0.3831779360771179, |
| "learning_rate": 0.00032783313885647604, |
| "loss": 3.2048, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.71137026239067, |
| "grad_norm": 0.41773656010627747, |
| "learning_rate": 0.0003276581096849474, |
| "loss": 3.2226, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.72594752186589, |
| "grad_norm": 0.40552884340286255, |
| "learning_rate": 0.0003274830805134189, |
| "loss": 3.2213, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.740524781341108, |
| "grad_norm": 0.374747097492218, |
| "learning_rate": 0.00032730805134189027, |
| "loss": 3.2303, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.740524781341108, |
| "eval_accuracy": 0.3737029810012262, |
| "eval_loss": 3.538172721862793, |
| "eval_runtime": 183.0931, |
| "eval_samples_per_second": 90.795, |
| "eval_steps_per_second": 5.675, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.755102040816325, |
| "grad_norm": 0.3842918276786804, |
| "learning_rate": 0.0003271330221703617, |
| "loss": 3.2269, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.769679300291546, |
| "grad_norm": 0.3732106387615204, |
| "learning_rate": 0.0003269579929988331, |
| "loss": 3.2353, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.784256559766764, |
| "grad_norm": 0.39892974495887756, |
| "learning_rate": 0.00032678296382730455, |
| "loss": 3.2265, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.79883381924198, |
| "grad_norm": 0.3784988224506378, |
| "learning_rate": 0.0003266079346557759, |
| "loss": 3.2361, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.813411078717202, |
| "grad_norm": 0.4117169678211212, |
| "learning_rate": 0.0003264329054842473, |
| "loss": 3.2218, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.82798833819242, |
| "grad_norm": 0.3687457740306854, |
| "learning_rate": 0.0003262578763127188, |
| "loss": 3.2365, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.842565597667637, |
| "grad_norm": 0.4006026089191437, |
| "learning_rate": 0.00032608284714119015, |
| "loss": 3.2334, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.857142857142858, |
| "grad_norm": 0.38996270298957825, |
| "learning_rate": 0.0003259078179696616, |
| "loss": 3.2307, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.871720116618075, |
| "grad_norm": 0.39698663353919983, |
| "learning_rate": 0.000325732788798133, |
| "loss": 3.2265, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.886297376093296, |
| "grad_norm": 0.38513946533203125, |
| "learning_rate": 0.00032555775962660443, |
| "loss": 3.2263, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.900874635568513, |
| "grad_norm": 0.40252456068992615, |
| "learning_rate": 0.0003253827304550758, |
| "loss": 3.2238, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.91545189504373, |
| "grad_norm": 0.3779289424419403, |
| "learning_rate": 0.00032520770128354723, |
| "loss": 3.2383, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.93002915451895, |
| "grad_norm": 0.3678314983844757, |
| "learning_rate": 0.00032503267211201866, |
| "loss": 3.2413, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.94460641399417, |
| "grad_norm": 0.3785208761692047, |
| "learning_rate": 0.0003248576429404901, |
| "loss": 3.2335, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.959183673469386, |
| "grad_norm": 0.3818875849246979, |
| "learning_rate": 0.00032468261376896146, |
| "loss": 3.2398, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.973760932944607, |
| "grad_norm": 0.40353357791900635, |
| "learning_rate": 0.00032450758459743283, |
| "loss": 3.2332, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.988338192419825, |
| "grad_norm": 0.37254422903060913, |
| "learning_rate": 0.0003243325554259043, |
| "loss": 3.2311, |
| "step": 78850 |
| }, |
| { |
| "epoch": 23.002915451895042, |
| "grad_norm": 0.4074246883392334, |
| "learning_rate": 0.0003241575262543757, |
| "loss": 3.1996, |
| "step": 78900 |
| }, |
| { |
| "epoch": 23.017492711370263, |
| "grad_norm": 0.4083156883716583, |
| "learning_rate": 0.0003239824970828471, |
| "loss": 3.129, |
| "step": 78950 |
| }, |
| { |
| "epoch": 23.03206997084548, |
| "grad_norm": 0.40061071515083313, |
| "learning_rate": 0.00032380746791131854, |
| "loss": 3.1451, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.03206997084548, |
| "eval_accuracy": 0.3729666537333617, |
| "eval_loss": 3.5513992309570312, |
| "eval_runtime": 183.3231, |
| "eval_samples_per_second": 90.681, |
| "eval_steps_per_second": 5.668, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.0466472303207, |
| "grad_norm": 0.4193076193332672, |
| "learning_rate": 0.00032363243873978996, |
| "loss": 3.1481, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.06122448979592, |
| "grad_norm": 0.42379242181777954, |
| "learning_rate": 0.00032345740956826134, |
| "loss": 3.1376, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.075801749271136, |
| "grad_norm": 0.4095369279384613, |
| "learning_rate": 0.0003232823803967327, |
| "loss": 3.1415, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.090379008746357, |
| "grad_norm": 0.3854268491268158, |
| "learning_rate": 0.0003231073512252042, |
| "loss": 3.1506, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.104956268221574, |
| "grad_norm": 0.4058293104171753, |
| "learning_rate": 0.00032293232205367556, |
| "loss": 3.1607, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.11953352769679, |
| "grad_norm": 0.40621069073677063, |
| "learning_rate": 0.000322757292882147, |
| "loss": 3.1575, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.134110787172013, |
| "grad_norm": 0.4140237867832184, |
| "learning_rate": 0.0003225822637106184, |
| "loss": 3.1554, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.14868804664723, |
| "grad_norm": 0.38233980536460876, |
| "learning_rate": 0.00032240723453908984, |
| "loss": 3.1596, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.163265306122447, |
| "grad_norm": 0.3980560600757599, |
| "learning_rate": 0.0003222322053675612, |
| "loss": 3.1618, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.17784256559767, |
| "grad_norm": 0.40507349371910095, |
| "learning_rate": 0.00032205717619603264, |
| "loss": 3.1599, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.192419825072886, |
| "grad_norm": 0.4133625030517578, |
| "learning_rate": 0.00032188214702450407, |
| "loss": 3.1661, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.206997084548107, |
| "grad_norm": 0.4333428144454956, |
| "learning_rate": 0.0003217071178529755, |
| "loss": 3.1745, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.221574344023324, |
| "grad_norm": 0.41459041833877563, |
| "learning_rate": 0.00032153208868144687, |
| "loss": 3.1547, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.23615160349854, |
| "grad_norm": 0.3677409887313843, |
| "learning_rate": 0.00032135705950991835, |
| "loss": 3.1699, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.250728862973762, |
| "grad_norm": 0.3948548138141632, |
| "learning_rate": 0.0003211820303383897, |
| "loss": 3.1748, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.26530612244898, |
| "grad_norm": 0.3827493488788605, |
| "learning_rate": 0.0003210070011668611, |
| "loss": 3.1882, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.279883381924197, |
| "grad_norm": 0.40871936082839966, |
| "learning_rate": 0.0003208319719953325, |
| "loss": 3.1778, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.294460641399418, |
| "grad_norm": 0.3977854251861572, |
| "learning_rate": 0.00032065694282380395, |
| "loss": 3.1791, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.309037900874635, |
| "grad_norm": 0.38805514574050903, |
| "learning_rate": 0.0003204819136522754, |
| "loss": 3.1588, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.323615160349853, |
| "grad_norm": 0.41924408078193665, |
| "learning_rate": 0.00032030688448074675, |
| "loss": 3.1934, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.323615160349853, |
| "eval_accuracy": 0.3729095604120325, |
| "eval_loss": 3.5495593547821045, |
| "eval_runtime": 182.8971, |
| "eval_samples_per_second": 90.893, |
| "eval_steps_per_second": 5.681, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.338192419825074, |
| "grad_norm": 0.40964633226394653, |
| "learning_rate": 0.0003201318553092181, |
| "loss": 3.1388, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.35276967930029, |
| "grad_norm": 0.40592989325523376, |
| "learning_rate": 0.0003199568261376896, |
| "loss": 3.1356, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.367346938775512, |
| "grad_norm": 0.40325120091438293, |
| "learning_rate": 0.000319781796966161, |
| "loss": 3.1501, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.38192419825073, |
| "grad_norm": 0.37463781237602234, |
| "learning_rate": 0.0003196067677946324, |
| "loss": 3.1391, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.396501457725947, |
| "grad_norm": 0.3974815309047699, |
| "learning_rate": 0.0003194317386231038, |
| "loss": 3.1471, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.411078717201168, |
| "grad_norm": 0.3901289701461792, |
| "learning_rate": 0.00031925670945157525, |
| "loss": 3.1546, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.425655976676385, |
| "grad_norm": 0.3855806291103363, |
| "learning_rate": 0.0003190816802800466, |
| "loss": 3.1602, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.440233236151602, |
| "grad_norm": 0.4171423017978668, |
| "learning_rate": 0.00031890665110851805, |
| "loss": 3.1602, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.454810495626823, |
| "grad_norm": 0.4090583920478821, |
| "learning_rate": 0.0003187316219369895, |
| "loss": 3.1586, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.46938775510204, |
| "grad_norm": 0.40589964389801025, |
| "learning_rate": 0.0003185565927654609, |
| "loss": 3.161, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.483965014577258, |
| "grad_norm": 0.4126732647418976, |
| "learning_rate": 0.0003183815635939323, |
| "loss": 3.1643, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.49854227405248, |
| "grad_norm": 0.38648244738578796, |
| "learning_rate": 0.00031820653442240376, |
| "loss": 3.1724, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.513119533527696, |
| "grad_norm": 0.4161723852157593, |
| "learning_rate": 0.00031803150525087513, |
| "loss": 3.1661, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.527696793002917, |
| "grad_norm": 0.463445782661438, |
| "learning_rate": 0.0003178564760793465, |
| "loss": 3.1686, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.542274052478135, |
| "grad_norm": 0.3802230954170227, |
| "learning_rate": 0.00031768144690781793, |
| "loss": 3.1774, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.556851311953352, |
| "grad_norm": 0.40115004777908325, |
| "learning_rate": 0.00031750641773628936, |
| "loss": 3.177, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.571428571428573, |
| "grad_norm": 0.3771075904369354, |
| "learning_rate": 0.0003173313885647608, |
| "loss": 3.1733, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.58600583090379, |
| "grad_norm": 0.4014439284801483, |
| "learning_rate": 0.00031715635939323216, |
| "loss": 3.1715, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.600583090379008, |
| "grad_norm": 0.4014812707901001, |
| "learning_rate": 0.00031698133022170364, |
| "loss": 3.1898, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.61516034985423, |
| "grad_norm": 0.3988043963909149, |
| "learning_rate": 0.000316806301050175, |
| "loss": 3.1733, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.61516034985423, |
| "eval_accuracy": 0.37280632156088667, |
| "eval_loss": 3.553154468536377, |
| "eval_runtime": 183.8151, |
| "eval_samples_per_second": 90.439, |
| "eval_steps_per_second": 5.652, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.629737609329446, |
| "grad_norm": 0.41778406500816345, |
| "learning_rate": 0.0003166312718786464, |
| "loss": 3.1837, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.644314868804663, |
| "grad_norm": 0.4018956124782562, |
| "learning_rate": 0.0003164562427071178, |
| "loss": 3.1654, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.658892128279884, |
| "grad_norm": 0.4062495529651642, |
| "learning_rate": 0.00031628121353558924, |
| "loss": 3.1817, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.6734693877551, |
| "grad_norm": 0.42387402057647705, |
| "learning_rate": 0.00031610618436406066, |
| "loss": 3.1934, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.68804664723032, |
| "grad_norm": 0.41637706756591797, |
| "learning_rate": 0.00031593115519253204, |
| "loss": 3.1909, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.70262390670554, |
| "grad_norm": 0.4212613105773926, |
| "learning_rate": 0.00031575612602100346, |
| "loss": 3.1765, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.717201166180757, |
| "grad_norm": 0.404997318983078, |
| "learning_rate": 0.0003155810968494749, |
| "loss": 3.1889, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.73177842565598, |
| "grad_norm": 0.3999521732330322, |
| "learning_rate": 0.0003154060676779463, |
| "loss": 3.1951, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.746355685131196, |
| "grad_norm": 0.4004182517528534, |
| "learning_rate": 0.0003152310385064177, |
| "loss": 3.1944, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.760932944606413, |
| "grad_norm": 0.3951795697212219, |
| "learning_rate": 0.00031505600933488917, |
| "loss": 3.1882, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.775510204081634, |
| "grad_norm": 0.4371464252471924, |
| "learning_rate": 0.00031488098016336054, |
| "loss": 3.1815, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.79008746355685, |
| "grad_norm": 0.4174312949180603, |
| "learning_rate": 0.0003147059509918319, |
| "loss": 3.1957, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.80466472303207, |
| "grad_norm": 0.37675541639328003, |
| "learning_rate": 0.00031453092182030334, |
| "loss": 3.1946, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.81924198250729, |
| "grad_norm": 0.40973252058029175, |
| "learning_rate": 0.00031435589264877477, |
| "loss": 3.1906, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.833819241982507, |
| "grad_norm": 0.41370999813079834, |
| "learning_rate": 0.0003141808634772462, |
| "loss": 3.1871, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.848396501457724, |
| "grad_norm": 0.3945741057395935, |
| "learning_rate": 0.00031400583430571757, |
| "loss": 3.2006, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.862973760932945, |
| "grad_norm": 0.409297913312912, |
| "learning_rate": 0.00031383080513418905, |
| "loss": 3.2033, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.877551020408163, |
| "grad_norm": 0.4507441818714142, |
| "learning_rate": 0.0003136557759626604, |
| "loss": 3.2035, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.892128279883384, |
| "grad_norm": 0.3964073359966278, |
| "learning_rate": 0.0003134807467911318, |
| "loss": 3.2036, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.9067055393586, |
| "grad_norm": 0.4502336084842682, |
| "learning_rate": 0.0003133057176196032, |
| "loss": 3.2024, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.9067055393586, |
| "eval_accuracy": 0.37321904152909335, |
| "eval_loss": 3.5442070960998535, |
| "eval_runtime": 183.0942, |
| "eval_samples_per_second": 90.795, |
| "eval_steps_per_second": 5.675, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.92128279883382, |
| "grad_norm": 0.3888959586620331, |
| "learning_rate": 0.00031313068844807465, |
| "loss": 3.2061, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.93586005830904, |
| "grad_norm": 0.3921671509742737, |
| "learning_rate": 0.0003129556592765461, |
| "loss": 3.2084, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.950437317784257, |
| "grad_norm": 0.40496307611465454, |
| "learning_rate": 0.00031278063010501745, |
| "loss": 3.1999, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.965014577259474, |
| "grad_norm": 0.40495121479034424, |
| "learning_rate": 0.00031260560093348893, |
| "loss": 3.2047, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.979591836734695, |
| "grad_norm": 0.3954131305217743, |
| "learning_rate": 0.0003124305717619603, |
| "loss": 3.2014, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.994169096209912, |
| "grad_norm": 0.4274619519710541, |
| "learning_rate": 0.00031225554259043173, |
| "loss": 3.2172, |
| "step": 82300 |
| }, |
| { |
| "epoch": 24.00874635568513, |
| "grad_norm": 0.4204201102256775, |
| "learning_rate": 0.0003120805134189031, |
| "loss": 3.1609, |
| "step": 82350 |
| }, |
| { |
| "epoch": 24.02332361516035, |
| "grad_norm": 0.4107939302921295, |
| "learning_rate": 0.0003119054842473746, |
| "loss": 3.1226, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.037900874635568, |
| "grad_norm": 0.7560588717460632, |
| "learning_rate": 0.00031173045507584596, |
| "loss": 3.1357, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.05247813411079, |
| "grad_norm": 0.42420998215675354, |
| "learning_rate": 0.00031155542590431733, |
| "loss": 3.1287, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.067055393586006, |
| "grad_norm": 0.3994666337966919, |
| "learning_rate": 0.00031138039673278876, |
| "loss": 3.1346, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.081632653061224, |
| "grad_norm": 0.42727458477020264, |
| "learning_rate": 0.0003112053675612602, |
| "loss": 3.1467, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.096209912536445, |
| "grad_norm": 0.4032772183418274, |
| "learning_rate": 0.0003110303383897316, |
| "loss": 3.1444, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.110787172011662, |
| "grad_norm": 0.40215247869491577, |
| "learning_rate": 0.000310855309218203, |
| "loss": 3.154, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.12536443148688, |
| "grad_norm": 0.39542341232299805, |
| "learning_rate": 0.00031068028004667446, |
| "loss": 3.1496, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.1399416909621, |
| "grad_norm": 0.4189421534538269, |
| "learning_rate": 0.00031050525087514584, |
| "loss": 3.1573, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.154518950437318, |
| "grad_norm": 0.42610371112823486, |
| "learning_rate": 0.0003103302217036172, |
| "loss": 3.1536, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.169096209912535, |
| "grad_norm": 0.41727983951568604, |
| "learning_rate": 0.00031015519253208863, |
| "loss": 3.1639, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.183673469387756, |
| "grad_norm": 0.3976685702800751, |
| "learning_rate": 0.00030998016336056006, |
| "loss": 3.1504, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.198250728862973, |
| "grad_norm": 0.42174771428108215, |
| "learning_rate": 0.0003098051341890315, |
| "loss": 3.1588, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.198250728862973, |
| "eval_accuracy": 0.3729486428505506, |
| "eval_loss": 3.5548107624053955, |
| "eval_runtime": 182.5732, |
| "eval_samples_per_second": 91.054, |
| "eval_steps_per_second": 5.691, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.21282798833819, |
| "grad_norm": 0.43244871497154236, |
| "learning_rate": 0.00030963010501750286, |
| "loss": 3.1648, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.22740524781341, |
| "grad_norm": 0.4010060727596283, |
| "learning_rate": 0.00030945507584597434, |
| "loss": 3.1591, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.24198250728863, |
| "grad_norm": 0.409901887178421, |
| "learning_rate": 0.0003092800466744457, |
| "loss": 3.1687, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.25655976676385, |
| "grad_norm": 0.41060811281204224, |
| "learning_rate": 0.00030910501750291714, |
| "loss": 3.1708, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.271137026239067, |
| "grad_norm": 0.4232666790485382, |
| "learning_rate": 0.0003089299883313885, |
| "loss": 3.1688, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.285714285714285, |
| "grad_norm": 0.39276716113090515, |
| "learning_rate": 0.00030875495915986, |
| "loss": 3.1786, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.300291545189506, |
| "grad_norm": 0.38360247015953064, |
| "learning_rate": 0.00030857992998833137, |
| "loss": 3.1669, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.314868804664723, |
| "grad_norm": 0.3872515857219696, |
| "learning_rate": 0.00030840490081680274, |
| "loss": 3.1863, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.32944606413994, |
| "grad_norm": 0.438064306974411, |
| "learning_rate": 0.0003082298716452742, |
| "loss": 3.1957, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.34402332361516, |
| "grad_norm": 0.4066021144390106, |
| "learning_rate": 0.0003080548424737456, |
| "loss": 3.1811, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.35860058309038, |
| "grad_norm": 0.3971489369869232, |
| "learning_rate": 0.000307879813302217, |
| "loss": 3.1741, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.373177842565596, |
| "grad_norm": 0.4131510853767395, |
| "learning_rate": 0.0003077047841306884, |
| "loss": 3.1878, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.387755102040817, |
| "grad_norm": 0.4342701733112335, |
| "learning_rate": 0.0003075297549591599, |
| "loss": 3.18, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.402332361516034, |
| "grad_norm": 0.4129961133003235, |
| "learning_rate": 0.00030735472578763125, |
| "loss": 3.1804, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.416909620991255, |
| "grad_norm": 0.4195544123649597, |
| "learning_rate": 0.0003071796966161026, |
| "loss": 3.1703, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.431486880466473, |
| "grad_norm": 0.4069652557373047, |
| "learning_rate": 0.00030700466744457405, |
| "loss": 3.1859, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.44606413994169, |
| "grad_norm": 0.41638901829719543, |
| "learning_rate": 0.0003068296382730455, |
| "loss": 3.1812, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.46064139941691, |
| "grad_norm": 0.3933592736721039, |
| "learning_rate": 0.0003066546091015169, |
| "loss": 3.1845, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.47521865889213, |
| "grad_norm": 0.44356077909469604, |
| "learning_rate": 0.00030647957992998827, |
| "loss": 3.1838, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.489795918367346, |
| "grad_norm": 0.39887094497680664, |
| "learning_rate": 0.00030630455075845975, |
| "loss": 3.1918, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.489795918367346, |
| "eval_accuracy": 0.3730551778109691, |
| "eval_loss": 3.548759937286377, |
| "eval_runtime": 182.4228, |
| "eval_samples_per_second": 91.129, |
| "eval_steps_per_second": 5.696, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.504373177842567, |
| "grad_norm": 0.3996953070163727, |
| "learning_rate": 0.0003061295215869311, |
| "loss": 3.186, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.518950437317784, |
| "grad_norm": 0.4084763526916504, |
| "learning_rate": 0.00030595449241540255, |
| "loss": 3.2012, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.533527696793, |
| "grad_norm": 0.4281042516231537, |
| "learning_rate": 0.0003057794632438739, |
| "loss": 3.2031, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.548104956268222, |
| "grad_norm": 0.4151524305343628, |
| "learning_rate": 0.0003056044340723454, |
| "loss": 3.1991, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.56268221574344, |
| "grad_norm": 0.41447973251342773, |
| "learning_rate": 0.0003054294049008168, |
| "loss": 3.1998, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.57725947521866, |
| "grad_norm": 0.43595123291015625, |
| "learning_rate": 0.00030525437572928815, |
| "loss": 3.2024, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.591836734693878, |
| "grad_norm": 0.392225980758667, |
| "learning_rate": 0.00030507934655775963, |
| "loss": 3.2039, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.606413994169095, |
| "grad_norm": 0.4449690580368042, |
| "learning_rate": 0.000304904317386231, |
| "loss": 3.1923, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.620991253644316, |
| "grad_norm": 0.44518622756004333, |
| "learning_rate": 0.00030472928821470243, |
| "loss": 3.1881, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.635568513119534, |
| "grad_norm": 0.411631315946579, |
| "learning_rate": 0.0003045542590431738, |
| "loss": 3.2078, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.65014577259475, |
| "grad_norm": 0.41331884264945984, |
| "learning_rate": 0.0003043792298716453, |
| "loss": 3.2033, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.664723032069972, |
| "grad_norm": 0.4580352008342743, |
| "learning_rate": 0.00030420420070011666, |
| "loss": 3.2081, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.67930029154519, |
| "grad_norm": 0.38811588287353516, |
| "learning_rate": 0.00030402917152858803, |
| "loss": 3.1988, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.693877551020407, |
| "grad_norm": 0.384888619184494, |
| "learning_rate": 0.0003038541423570595, |
| "loss": 3.2015, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.708454810495628, |
| "grad_norm": 0.4199409484863281, |
| "learning_rate": 0.0003036791131855309, |
| "loss": 3.1869, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.723032069970845, |
| "grad_norm": 0.40446436405181885, |
| "learning_rate": 0.0003035040840140023, |
| "loss": 3.1997, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.737609329446066, |
| "grad_norm": 0.3871822953224182, |
| "learning_rate": 0.0003033290548424737, |
| "loss": 3.2059, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.752186588921283, |
| "grad_norm": 0.41010650992393494, |
| "learning_rate": 0.00030315402567094517, |
| "loss": 3.2043, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.7667638483965, |
| "grad_norm": 0.3981761634349823, |
| "learning_rate": 0.00030297899649941654, |
| "loss": 3.2215, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.78134110787172, |
| "grad_norm": 0.4111586809158325, |
| "learning_rate": 0.00030280396732788796, |
| "loss": 3.204, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.78134110787172, |
| "eval_accuracy": 0.3736616619171302, |
| "eval_loss": 3.53930401802063, |
| "eval_runtime": 182.661, |
| "eval_samples_per_second": 91.01, |
| "eval_steps_per_second": 5.688, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.79591836734694, |
| "grad_norm": 0.41816040873527527, |
| "learning_rate": 0.00030262893815635934, |
| "loss": 3.2047, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.810495626822156, |
| "grad_norm": 0.3968750238418579, |
| "learning_rate": 0.0003024539089848308, |
| "loss": 3.2043, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.825072886297377, |
| "grad_norm": 0.395974338054657, |
| "learning_rate": 0.0003022788798133022, |
| "loss": 3.2157, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.839650145772595, |
| "grad_norm": 0.41137680411338806, |
| "learning_rate": 0.00030210385064177356, |
| "loss": 3.2179, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.854227405247812, |
| "grad_norm": 0.40540027618408203, |
| "learning_rate": 0.00030192882147024504, |
| "loss": 3.2213, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.868804664723033, |
| "grad_norm": 0.39145246148109436, |
| "learning_rate": 0.0003017537922987164, |
| "loss": 3.2152, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.88338192419825, |
| "grad_norm": 0.3660285770893097, |
| "learning_rate": 0.00030157876312718784, |
| "loss": 3.2211, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.897959183673468, |
| "grad_norm": 0.3978833556175232, |
| "learning_rate": 0.0003014037339556592, |
| "loss": 3.2169, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.91253644314869, |
| "grad_norm": 0.411643385887146, |
| "learning_rate": 0.0003012287047841307, |
| "loss": 3.2271, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.927113702623906, |
| "grad_norm": 0.4295453727245331, |
| "learning_rate": 0.00030105367561260207, |
| "loss": 3.2275, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.941690962099127, |
| "grad_norm": 0.40958255529403687, |
| "learning_rate": 0.00030087864644107344, |
| "loss": 3.2266, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.956268221574344, |
| "grad_norm": 0.4321279525756836, |
| "learning_rate": 0.0003007036172695449, |
| "loss": 3.2174, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.97084548104956, |
| "grad_norm": 0.4050544500350952, |
| "learning_rate": 0.0003005285880980163, |
| "loss": 3.2255, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.985422740524783, |
| "grad_norm": 0.4148904085159302, |
| "learning_rate": 0.0003003535589264877, |
| "loss": 3.2268, |
| "step": 85700 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.39421969652175903, |
| "learning_rate": 0.0003001785297549591, |
| "loss": 3.2183, |
| "step": 85750 |
| }, |
| { |
| "epoch": 25.014577259475217, |
| "grad_norm": 0.4394013285636902, |
| "learning_rate": 0.0003000035005834306, |
| "loss": 3.1087, |
| "step": 85800 |
| }, |
| { |
| "epoch": 25.02915451895044, |
| "grad_norm": 0.42048442363739014, |
| "learning_rate": 0.00029982847141190195, |
| "loss": 3.1202, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.043731778425656, |
| "grad_norm": 0.45491907000541687, |
| "learning_rate": 0.0002996534422403734, |
| "loss": 3.1221, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.058309037900873, |
| "grad_norm": 0.43091389536857605, |
| "learning_rate": 0.0002994784130688448, |
| "loss": 3.1416, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.072886297376094, |
| "grad_norm": 0.4495410621166229, |
| "learning_rate": 0.00029930338389731623, |
| "loss": 3.1397, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.072886297376094, |
| "eval_accuracy": 0.373125926442142, |
| "eval_loss": 3.553874969482422, |
| "eval_runtime": 182.5843, |
| "eval_samples_per_second": 91.048, |
| "eval_steps_per_second": 5.691, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.08746355685131, |
| "grad_norm": 0.412474125623703, |
| "learning_rate": 0.0002991283547257876, |
| "loss": 3.1423, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.102040816326532, |
| "grad_norm": 0.3976951539516449, |
| "learning_rate": 0.00029895332555425903, |
| "loss": 3.1351, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.11661807580175, |
| "grad_norm": 0.40554651618003845, |
| "learning_rate": 0.0002987782963827304, |
| "loss": 3.13, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.131195335276967, |
| "grad_norm": 0.4113754332065582, |
| "learning_rate": 0.00029860326721120183, |
| "loss": 3.1354, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.145772594752188, |
| "grad_norm": 0.39526087045669556, |
| "learning_rate": 0.00029842823803967326, |
| "loss": 3.1438, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.160349854227405, |
| "grad_norm": 0.42777061462402344, |
| "learning_rate": 0.0002982532088681447, |
| "loss": 3.1446, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.174927113702623, |
| "grad_norm": 0.41080230474472046, |
| "learning_rate": 0.0002980781796966161, |
| "loss": 3.1497, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.189504373177844, |
| "grad_norm": 0.40987899899482727, |
| "learning_rate": 0.0002979031505250875, |
| "loss": 3.166, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.20408163265306, |
| "grad_norm": 0.42429783940315247, |
| "learning_rate": 0.0002977281213535589, |
| "loss": 3.1537, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.21865889212828, |
| "grad_norm": 0.4256708323955536, |
| "learning_rate": 0.0002975530921820303, |
| "loss": 3.1527, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.2332361516035, |
| "grad_norm": 0.41000252962112427, |
| "learning_rate": 0.0002973780630105017, |
| "loss": 3.1513, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.247813411078717, |
| "grad_norm": 0.4126875102519989, |
| "learning_rate": 0.00029720303383897313, |
| "loss": 3.1622, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.262390670553934, |
| "grad_norm": 0.3998701572418213, |
| "learning_rate": 0.00029702800466744456, |
| "loss": 3.1508, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.276967930029155, |
| "grad_norm": 0.4031747877597809, |
| "learning_rate": 0.00029685297549591593, |
| "loss": 3.1588, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.291545189504372, |
| "grad_norm": 0.411139577627182, |
| "learning_rate": 0.00029667794632438736, |
| "loss": 3.1485, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.306122448979593, |
| "grad_norm": 0.41821885108947754, |
| "learning_rate": 0.0002965029171528588, |
| "loss": 3.165, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.32069970845481, |
| "grad_norm": 0.4090188443660736, |
| "learning_rate": 0.0002963278879813302, |
| "loss": 3.1456, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.335276967930028, |
| "grad_norm": 0.4112292230129242, |
| "learning_rate": 0.00029615285880980164, |
| "loss": 3.1734, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.34985422740525, |
| "grad_norm": 0.40837302803993225, |
| "learning_rate": 0.000295977829638273, |
| "loss": 3.1718, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.364431486880466, |
| "grad_norm": 0.4170677363872528, |
| "learning_rate": 0.00029580280046674444, |
| "loss": 3.1854, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.364431486880466, |
| "eval_accuracy": 0.3733238107166872, |
| "eval_loss": 3.5502431392669678, |
| "eval_runtime": 182.4009, |
| "eval_samples_per_second": 91.14, |
| "eval_steps_per_second": 5.696, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.379008746355684, |
| "grad_norm": 0.41980138421058655, |
| "learning_rate": 0.0002956277712952158, |
| "loss": 3.1741, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.393586005830905, |
| "grad_norm": 0.4188307225704193, |
| "learning_rate": 0.00029545274212368724, |
| "loss": 3.1697, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.408163265306122, |
| "grad_norm": 0.43848034739494324, |
| "learning_rate": 0.00029527771295215867, |
| "loss": 3.185, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.42274052478134, |
| "grad_norm": 0.39616858959198, |
| "learning_rate": 0.0002951026837806301, |
| "loss": 3.1797, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.43731778425656, |
| "grad_norm": 0.41143250465393066, |
| "learning_rate": 0.0002949276546091015, |
| "loss": 3.1751, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.451895043731778, |
| "grad_norm": 0.4165962338447571, |
| "learning_rate": 0.0002947526254375729, |
| "loss": 3.1736, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.466472303207, |
| "grad_norm": 0.47294145822525024, |
| "learning_rate": 0.0002945775962660443, |
| "loss": 3.181, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.481049562682216, |
| "grad_norm": 0.4045226275920868, |
| "learning_rate": 0.0002944025670945157, |
| "loss": 3.1849, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.495626822157433, |
| "grad_norm": 0.4241320788860321, |
| "learning_rate": 0.0002942275379229871, |
| "loss": 3.1866, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.510204081632654, |
| "grad_norm": 0.42185652256011963, |
| "learning_rate": 0.00029405250875145855, |
| "loss": 3.1941, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.52478134110787, |
| "grad_norm": 0.4017044007778168, |
| "learning_rate": 0.00029387747957993, |
| "loss": 3.1831, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.53935860058309, |
| "grad_norm": 0.4189952313899994, |
| "learning_rate": 0.0002937024504084014, |
| "loss": 3.187, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.55393586005831, |
| "grad_norm": 0.4023108184337616, |
| "learning_rate": 0.00029352742123687277, |
| "loss": 3.1787, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.568513119533527, |
| "grad_norm": 0.41992291808128357, |
| "learning_rate": 0.0002933523920653442, |
| "loss": 3.185, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.583090379008745, |
| "grad_norm": 0.40843623876571655, |
| "learning_rate": 0.0002931773628938156, |
| "loss": 3.1947, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.597667638483966, |
| "grad_norm": 0.474385142326355, |
| "learning_rate": 0.00029300233372228705, |
| "loss": 3.1747, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.612244897959183, |
| "grad_norm": 0.39669227600097656, |
| "learning_rate": 0.0002928273045507584, |
| "loss": 3.1836, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.626822157434404, |
| "grad_norm": 0.40173813700675964, |
| "learning_rate": 0.00029265227537922985, |
| "loss": 3.1895, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.64139941690962, |
| "grad_norm": 0.424064964056015, |
| "learning_rate": 0.0002924772462077012, |
| "loss": 3.1904, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.65597667638484, |
| "grad_norm": 0.4088822603225708, |
| "learning_rate": 0.00029230221703617265, |
| "loss": 3.1857, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.65597667638484, |
| "eval_accuracy": 0.37384942242748087, |
| "eval_loss": 3.5422239303588867, |
| "eval_runtime": 182.3439, |
| "eval_samples_per_second": 91.168, |
| "eval_steps_per_second": 5.698, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.67055393586006, |
| "grad_norm": 0.4238326847553253, |
| "learning_rate": 0.0002921271878646441, |
| "loss": 3.1843, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.685131195335277, |
| "grad_norm": 0.4173918664455414, |
| "learning_rate": 0.0002919521586931155, |
| "loss": 3.1886, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.699708454810494, |
| "grad_norm": 0.4285440444946289, |
| "learning_rate": 0.00029177712952158693, |
| "loss": 3.1955, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.714285714285715, |
| "grad_norm": 0.42694538831710815, |
| "learning_rate": 0.0002916021003500583, |
| "loss": 3.1886, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.728862973760933, |
| "grad_norm": 0.4153097867965698, |
| "learning_rate": 0.00029142707117852973, |
| "loss": 3.2084, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.74344023323615, |
| "grad_norm": 0.4131809175014496, |
| "learning_rate": 0.0002912520420070011, |
| "loss": 3.1952, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.75801749271137, |
| "grad_norm": 0.4065682888031006, |
| "learning_rate": 0.00029107701283547253, |
| "loss": 3.2032, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.77259475218659, |
| "grad_norm": 0.41387709975242615, |
| "learning_rate": 0.00029090198366394396, |
| "loss": 3.1827, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.78717201166181, |
| "grad_norm": 0.4111236333847046, |
| "learning_rate": 0.0002907269544924154, |
| "loss": 3.1942, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.801749271137027, |
| "grad_norm": 0.3958951532840729, |
| "learning_rate": 0.0002905519253208868, |
| "loss": 3.2109, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.816326530612244, |
| "grad_norm": 0.38538554310798645, |
| "learning_rate": 0.0002903768961493582, |
| "loss": 3.1974, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.830903790087465, |
| "grad_norm": 0.43065372109413147, |
| "learning_rate": 0.0002902018669778296, |
| "loss": 3.2017, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.845481049562682, |
| "grad_norm": 0.4160745143890381, |
| "learning_rate": 0.00029002683780630104, |
| "loss": 3.2081, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.8600583090379, |
| "grad_norm": 0.4086984694004059, |
| "learning_rate": 0.00028985180863477246, |
| "loss": 3.2086, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.87463556851312, |
| "grad_norm": 0.4291251003742218, |
| "learning_rate": 0.00028967677946324384, |
| "loss": 3.1999, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.889212827988338, |
| "grad_norm": 0.4759675860404968, |
| "learning_rate": 0.00028950175029171526, |
| "loss": 3.2125, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.903790087463555, |
| "grad_norm": 0.4120366871356964, |
| "learning_rate": 0.0002893267211201867, |
| "loss": 3.1932, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.918367346938776, |
| "grad_norm": 0.42245393991470337, |
| "learning_rate": 0.00028915169194865806, |
| "loss": 3.1992, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.932944606413994, |
| "grad_norm": 0.4053804874420166, |
| "learning_rate": 0.0002889766627771295, |
| "loss": 3.1935, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.947521865889215, |
| "grad_norm": 0.4270218014717102, |
| "learning_rate": 0.0002888016336056009, |
| "loss": 3.2093, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.947521865889215, |
| "eval_accuracy": 0.374267675150538, |
| "eval_loss": 3.533585786819458, |
| "eval_runtime": 182.2598, |
| "eval_samples_per_second": 91.21, |
| "eval_steps_per_second": 5.701, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.962099125364432, |
| "grad_norm": 0.4217967987060547, |
| "learning_rate": 0.00028862660443407234, |
| "loss": 3.2001, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.97667638483965, |
| "grad_norm": 0.4118807911872864, |
| "learning_rate": 0.0002884515752625437, |
| "loss": 3.2036, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.99125364431487, |
| "grad_norm": 0.42750346660614014, |
| "learning_rate": 0.00028827654609101514, |
| "loss": 3.2162, |
| "step": 89150 |
| }, |
| { |
| "epoch": 26.005830903790088, |
| "grad_norm": 0.418365478515625, |
| "learning_rate": 0.0002881015169194865, |
| "loss": 3.1788, |
| "step": 89200 |
| }, |
| { |
| "epoch": 26.020408163265305, |
| "grad_norm": 0.4194610118865967, |
| "learning_rate": 0.00028792648774795794, |
| "loss": 3.1106, |
| "step": 89250 |
| }, |
| { |
| "epoch": 26.034985422740526, |
| "grad_norm": 0.4437054991722107, |
| "learning_rate": 0.00028775145857642937, |
| "loss": 3.1061, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.049562682215743, |
| "grad_norm": 0.424712598323822, |
| "learning_rate": 0.0002875764294049008, |
| "loss": 3.1115, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.06413994169096, |
| "grad_norm": 0.39572784304618835, |
| "learning_rate": 0.0002874014002333722, |
| "loss": 3.1158, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.07871720116618, |
| "grad_norm": 0.4253915846347809, |
| "learning_rate": 0.0002872263710618436, |
| "loss": 3.1204, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.0932944606414, |
| "grad_norm": 0.4415355324745178, |
| "learning_rate": 0.000287051341890315, |
| "loss": 3.1092, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.107871720116616, |
| "grad_norm": 0.4195983111858368, |
| "learning_rate": 0.00028687631271878645, |
| "loss": 3.1289, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.122448979591837, |
| "grad_norm": 0.4112869203090668, |
| "learning_rate": 0.0002867012835472579, |
| "loss": 3.1264, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.137026239067055, |
| "grad_norm": 0.4152833819389343, |
| "learning_rate": 0.00028652625437572925, |
| "loss": 3.1271, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.151603498542276, |
| "grad_norm": 0.4177822172641754, |
| "learning_rate": 0.0002863512252042007, |
| "loss": 3.1187, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.166180758017493, |
| "grad_norm": 0.3986111581325531, |
| "learning_rate": 0.0002861761960326721, |
| "loss": 3.1502, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.18075801749271, |
| "grad_norm": 0.4292623698711395, |
| "learning_rate": 0.0002860011668611435, |
| "loss": 3.1275, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.19533527696793, |
| "grad_norm": 0.4087209701538086, |
| "learning_rate": 0.0002858261376896149, |
| "loss": 3.1387, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.20991253644315, |
| "grad_norm": 0.44936609268188477, |
| "learning_rate": 0.00028565110851808633, |
| "loss": 3.1336, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.224489795918366, |
| "grad_norm": 0.46571049094200134, |
| "learning_rate": 0.00028547607934655776, |
| "loss": 3.1375, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.239067055393587, |
| "grad_norm": 0.4057162404060364, |
| "learning_rate": 0.00028530105017502913, |
| "loss": 3.1503, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.239067055393587, |
| "eval_accuracy": 0.3732979127152595, |
| "eval_loss": 3.5532782077789307, |
| "eval_runtime": 182.1912, |
| "eval_samples_per_second": 91.245, |
| "eval_steps_per_second": 5.703, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.253644314868804, |
| "grad_norm": 0.4367867112159729, |
| "learning_rate": 0.00028512602100350055, |
| "loss": 3.1442, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.26822157434402, |
| "grad_norm": 0.45157337188720703, |
| "learning_rate": 0.000284950991831972, |
| "loss": 3.1556, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.282798833819243, |
| "grad_norm": 0.3997921347618103, |
| "learning_rate": 0.00028477596266044335, |
| "loss": 3.1671, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.29737609329446, |
| "grad_norm": 0.5283579230308533, |
| "learning_rate": 0.0002846009334889148, |
| "loss": 3.1531, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.31195335276968, |
| "grad_norm": 0.4433899223804474, |
| "learning_rate": 0.0002844259043173862, |
| "loss": 3.1575, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.3265306122449, |
| "grad_norm": 0.4299410283565521, |
| "learning_rate": 0.00028425087514585763, |
| "loss": 3.149, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.341107871720116, |
| "grad_norm": 0.39908355474472046, |
| "learning_rate": 0.000284075845974329, |
| "loss": 3.1512, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.355685131195337, |
| "grad_norm": 0.4303843080997467, |
| "learning_rate": 0.00028390081680280043, |
| "loss": 3.1551, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.370262390670554, |
| "grad_norm": 0.4121764004230499, |
| "learning_rate": 0.00028372578763127186, |
| "loss": 3.1556, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.38483965014577, |
| "grad_norm": 0.4113008677959442, |
| "learning_rate": 0.0002835507584597433, |
| "loss": 3.1623, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.399416909620992, |
| "grad_norm": 0.4334237575531006, |
| "learning_rate": 0.0002833757292882147, |
| "loss": 3.1508, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.41399416909621, |
| "grad_norm": 0.4339134693145752, |
| "learning_rate": 0.0002832007001166861, |
| "loss": 3.1664, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.428571428571427, |
| "grad_norm": 0.42791739106178284, |
| "learning_rate": 0.0002830256709451575, |
| "loss": 3.1589, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.443148688046648, |
| "grad_norm": 0.4143845736980438, |
| "learning_rate": 0.0002828506417736289, |
| "loss": 3.1752, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.457725947521865, |
| "grad_norm": 0.40137404203414917, |
| "learning_rate": 0.0002826756126021003, |
| "loss": 3.1681, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.472303206997086, |
| "grad_norm": 0.4495770335197449, |
| "learning_rate": 0.00028250058343057174, |
| "loss": 3.1732, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.486880466472304, |
| "grad_norm": 0.42186465859413147, |
| "learning_rate": 0.00028232555425904317, |
| "loss": 3.1692, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.50145772594752, |
| "grad_norm": 0.4350731074810028, |
| "learning_rate": 0.00028215052508751454, |
| "loss": 3.1882, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.516034985422742, |
| "grad_norm": 0.40870970487594604, |
| "learning_rate": 0.00028197549591598597, |
| "loss": 3.1807, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.53061224489796, |
| "grad_norm": 0.39996737241744995, |
| "learning_rate": 0.0002818004667444574, |
| "loss": 3.1896, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.53061224489796, |
| "eval_accuracy": 0.3737564250587178, |
| "eval_loss": 3.5460140705108643, |
| "eval_runtime": 182.2217, |
| "eval_samples_per_second": 91.23, |
| "eval_steps_per_second": 5.702, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.545189504373177, |
| "grad_norm": 0.43439486622810364, |
| "learning_rate": 0.00028162543757292877, |
| "loss": 3.1718, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.559766763848398, |
| "grad_norm": 0.4343793988227844, |
| "learning_rate": 0.0002814504084014002, |
| "loss": 3.181, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.574344023323615, |
| "grad_norm": 0.4268733263015747, |
| "learning_rate": 0.0002812753792298716, |
| "loss": 3.1965, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.588921282798832, |
| "grad_norm": 0.42242977023124695, |
| "learning_rate": 0.00028110035005834305, |
| "loss": 3.1764, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.603498542274053, |
| "grad_norm": 0.4688321053981781, |
| "learning_rate": 0.0002809253208868144, |
| "loss": 3.1757, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.61807580174927, |
| "grad_norm": 0.4145612120628357, |
| "learning_rate": 0.00028075029171528585, |
| "loss": 3.18, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.632653061224488, |
| "grad_norm": 0.4039236009120941, |
| "learning_rate": 0.00028057526254375727, |
| "loss": 3.185, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.64723032069971, |
| "grad_norm": 0.44142764806747437, |
| "learning_rate": 0.0002804002333722287, |
| "loss": 3.1799, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.661807580174926, |
| "grad_norm": 0.45887356996536255, |
| "learning_rate": 0.0002802252042007001, |
| "loss": 3.1985, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.676384839650147, |
| "grad_norm": 0.4100106954574585, |
| "learning_rate": 0.0002800501750291715, |
| "loss": 3.1803, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.690962099125365, |
| "grad_norm": 0.3970423638820648, |
| "learning_rate": 0.0002798751458576429, |
| "loss": 3.2064, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.705539358600582, |
| "grad_norm": 0.43205025792121887, |
| "learning_rate": 0.0002797001166861143, |
| "loss": 3.1793, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.720116618075803, |
| "grad_norm": 0.405561238527298, |
| "learning_rate": 0.0002795250875145857, |
| "loss": 3.1937, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.73469387755102, |
| "grad_norm": 0.4518439769744873, |
| "learning_rate": 0.00027935005834305715, |
| "loss": 3.1808, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.749271137026238, |
| "grad_norm": 0.41998225450515747, |
| "learning_rate": 0.0002791750291715286, |
| "loss": 3.1911, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.76384839650146, |
| "grad_norm": 0.41252264380455017, |
| "learning_rate": 0.000279, |
| "loss": 3.1918, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.778425655976676, |
| "grad_norm": 0.3939790427684784, |
| "learning_rate": 0.0002788249708284714, |
| "loss": 3.1961, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.793002915451893, |
| "grad_norm": 0.42222079634666443, |
| "learning_rate": 0.0002786499416569428, |
| "loss": 3.1969, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.807580174927114, |
| "grad_norm": 0.45875927805900574, |
| "learning_rate": 0.0002784749124854142, |
| "loss": 3.1919, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.82215743440233, |
| "grad_norm": 0.43025851249694824, |
| "learning_rate": 0.0002782998833138856, |
| "loss": 3.182, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.82215743440233, |
| "eval_accuracy": 0.3743816263568198, |
| "eval_loss": 3.536444902420044, |
| "eval_runtime": 182.341, |
| "eval_samples_per_second": 91.17, |
| "eval_steps_per_second": 5.698, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.836734693877553, |
| "grad_norm": 0.39891061186790466, |
| "learning_rate": 0.00027812485414235703, |
| "loss": 3.206, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.85131195335277, |
| "grad_norm": 0.4320659935474396, |
| "learning_rate": 0.00027794982497082846, |
| "loss": 3.2002, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.865889212827987, |
| "grad_norm": 0.4368121027946472, |
| "learning_rate": 0.00027777479579929983, |
| "loss": 3.1955, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.88046647230321, |
| "grad_norm": 0.41299423575401306, |
| "learning_rate": 0.00027759976662777126, |
| "loss": 3.1841, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.895043731778426, |
| "grad_norm": 0.46896249055862427, |
| "learning_rate": 0.0002774247374562427, |
| "loss": 3.1841, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.909620991253643, |
| "grad_norm": 0.41294729709625244, |
| "learning_rate": 0.0002772497082847141, |
| "loss": 3.1996, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.924198250728864, |
| "grad_norm": 0.41092661023139954, |
| "learning_rate": 0.00027707467911318554, |
| "loss": 3.1942, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.93877551020408, |
| "grad_norm": 0.42133837938308716, |
| "learning_rate": 0.0002768996499416569, |
| "loss": 3.1925, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.9533527696793, |
| "grad_norm": 0.4176836609840393, |
| "learning_rate": 0.00027672462077012834, |
| "loss": 3.1996, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.96793002915452, |
| "grad_norm": 0.41681262850761414, |
| "learning_rate": 0.0002765495915985997, |
| "loss": 3.2085, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.982507288629737, |
| "grad_norm": 0.4322628080844879, |
| "learning_rate": 0.00027637456242707114, |
| "loss": 3.1991, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.997084548104958, |
| "grad_norm": 0.4040561318397522, |
| "learning_rate": 0.00027619953325554256, |
| "loss": 3.1959, |
| "step": 92600 |
| }, |
| { |
| "epoch": 27.011661807580175, |
| "grad_norm": 0.4253135919570923, |
| "learning_rate": 0.000276024504084014, |
| "loss": 3.1227, |
| "step": 92650 |
| }, |
| { |
| "epoch": 27.026239067055393, |
| "grad_norm": 0.406501829624176, |
| "learning_rate": 0.0002758494749124854, |
| "loss": 3.1051, |
| "step": 92700 |
| }, |
| { |
| "epoch": 27.040816326530614, |
| "grad_norm": 0.42616963386535645, |
| "learning_rate": 0.0002756744457409568, |
| "loss": 3.1009, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.05539358600583, |
| "grad_norm": 0.41401493549346924, |
| "learning_rate": 0.0002754994165694282, |
| "loss": 3.1149, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.06997084548105, |
| "grad_norm": 0.46265655755996704, |
| "learning_rate": 0.0002753243873978996, |
| "loss": 3.1054, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.08454810495627, |
| "grad_norm": 0.4060516059398651, |
| "learning_rate": 0.000275149358226371, |
| "loss": 3.1261, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.099125364431487, |
| "grad_norm": 0.4009002149105072, |
| "learning_rate": 0.00027497432905484244, |
| "loss": 3.1259, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.113702623906704, |
| "grad_norm": 0.40886712074279785, |
| "learning_rate": 0.00027479929988331387, |
| "loss": 3.132, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.113702623906704, |
| "eval_accuracy": 0.37335924389136776, |
| "eval_loss": 3.5546579360961914, |
| "eval_runtime": 182.3175, |
| "eval_samples_per_second": 91.182, |
| "eval_steps_per_second": 5.699, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.128279883381925, |
| "grad_norm": 0.44393426179885864, |
| "learning_rate": 0.0002746242707117853, |
| "loss": 3.1276, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.142857142857142, |
| "grad_norm": 0.4561188519001007, |
| "learning_rate": 0.00027444924154025667, |
| "loss": 3.1334, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.157434402332363, |
| "grad_norm": 0.41121312975883484, |
| "learning_rate": 0.0002742742123687281, |
| "loss": 3.1328, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.17201166180758, |
| "grad_norm": 0.41144025325775146, |
| "learning_rate": 0.0002740991831971995, |
| "loss": 3.1183, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.186588921282798, |
| "grad_norm": 0.42008209228515625, |
| "learning_rate": 0.00027392415402567095, |
| "loss": 3.1315, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.20116618075802, |
| "grad_norm": 0.3998580276966095, |
| "learning_rate": 0.0002737491248541423, |
| "loss": 3.1425, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.215743440233236, |
| "grad_norm": 0.4630013108253479, |
| "learning_rate": 0.00027357409568261375, |
| "loss": 3.1392, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.230320699708454, |
| "grad_norm": 0.4226396381855011, |
| "learning_rate": 0.0002733990665110851, |
| "loss": 3.1364, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.244897959183675, |
| "grad_norm": 0.41518664360046387, |
| "learning_rate": 0.00027322403733955655, |
| "loss": 3.1334, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.259475218658892, |
| "grad_norm": 0.42913633584976196, |
| "learning_rate": 0.000273049008168028, |
| "loss": 3.1365, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.27405247813411, |
| "grad_norm": 0.42923083901405334, |
| "learning_rate": 0.0002728739789964994, |
| "loss": 3.1459, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.28862973760933, |
| "grad_norm": 0.4026493728160858, |
| "learning_rate": 0.00027269894982497083, |
| "loss": 3.1444, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.303206997084548, |
| "grad_norm": 0.41380855441093445, |
| "learning_rate": 0.0002725239206534422, |
| "loss": 3.1418, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.317784256559765, |
| "grad_norm": 0.3932145833969116, |
| "learning_rate": 0.00027234889148191363, |
| "loss": 3.1472, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.332361516034986, |
| "grad_norm": 0.39401277899742126, |
| "learning_rate": 0.000272173862310385, |
| "loss": 3.1552, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.346938775510203, |
| "grad_norm": 0.42028194665908813, |
| "learning_rate": 0.00027199883313885643, |
| "loss": 3.1562, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.361516034985424, |
| "grad_norm": 0.4550812244415283, |
| "learning_rate": 0.00027182380396732785, |
| "loss": 3.1481, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.37609329446064, |
| "grad_norm": 0.44632673263549805, |
| "learning_rate": 0.0002716487747957993, |
| "loss": 3.1464, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.39067055393586, |
| "grad_norm": 0.42995485663414, |
| "learning_rate": 0.0002714737456242707, |
| "loss": 3.1464, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.40524781341108, |
| "grad_norm": 0.43392640352249146, |
| "learning_rate": 0.0002712987164527421, |
| "loss": 3.1532, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.40524781341108, |
| "eval_accuracy": 0.3739876235805541, |
| "eval_loss": 3.5490474700927734, |
| "eval_runtime": 182.2581, |
| "eval_samples_per_second": 91.211, |
| "eval_steps_per_second": 5.701, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.419825072886297, |
| "grad_norm": 0.47143322229385376, |
| "learning_rate": 0.0002711236872812135, |
| "loss": 3.1634, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.434402332361515, |
| "grad_norm": 0.42636147141456604, |
| "learning_rate": 0.00027094865810968493, |
| "loss": 3.1727, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.448979591836736, |
| "grad_norm": 0.42488712072372437, |
| "learning_rate": 0.00027077362893815636, |
| "loss": 3.1599, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.463556851311953, |
| "grad_norm": 0.41161873936653137, |
| "learning_rate": 0.00027059859976662773, |
| "loss": 3.1518, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.47813411078717, |
| "grad_norm": 0.4553752541542053, |
| "learning_rate": 0.00027042357059509916, |
| "loss": 3.1597, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.49271137026239, |
| "grad_norm": 0.44867464900016785, |
| "learning_rate": 0.0002702485414235706, |
| "loss": 3.1586, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.50728862973761, |
| "grad_norm": 0.4399424195289612, |
| "learning_rate": 0.00027007351225204196, |
| "loss": 3.1653, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.52186588921283, |
| "grad_norm": 0.40725404024124146, |
| "learning_rate": 0.0002698984830805134, |
| "loss": 3.1611, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.536443148688047, |
| "grad_norm": 0.4193655252456665, |
| "learning_rate": 0.0002697234539089848, |
| "loss": 3.1653, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.551020408163264, |
| "grad_norm": 0.4339175224304199, |
| "learning_rate": 0.00026954842473745624, |
| "loss": 3.1616, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.565597667638485, |
| "grad_norm": 0.44409579038619995, |
| "learning_rate": 0.0002693733955659276, |
| "loss": 3.1797, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.580174927113703, |
| "grad_norm": 0.43605875968933105, |
| "learning_rate": 0.00026919836639439904, |
| "loss": 3.1683, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.59475218658892, |
| "grad_norm": 0.4129311442375183, |
| "learning_rate": 0.0002690233372228704, |
| "loss": 3.1685, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.60932944606414, |
| "grad_norm": 0.4296424388885498, |
| "learning_rate": 0.00026884830805134184, |
| "loss": 3.1754, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.62390670553936, |
| "grad_norm": 0.4571221172809601, |
| "learning_rate": 0.00026867327887981327, |
| "loss": 3.1584, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.638483965014576, |
| "grad_norm": 0.4272068440914154, |
| "learning_rate": 0.0002684982497082847, |
| "loss": 3.1751, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.653061224489797, |
| "grad_norm": 0.41919034719467163, |
| "learning_rate": 0.0002683232205367561, |
| "loss": 3.1656, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.667638483965014, |
| "grad_norm": 0.42660972476005554, |
| "learning_rate": 0.0002681481913652275, |
| "loss": 3.1793, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.682215743440235, |
| "grad_norm": 0.4040818214416504, |
| "learning_rate": 0.0002679731621936989, |
| "loss": 3.1682, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.696793002915452, |
| "grad_norm": 0.4299778342247009, |
| "learning_rate": 0.00026779813302217035, |
| "loss": 3.183, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.696793002915452, |
| "eval_accuracy": 0.374235891239695, |
| "eval_loss": 3.542213201522827, |
| "eval_runtime": 182.3118, |
| "eval_samples_per_second": 91.184, |
| "eval_steps_per_second": 5.699, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.71137026239067, |
| "grad_norm": 0.417683869600296, |
| "learning_rate": 0.00026762310385064177, |
| "loss": 3.1681, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.72594752186589, |
| "grad_norm": 0.46521857380867004, |
| "learning_rate": 0.0002674480746791132, |
| "loss": 3.1957, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.740524781341108, |
| "grad_norm": 0.43779146671295166, |
| "learning_rate": 0.00026727304550758457, |
| "loss": 3.174, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.755102040816325, |
| "grad_norm": 0.43895670771598816, |
| "learning_rate": 0.000267098016336056, |
| "loss": 3.1767, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.769679300291546, |
| "grad_norm": 0.4087444245815277, |
| "learning_rate": 0.00026692298716452737, |
| "loss": 3.1691, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.784256559766764, |
| "grad_norm": 0.4138129949569702, |
| "learning_rate": 0.0002667479579929988, |
| "loss": 3.1816, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.79883381924198, |
| "grad_norm": 0.4436318278312683, |
| "learning_rate": 0.0002665729288214702, |
| "loss": 3.1838, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.813411078717202, |
| "grad_norm": 0.40394556522369385, |
| "learning_rate": 0.00026639789964994165, |
| "loss": 3.1816, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.82798833819242, |
| "grad_norm": 0.3930664360523224, |
| "learning_rate": 0.000266222870478413, |
| "loss": 3.1737, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.842565597667637, |
| "grad_norm": 0.41876089572906494, |
| "learning_rate": 0.00026604784130688445, |
| "loss": 3.1655, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.857142857142858, |
| "grad_norm": 0.4264007806777954, |
| "learning_rate": 0.0002658728121353559, |
| "loss": 3.1931, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.871720116618075, |
| "grad_norm": 0.4150126874446869, |
| "learning_rate": 0.00026569778296382725, |
| "loss": 3.1939, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.886297376093296, |
| "grad_norm": 0.42122021317481995, |
| "learning_rate": 0.0002655227537922987, |
| "loss": 3.1902, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.900874635568513, |
| "grad_norm": 0.41215264797210693, |
| "learning_rate": 0.0002653477246207701, |
| "loss": 3.1878, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.91545189504373, |
| "grad_norm": 0.4309203624725342, |
| "learning_rate": 0.00026517269544924153, |
| "loss": 3.181, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.93002915451895, |
| "grad_norm": 0.42477867007255554, |
| "learning_rate": 0.0002649976662777129, |
| "loss": 3.1958, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.94460641399417, |
| "grad_norm": 0.44038382172584534, |
| "learning_rate": 0.00026482263710618433, |
| "loss": 3.1817, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.959183673469386, |
| "grad_norm": 0.41017380356788635, |
| "learning_rate": 0.00026464760793465576, |
| "loss": 3.1799, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.973760932944607, |
| "grad_norm": 0.40887367725372314, |
| "learning_rate": 0.0002644725787631272, |
| "loss": 3.1856, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.988338192419825, |
| "grad_norm": 0.4441905915737152, |
| "learning_rate": 0.0002642975495915986, |
| "loss": 3.1996, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.988338192419825, |
| "eval_accuracy": 0.3744487257241552, |
| "eval_loss": 3.5362908840179443, |
| "eval_runtime": 182.2992, |
| "eval_samples_per_second": 91.191, |
| "eval_steps_per_second": 5.699, |
| "step": 96000 |
| }, |
| { |
| "epoch": 28.002915451895042, |
| "grad_norm": 0.4339207112789154, |
| "learning_rate": 0.00026412252042007, |
| "loss": 3.1705, |
| "step": 96050 |
| }, |
| { |
| "epoch": 28.017492711370263, |
| "grad_norm": 0.4093478322029114, |
| "learning_rate": 0.0002639474912485414, |
| "loss": 3.0895, |
| "step": 96100 |
| }, |
| { |
| "epoch": 28.03206997084548, |
| "grad_norm": 0.41591787338256836, |
| "learning_rate": 0.0002637724620770128, |
| "loss": 3.0936, |
| "step": 96150 |
| }, |
| { |
| "epoch": 28.0466472303207, |
| "grad_norm": 0.41636621952056885, |
| "learning_rate": 0.0002635974329054842, |
| "loss": 3.0817, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.06122448979592, |
| "grad_norm": 0.4211430847644806, |
| "learning_rate": 0.00026342240373395564, |
| "loss": 3.1026, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.075801749271136, |
| "grad_norm": 0.4202626049518585, |
| "learning_rate": 0.00026324737456242706, |
| "loss": 3.1065, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.090379008746357, |
| "grad_norm": 0.4492689371109009, |
| "learning_rate": 0.0002630723453908985, |
| "loss": 3.1069, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.104956268221574, |
| "grad_norm": 0.4519897997379303, |
| "learning_rate": 0.00026289731621936986, |
| "loss": 3.1109, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.11953352769679, |
| "grad_norm": 0.43334949016571045, |
| "learning_rate": 0.0002627222870478413, |
| "loss": 3.1146, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.134110787172013, |
| "grad_norm": 0.44164830446243286, |
| "learning_rate": 0.00026254725787631266, |
| "loss": 3.1177, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.14868804664723, |
| "grad_norm": 0.4185495972633362, |
| "learning_rate": 0.0002623722287047841, |
| "loss": 3.1323, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.163265306122447, |
| "grad_norm": 0.42650115489959717, |
| "learning_rate": 0.0002621971995332555, |
| "loss": 3.1329, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.17784256559767, |
| "grad_norm": 0.4367852807044983, |
| "learning_rate": 0.00026202217036172694, |
| "loss": 3.1257, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.192419825072886, |
| "grad_norm": 0.45244693756103516, |
| "learning_rate": 0.0002618471411901983, |
| "loss": 3.1137, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.206997084548107, |
| "grad_norm": 0.4422908425331116, |
| "learning_rate": 0.00026167211201866974, |
| "loss": 3.1321, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.221574344023324, |
| "grad_norm": 0.43210363388061523, |
| "learning_rate": 0.00026149708284714117, |
| "loss": 3.1408, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.23615160349854, |
| "grad_norm": 0.45949944853782654, |
| "learning_rate": 0.0002613220536756126, |
| "loss": 3.1283, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.250728862973762, |
| "grad_norm": 0.41855332255363464, |
| "learning_rate": 0.000261147024504084, |
| "loss": 3.1189, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.26530612244898, |
| "grad_norm": 0.42870011925697327, |
| "learning_rate": 0.0002609719953325554, |
| "loss": 3.1422, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.279883381924197, |
| "grad_norm": 0.43530869483947754, |
| "learning_rate": 0.0002607969661610268, |
| "loss": 3.1371, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.279883381924197, |
| "eval_accuracy": 0.3735036841084213, |
| "eval_loss": 3.553114652633667, |
| "eval_runtime": 182.2048, |
| "eval_samples_per_second": 91.238, |
| "eval_steps_per_second": 5.702, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.294460641399418, |
| "grad_norm": 0.42264923453330994, |
| "learning_rate": 0.0002606219369894982, |
| "loss": 3.1177, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.309037900874635, |
| "grad_norm": 0.4130590260028839, |
| "learning_rate": 0.0002604469078179696, |
| "loss": 3.1388, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.323615160349853, |
| "grad_norm": 0.40744999051094055, |
| "learning_rate": 0.00026027187864644105, |
| "loss": 3.1388, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.338192419825074, |
| "grad_norm": 0.46660253405570984, |
| "learning_rate": 0.0002600968494749125, |
| "loss": 3.1389, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.35276967930029, |
| "grad_norm": 0.43974146246910095, |
| "learning_rate": 0.0002599218203033839, |
| "loss": 3.1489, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.367346938775512, |
| "grad_norm": 0.43621718883514404, |
| "learning_rate": 0.0002597467911318553, |
| "loss": 3.1529, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.38192419825073, |
| "grad_norm": 0.42073875665664673, |
| "learning_rate": 0.0002595717619603267, |
| "loss": 3.1414, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.396501457725947, |
| "grad_norm": 0.4213981032371521, |
| "learning_rate": 0.0002593967327887981, |
| "loss": 3.1533, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.411078717201168, |
| "grad_norm": 0.4182080030441284, |
| "learning_rate": 0.0002592217036172695, |
| "loss": 3.1385, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.425655976676385, |
| "grad_norm": 0.4123639464378357, |
| "learning_rate": 0.00025904667444574093, |
| "loss": 3.1479, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.440233236151602, |
| "grad_norm": 0.4333662688732147, |
| "learning_rate": 0.00025887164527421235, |
| "loss": 3.1448, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.454810495626823, |
| "grad_norm": 0.4333436191082001, |
| "learning_rate": 0.0002586966161026838, |
| "loss": 3.1457, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.46938775510204, |
| "grad_norm": 0.4152778089046478, |
| "learning_rate": 0.00025852158693115515, |
| "loss": 3.1555, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.483965014577258, |
| "grad_norm": 0.4503307044506073, |
| "learning_rate": 0.0002583465577596266, |
| "loss": 3.1413, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.49854227405248, |
| "grad_norm": 0.4560645520687103, |
| "learning_rate": 0.000258171528588098, |
| "loss": 3.1536, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.513119533527696, |
| "grad_norm": 0.4178006947040558, |
| "learning_rate": 0.00025799649941656943, |
| "loss": 3.1502, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.527696793002917, |
| "grad_norm": 0.4329889416694641, |
| "learning_rate": 0.0002578214702450408, |
| "loss": 3.1638, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.542274052478135, |
| "grad_norm": 0.42388278245925903, |
| "learning_rate": 0.00025764644107351223, |
| "loss": 3.1715, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.556851311953352, |
| "grad_norm": 0.43577128648757935, |
| "learning_rate": 0.0002574714119019836, |
| "loss": 3.1687, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.571428571428573, |
| "grad_norm": 0.4220936894416809, |
| "learning_rate": 0.00025729638273045503, |
| "loss": 3.1674, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.571428571428573, |
| "eval_accuracy": 0.3740367120650784, |
| "eval_loss": 3.5461931228637695, |
| "eval_runtime": 182.2109, |
| "eval_samples_per_second": 91.235, |
| "eval_steps_per_second": 5.702, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.58600583090379, |
| "grad_norm": 0.43830054998397827, |
| "learning_rate": 0.00025712135355892646, |
| "loss": 3.1637, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.600583090379008, |
| "grad_norm": 0.4078274071216583, |
| "learning_rate": 0.0002569463243873979, |
| "loss": 3.16, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.61516034985423, |
| "grad_norm": 0.4261452555656433, |
| "learning_rate": 0.0002567712952158693, |
| "loss": 3.1605, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.629737609329446, |
| "grad_norm": 0.41544103622436523, |
| "learning_rate": 0.0002565962660443407, |
| "loss": 3.1539, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.644314868804663, |
| "grad_norm": 0.45621052384376526, |
| "learning_rate": 0.0002564212368728121, |
| "loss": 3.1737, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.658892128279884, |
| "grad_norm": 0.4310225248336792, |
| "learning_rate": 0.0002562462077012835, |
| "loss": 3.1557, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.6734693877551, |
| "grad_norm": 0.4316820800304413, |
| "learning_rate": 0.0002560711785297549, |
| "loss": 3.1647, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.68804664723032, |
| "grad_norm": 0.43771031498908997, |
| "learning_rate": 0.00025589614935822634, |
| "loss": 3.1694, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.70262390670554, |
| "grad_norm": 0.4118792414665222, |
| "learning_rate": 0.00025572112018669777, |
| "loss": 3.1737, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.717201166180757, |
| "grad_norm": 0.4393903315067291, |
| "learning_rate": 0.0002555460910151692, |
| "loss": 3.1775, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.73177842565598, |
| "grad_norm": 0.43764179944992065, |
| "learning_rate": 0.00025537106184364057, |
| "loss": 3.1672, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.746355685131196, |
| "grad_norm": 0.4366304874420166, |
| "learning_rate": 0.000255196032672112, |
| "loss": 3.169, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.760932944606413, |
| "grad_norm": 0.45967209339141846, |
| "learning_rate": 0.0002550210035005834, |
| "loss": 3.1722, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.775510204081634, |
| "grad_norm": 0.43880704045295715, |
| "learning_rate": 0.00025484597432905485, |
| "loss": 3.1695, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.79008746355685, |
| "grad_norm": 0.43362289667129517, |
| "learning_rate": 0.0002546709451575262, |
| "loss": 3.1679, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.80466472303207, |
| "grad_norm": 0.4177137613296509, |
| "learning_rate": 0.00025449591598599765, |
| "loss": 3.1803, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.81924198250729, |
| "grad_norm": 0.4206119179725647, |
| "learning_rate": 0.00025432088681446907, |
| "loss": 3.1719, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.833819241982507, |
| "grad_norm": 0.41267478466033936, |
| "learning_rate": 0.00025414585764294044, |
| "loss": 3.1715, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.848396501457724, |
| "grad_norm": 0.44268399477005005, |
| "learning_rate": 0.00025397082847141187, |
| "loss": 3.1749, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.862973760932945, |
| "grad_norm": 0.4493129849433899, |
| "learning_rate": 0.0002537957992998833, |
| "loss": 3.1818, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.862973760932945, |
| "eval_accuracy": 0.3747328974307299, |
| "eval_loss": 3.538551092147827, |
| "eval_runtime": 182.3234, |
| "eval_samples_per_second": 91.179, |
| "eval_steps_per_second": 5.699, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.877551020408163, |
| "grad_norm": 0.4239693880081177, |
| "learning_rate": 0.0002536207701283547, |
| "loss": 3.1753, |
| "step": 99050 |
| }, |
| { |
| "epoch": 28.892128279883384, |
| "grad_norm": 0.4513675570487976, |
| "learning_rate": 0.0002534457409568261, |
| "loss": 3.1725, |
| "step": 99100 |
| }, |
| { |
| "epoch": 28.9067055393586, |
| "grad_norm": 0.40344876050949097, |
| "learning_rate": 0.0002532707117852975, |
| "loss": 3.1764, |
| "step": 99150 |
| }, |
| { |
| "epoch": 28.92128279883382, |
| "grad_norm": 0.43702664971351624, |
| "learning_rate": 0.00025309568261376895, |
| "loss": 3.1875, |
| "step": 99200 |
| }, |
| { |
| "epoch": 28.93586005830904, |
| "grad_norm": 0.4061313271522522, |
| "learning_rate": 0.0002529206534422403, |
| "loss": 3.1757, |
| "step": 99250 |
| }, |
| { |
| "epoch": 28.950437317784257, |
| "grad_norm": 0.43192869424819946, |
| "learning_rate": 0.00025274562427071175, |
| "loss": 3.1771, |
| "step": 99300 |
| }, |
| { |
| "epoch": 28.965014577259474, |
| "grad_norm": 0.4228961169719696, |
| "learning_rate": 0.0002525705950991832, |
| "loss": 3.1744, |
| "step": 99350 |
| }, |
| { |
| "epoch": 28.979591836734695, |
| "grad_norm": 0.4422495365142822, |
| "learning_rate": 0.0002523955659276546, |
| "loss": 3.179, |
| "step": 99400 |
| }, |
| { |
| "epoch": 28.994169096209912, |
| "grad_norm": 0.4269513487815857, |
| "learning_rate": 0.000252220536756126, |
| "loss": 3.1753, |
| "step": 99450 |
| }, |
| { |
| "epoch": 29.00874635568513, |
| "grad_norm": 0.48027440905570984, |
| "learning_rate": 0.0002520455075845974, |
| "loss": 3.1244, |
| "step": 99500 |
| }, |
| { |
| "epoch": 29.02332361516035, |
| "grad_norm": 0.45419687032699585, |
| "learning_rate": 0.00025187047841306883, |
| "loss": 3.0918, |
| "step": 99550 |
| }, |
| { |
| "epoch": 29.037900874635568, |
| "grad_norm": 0.45154982805252075, |
| "learning_rate": 0.00025169544924154026, |
| "loss": 3.0949, |
| "step": 99600 |
| }, |
| { |
| "epoch": 29.05247813411079, |
| "grad_norm": 0.455363005399704, |
| "learning_rate": 0.00025152042007001163, |
| "loss": 3.0937, |
| "step": 99650 |
| }, |
| { |
| "epoch": 29.067055393586006, |
| "grad_norm": 0.4598212242126465, |
| "learning_rate": 0.00025134539089848306, |
| "loss": 3.0988, |
| "step": 99700 |
| }, |
| { |
| "epoch": 29.081632653061224, |
| "grad_norm": 0.4252084791660309, |
| "learning_rate": 0.0002511703617269545, |
| "loss": 3.1003, |
| "step": 99750 |
| }, |
| { |
| "epoch": 29.096209912536445, |
| "grad_norm": 0.4729098081588745, |
| "learning_rate": 0.00025099533255542586, |
| "loss": 3.092, |
| "step": 99800 |
| }, |
| { |
| "epoch": 29.110787172011662, |
| "grad_norm": 0.4432501792907715, |
| "learning_rate": 0.0002508203033838973, |
| "loss": 3.1063, |
| "step": 99850 |
| }, |
| { |
| "epoch": 29.12536443148688, |
| "grad_norm": 0.42664968967437744, |
| "learning_rate": 0.0002506452742123687, |
| "loss": 3.0943, |
| "step": 99900 |
| }, |
| { |
| "epoch": 29.1399416909621, |
| "grad_norm": 0.4493328332901001, |
| "learning_rate": 0.00025047024504084014, |
| "loss": 3.108, |
| "step": 99950 |
| }, |
| { |
| "epoch": 29.154518950437318, |
| "grad_norm": 0.4413515627384186, |
| "learning_rate": 0.0002502952158693115, |
| "loss": 3.1052, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.154518950437318, |
| "eval_accuracy": 0.3735989181227622, |
| "eval_loss": 3.555105686187744, |
| "eval_runtime": 182.4652, |
| "eval_samples_per_second": 91.108, |
| "eval_steps_per_second": 5.694, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.169096209912535, |
| "grad_norm": 0.46616682410240173, |
| "learning_rate": 0.00025012018669778294, |
| "loss": 3.11, |
| "step": 100050 |
| }, |
| { |
| "epoch": 29.183673469387756, |
| "grad_norm": 0.42780494689941406, |
| "learning_rate": 0.00024994515752625436, |
| "loss": 3.108, |
| "step": 100100 |
| }, |
| { |
| "epoch": 29.198250728862973, |
| "grad_norm": 0.4434570074081421, |
| "learning_rate": 0.0002497701283547258, |
| "loss": 3.1273, |
| "step": 100150 |
| }, |
| { |
| "epoch": 29.21282798833819, |
| "grad_norm": 0.45673868060112, |
| "learning_rate": 0.00024959509918319716, |
| "loss": 3.1163, |
| "step": 100200 |
| }, |
| { |
| "epoch": 29.22740524781341, |
| "grad_norm": 0.43610596656799316, |
| "learning_rate": 0.0002494200700116686, |
| "loss": 3.1242, |
| "step": 100250 |
| }, |
| { |
| "epoch": 29.24198250728863, |
| "grad_norm": 0.46187329292297363, |
| "learning_rate": 0.00024924504084014, |
| "loss": 3.1222, |
| "step": 100300 |
| }, |
| { |
| "epoch": 29.25655976676385, |
| "grad_norm": 0.4247560501098633, |
| "learning_rate": 0.0002490700116686114, |
| "loss": 3.1194, |
| "step": 100350 |
| }, |
| { |
| "epoch": 29.271137026239067, |
| "grad_norm": 0.4311138391494751, |
| "learning_rate": 0.0002488949824970828, |
| "loss": 3.1222, |
| "step": 100400 |
| }, |
| { |
| "epoch": 29.285714285714285, |
| "grad_norm": 0.4713177978992462, |
| "learning_rate": 0.00024871995332555424, |
| "loss": 3.1259, |
| "step": 100450 |
| }, |
| { |
| "epoch": 29.300291545189506, |
| "grad_norm": 0.48516809940338135, |
| "learning_rate": 0.00024854492415402567, |
| "loss": 3.144, |
| "step": 100500 |
| }, |
| { |
| "epoch": 29.314868804664723, |
| "grad_norm": 0.4494832754135132, |
| "learning_rate": 0.0002483698949824971, |
| "loss": 3.1357, |
| "step": 100550 |
| }, |
| { |
| "epoch": 29.32944606413994, |
| "grad_norm": 0.4127950966358185, |
| "learning_rate": 0.00024819486581096847, |
| "loss": 3.1354, |
| "step": 100600 |
| }, |
| { |
| "epoch": 29.34402332361516, |
| "grad_norm": 0.4290008544921875, |
| "learning_rate": 0.0002480198366394399, |
| "loss": 3.1288, |
| "step": 100650 |
| }, |
| { |
| "epoch": 29.35860058309038, |
| "grad_norm": 0.4509317874908447, |
| "learning_rate": 0.00024784480746791127, |
| "loss": 3.1309, |
| "step": 100700 |
| }, |
| { |
| "epoch": 29.373177842565596, |
| "grad_norm": 0.45486029982566833, |
| "learning_rate": 0.0002476697782963827, |
| "loss": 3.1393, |
| "step": 100750 |
| }, |
| { |
| "epoch": 29.387755102040817, |
| "grad_norm": 0.42254355549812317, |
| "learning_rate": 0.0002474947491248541, |
| "loss": 3.1312, |
| "step": 100800 |
| }, |
| { |
| "epoch": 29.402332361516034, |
| "grad_norm": 0.42709168791770935, |
| "learning_rate": 0.00024731971995332555, |
| "loss": 3.1449, |
| "step": 100850 |
| }, |
| { |
| "epoch": 29.416909620991255, |
| "grad_norm": 0.4207426905632019, |
| "learning_rate": 0.0002471446907817969, |
| "loss": 3.1467, |
| "step": 100900 |
| }, |
| { |
| "epoch": 29.431486880466473, |
| "grad_norm": 0.46891316771507263, |
| "learning_rate": 0.00024696966161026835, |
| "loss": 3.1429, |
| "step": 100950 |
| }, |
| { |
| "epoch": 29.44606413994169, |
| "grad_norm": 0.449980229139328, |
| "learning_rate": 0.0002467946324387398, |
| "loss": 3.1293, |
| "step": 101000 |
| }, |
| { |
| "epoch": 29.44606413994169, |
| "eval_accuracy": 0.37401199124553375, |
| "eval_loss": 3.549697160720825, |
| "eval_runtime": 182.3939, |
| "eval_samples_per_second": 91.143, |
| "eval_steps_per_second": 5.696, |
| "step": 101000 |
| }, |
| { |
| "epoch": 29.46064139941691, |
| "grad_norm": 0.4162759780883789, |
| "learning_rate": 0.0002466196032672112, |
| "loss": 3.1306, |
| "step": 101050 |
| }, |
| { |
| "epoch": 29.47521865889213, |
| "grad_norm": 0.44192177057266235, |
| "learning_rate": 0.0002464445740956826, |
| "loss": 3.1477, |
| "step": 101100 |
| }, |
| { |
| "epoch": 29.489795918367346, |
| "grad_norm": 0.44102761149406433, |
| "learning_rate": 0.000246269544924154, |
| "loss": 3.144, |
| "step": 101150 |
| }, |
| { |
| "epoch": 29.504373177842567, |
| "grad_norm": 0.4400504231452942, |
| "learning_rate": 0.00024609451575262543, |
| "loss": 3.1505, |
| "step": 101200 |
| }, |
| { |
| "epoch": 29.518950437317784, |
| "grad_norm": 0.46671465039253235, |
| "learning_rate": 0.0002459194865810968, |
| "loss": 3.1379, |
| "step": 101250 |
| }, |
| { |
| "epoch": 29.533527696793, |
| "grad_norm": 0.427625447511673, |
| "learning_rate": 0.0002457444574095682, |
| "loss": 3.1468, |
| "step": 101300 |
| }, |
| { |
| "epoch": 29.548104956268222, |
| "grad_norm": 0.458279550075531, |
| "learning_rate": 0.00024556942823803965, |
| "loss": 3.1479, |
| "step": 101350 |
| }, |
| { |
| "epoch": 29.56268221574344, |
| "grad_norm": 0.4169410169124603, |
| "learning_rate": 0.0002453943990665111, |
| "loss": 3.1595, |
| "step": 101400 |
| }, |
| { |
| "epoch": 29.57725947521866, |
| "grad_norm": 0.4758301079273224, |
| "learning_rate": 0.0002452193698949825, |
| "loss": 3.145, |
| "step": 101450 |
| }, |
| { |
| "epoch": 29.591836734693878, |
| "grad_norm": 0.4280540347099304, |
| "learning_rate": 0.0002450443407234539, |
| "loss": 3.1523, |
| "step": 101500 |
| }, |
| { |
| "epoch": 29.606413994169095, |
| "grad_norm": 0.43501847982406616, |
| "learning_rate": 0.0002448693115519253, |
| "loss": 3.1499, |
| "step": 101550 |
| }, |
| { |
| "epoch": 29.620991253644316, |
| "grad_norm": 0.43182411789894104, |
| "learning_rate": 0.0002446942823803967, |
| "loss": 3.1456, |
| "step": 101600 |
| }, |
| { |
| "epoch": 29.635568513119534, |
| "grad_norm": 0.4231785237789154, |
| "learning_rate": 0.0002445192532088681, |
| "loss": 3.1622, |
| "step": 101650 |
| }, |
| { |
| "epoch": 29.65014577259475, |
| "grad_norm": 0.4608800709247589, |
| "learning_rate": 0.00024434422403733953, |
| "loss": 3.1573, |
| "step": 101700 |
| }, |
| { |
| "epoch": 29.664723032069972, |
| "grad_norm": 0.45372170209884644, |
| "learning_rate": 0.00024416919486581096, |
| "loss": 3.1656, |
| "step": 101750 |
| }, |
| { |
| "epoch": 29.67930029154519, |
| "grad_norm": 0.43550124764442444, |
| "learning_rate": 0.00024399416569428236, |
| "loss": 3.1573, |
| "step": 101800 |
| }, |
| { |
| "epoch": 29.693877551020407, |
| "grad_norm": 0.4416351318359375, |
| "learning_rate": 0.00024381913652275376, |
| "loss": 3.1695, |
| "step": 101850 |
| }, |
| { |
| "epoch": 29.708454810495628, |
| "grad_norm": 0.4657592177391052, |
| "learning_rate": 0.00024364410735122519, |
| "loss": 3.1636, |
| "step": 101900 |
| }, |
| { |
| "epoch": 29.723032069970845, |
| "grad_norm": 0.46444663405418396, |
| "learning_rate": 0.00024346907817969659, |
| "loss": 3.17, |
| "step": 101950 |
| }, |
| { |
| "epoch": 29.737609329446066, |
| "grad_norm": 0.45541027188301086, |
| "learning_rate": 0.000243294049008168, |
| "loss": 3.1532, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.737609329446066, |
| "eval_accuracy": 0.3743919855573909, |
| "eval_loss": 3.5395925045013428, |
| "eval_runtime": 182.8413, |
| "eval_samples_per_second": 90.92, |
| "eval_steps_per_second": 5.683, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.752186588921283, |
| "grad_norm": 0.4434316158294678, |
| "learning_rate": 0.0002431190198366394, |
| "loss": 3.1644, |
| "step": 102050 |
| }, |
| { |
| "epoch": 29.7667638483965, |
| "grad_norm": 0.42412641644477844, |
| "learning_rate": 0.00024294399066511084, |
| "loss": 3.1669, |
| "step": 102100 |
| }, |
| { |
| "epoch": 29.78134110787172, |
| "grad_norm": 0.45610010623931885, |
| "learning_rate": 0.00024276896149358224, |
| "loss": 3.1668, |
| "step": 102150 |
| }, |
| { |
| "epoch": 29.79591836734694, |
| "grad_norm": 0.4540196359157562, |
| "learning_rate": 0.00024259393232205367, |
| "loss": 3.1735, |
| "step": 102200 |
| }, |
| { |
| "epoch": 29.810495626822156, |
| "grad_norm": 0.4640849530696869, |
| "learning_rate": 0.0002424189031505251, |
| "loss": 3.1642, |
| "step": 102250 |
| }, |
| { |
| "epoch": 29.825072886297377, |
| "grad_norm": 0.43845105171203613, |
| "learning_rate": 0.00024224387397899647, |
| "loss": 3.1592, |
| "step": 102300 |
| }, |
| { |
| "epoch": 29.839650145772595, |
| "grad_norm": 0.44450509548187256, |
| "learning_rate": 0.0002420688448074679, |
| "loss": 3.1668, |
| "step": 102350 |
| }, |
| { |
| "epoch": 29.854227405247812, |
| "grad_norm": 0.4392869472503662, |
| "learning_rate": 0.0002418938156359393, |
| "loss": 3.1691, |
| "step": 102400 |
| }, |
| { |
| "epoch": 29.868804664723033, |
| "grad_norm": 0.4368334412574768, |
| "learning_rate": 0.00024171878646441072, |
| "loss": 3.1672, |
| "step": 102450 |
| }, |
| { |
| "epoch": 29.88338192419825, |
| "grad_norm": 0.4234115779399872, |
| "learning_rate": 0.00024154375729288212, |
| "loss": 3.159, |
| "step": 102500 |
| }, |
| { |
| "epoch": 29.897959183673468, |
| "grad_norm": 0.43562474846839905, |
| "learning_rate": 0.00024136872812135354, |
| "loss": 3.1774, |
| "step": 102550 |
| }, |
| { |
| "epoch": 29.91253644314869, |
| "grad_norm": 0.48254790902137756, |
| "learning_rate": 0.00024119369894982497, |
| "loss": 3.1659, |
| "step": 102600 |
| }, |
| { |
| "epoch": 29.927113702623906, |
| "grad_norm": 0.4869457185268402, |
| "learning_rate": 0.00024101866977829637, |
| "loss": 3.1699, |
| "step": 102650 |
| }, |
| { |
| "epoch": 29.941690962099127, |
| "grad_norm": 0.44558846950531006, |
| "learning_rate": 0.0002408436406067678, |
| "loss": 3.1649, |
| "step": 102700 |
| }, |
| { |
| "epoch": 29.956268221574344, |
| "grad_norm": 0.4565999209880829, |
| "learning_rate": 0.00024066861143523917, |
| "loss": 3.1667, |
| "step": 102750 |
| }, |
| { |
| "epoch": 29.97084548104956, |
| "grad_norm": 0.4984128475189209, |
| "learning_rate": 0.0002404935822637106, |
| "loss": 3.1783, |
| "step": 102800 |
| }, |
| { |
| "epoch": 29.985422740524783, |
| "grad_norm": 0.430683970451355, |
| "learning_rate": 0.000240318553092182, |
| "loss": 3.167, |
| "step": 102850 |
| }, |
| { |
| "epoch": 30.0, |
| "grad_norm": 0.47676679491996765, |
| "learning_rate": 0.00024014352392065342, |
| "loss": 3.1735, |
| "step": 102900 |
| }, |
| { |
| "epoch": 30.014577259475217, |
| "grad_norm": 0.4585511386394501, |
| "learning_rate": 0.00023996849474912482, |
| "loss": 3.0723, |
| "step": 102950 |
| }, |
| { |
| "epoch": 30.02915451895044, |
| "grad_norm": 0.44532549381256104, |
| "learning_rate": 0.00023979346557759625, |
| "loss": 3.0842, |
| "step": 103000 |
| }, |
| { |
| "epoch": 30.02915451895044, |
| "eval_accuracy": 0.3739382996596532, |
| "eval_loss": 3.555943250656128, |
| "eval_runtime": 182.3297, |
| "eval_samples_per_second": 91.176, |
| "eval_steps_per_second": 5.698, |
| "step": 103000 |
| }, |
| { |
| "epoch": 30.043731778425656, |
| "grad_norm": 0.4594990909099579, |
| "learning_rate": 0.00023961843640606768, |
| "loss": 3.0802, |
| "step": 103050 |
| }, |
| { |
| "epoch": 30.058309037900873, |
| "grad_norm": 0.46604761481285095, |
| "learning_rate": 0.00023944340723453908, |
| "loss": 3.0899, |
| "step": 103100 |
| }, |
| { |
| "epoch": 30.072886297376094, |
| "grad_norm": 0.4697359502315521, |
| "learning_rate": 0.0002392683780630105, |
| "loss": 3.0908, |
| "step": 103150 |
| }, |
| { |
| "epoch": 30.08746355685131, |
| "grad_norm": 0.43642377853393555, |
| "learning_rate": 0.00023909334889148188, |
| "loss": 3.0955, |
| "step": 103200 |
| }, |
| { |
| "epoch": 30.102040816326532, |
| "grad_norm": 0.47640782594680786, |
| "learning_rate": 0.0002389183197199533, |
| "loss": 3.0973, |
| "step": 103250 |
| }, |
| { |
| "epoch": 30.11661807580175, |
| "grad_norm": 0.41699403524398804, |
| "learning_rate": 0.0002387432905484247, |
| "loss": 3.101, |
| "step": 103300 |
| }, |
| { |
| "epoch": 30.131195335276967, |
| "grad_norm": 0.4841386079788208, |
| "learning_rate": 0.00023856826137689613, |
| "loss": 3.1155, |
| "step": 103350 |
| }, |
| { |
| "epoch": 30.145772594752188, |
| "grad_norm": 0.47871625423431396, |
| "learning_rate": 0.00023839323220536756, |
| "loss": 3.101, |
| "step": 103400 |
| }, |
| { |
| "epoch": 30.160349854227405, |
| "grad_norm": 0.4356785714626312, |
| "learning_rate": 0.00023821820303383896, |
| "loss": 3.1043, |
| "step": 103450 |
| }, |
| { |
| "epoch": 30.174927113702623, |
| "grad_norm": 0.5174174308776855, |
| "learning_rate": 0.00023804317386231038, |
| "loss": 3.1073, |
| "step": 103500 |
| }, |
| { |
| "epoch": 30.189504373177844, |
| "grad_norm": 0.44571802020072937, |
| "learning_rate": 0.00023786814469078178, |
| "loss": 3.1039, |
| "step": 103550 |
| }, |
| { |
| "epoch": 30.20408163265306, |
| "grad_norm": 0.45123785734176636, |
| "learning_rate": 0.0002376931155192532, |
| "loss": 3.1133, |
| "step": 103600 |
| }, |
| { |
| "epoch": 30.21865889212828, |
| "grad_norm": 0.4339126646518707, |
| "learning_rate": 0.00023751808634772458, |
| "loss": 3.1271, |
| "step": 103650 |
| }, |
| { |
| "epoch": 30.2332361516035, |
| "grad_norm": 0.4566538631916046, |
| "learning_rate": 0.000237343057176196, |
| "loss": 3.1273, |
| "step": 103700 |
| }, |
| { |
| "epoch": 30.247813411078717, |
| "grad_norm": 0.5019856691360474, |
| "learning_rate": 0.0002371680280046674, |
| "loss": 3.1091, |
| "step": 103750 |
| }, |
| { |
| "epoch": 30.262390670553934, |
| "grad_norm": 0.4492882490158081, |
| "learning_rate": 0.00023699299883313884, |
| "loss": 3.0973, |
| "step": 103800 |
| }, |
| { |
| "epoch": 30.276967930029155, |
| "grad_norm": 0.44337910413742065, |
| "learning_rate": 0.00023681796966161026, |
| "loss": 3.1213, |
| "step": 103850 |
| }, |
| { |
| "epoch": 30.291545189504372, |
| "grad_norm": 0.4558570683002472, |
| "learning_rate": 0.00023664294049008166, |
| "loss": 3.1266, |
| "step": 103900 |
| }, |
| { |
| "epoch": 30.306122448979593, |
| "grad_norm": 0.4441608786582947, |
| "learning_rate": 0.0002364679113185531, |
| "loss": 3.1165, |
| "step": 103950 |
| }, |
| { |
| "epoch": 30.32069970845481, |
| "grad_norm": 0.4322434365749359, |
| "learning_rate": 0.0002362928821470245, |
| "loss": 3.1202, |
| "step": 104000 |
| }, |
| { |
| "epoch": 30.32069970845481, |
| "eval_accuracy": 0.37388403157484335, |
| "eval_loss": 3.5529487133026123, |
| "eval_runtime": 182.4101, |
| "eval_samples_per_second": 91.135, |
| "eval_steps_per_second": 5.696, |
| "step": 104000 |
| }, |
| { |
| "epoch": 30.335276967930028, |
| "grad_norm": 0.4699609875679016, |
| "learning_rate": 0.00023611785297549592, |
| "loss": 3.1181, |
| "step": 104050 |
| }, |
| { |
| "epoch": 30.34985422740525, |
| "grad_norm": 0.4313887655735016, |
| "learning_rate": 0.0002359428238039673, |
| "loss": 3.1199, |
| "step": 104100 |
| }, |
| { |
| "epoch": 30.364431486880466, |
| "grad_norm": 0.4372082054615021, |
| "learning_rate": 0.00023576779463243872, |
| "loss": 3.1379, |
| "step": 104150 |
| }, |
| { |
| "epoch": 30.379008746355684, |
| "grad_norm": 0.48427414894104004, |
| "learning_rate": 0.00023559276546091011, |
| "loss": 3.1321, |
| "step": 104200 |
| }, |
| { |
| "epoch": 30.393586005830905, |
| "grad_norm": 0.47731345891952515, |
| "learning_rate": 0.00023541773628938154, |
| "loss": 3.138, |
| "step": 104250 |
| }, |
| { |
| "epoch": 30.408163265306122, |
| "grad_norm": 0.44795680046081543, |
| "learning_rate": 0.00023524270711785297, |
| "loss": 3.1338, |
| "step": 104300 |
| }, |
| { |
| "epoch": 30.42274052478134, |
| "grad_norm": 0.4745403230190277, |
| "learning_rate": 0.00023506767794632437, |
| "loss": 3.1312, |
| "step": 104350 |
| }, |
| { |
| "epoch": 30.43731778425656, |
| "grad_norm": 0.4556351602077484, |
| "learning_rate": 0.0002348926487747958, |
| "loss": 3.1254, |
| "step": 104400 |
| }, |
| { |
| "epoch": 30.451895043731778, |
| "grad_norm": 0.4545431435108185, |
| "learning_rate": 0.0002347176196032672, |
| "loss": 3.1416, |
| "step": 104450 |
| }, |
| { |
| "epoch": 30.466472303207, |
| "grad_norm": 0.4592565596103668, |
| "learning_rate": 0.00023454259043173862, |
| "loss": 3.1285, |
| "step": 104500 |
| }, |
| { |
| "epoch": 30.481049562682216, |
| "grad_norm": 0.456776887178421, |
| "learning_rate": 0.00023436756126021, |
| "loss": 3.1428, |
| "step": 104550 |
| }, |
| { |
| "epoch": 30.495626822157433, |
| "grad_norm": 0.4269309639930725, |
| "learning_rate": 0.00023419253208868142, |
| "loss": 3.1424, |
| "step": 104600 |
| }, |
| { |
| "epoch": 30.510204081632654, |
| "grad_norm": 0.436102032661438, |
| "learning_rate": 0.00023401750291715285, |
| "loss": 3.1285, |
| "step": 104650 |
| }, |
| { |
| "epoch": 30.52478134110787, |
| "grad_norm": 0.43040576577186584, |
| "learning_rate": 0.00023384247374562425, |
| "loss": 3.1361, |
| "step": 104700 |
| }, |
| { |
| "epoch": 30.53935860058309, |
| "grad_norm": 0.48119938373565674, |
| "learning_rate": 0.00023366744457409567, |
| "loss": 3.1409, |
| "step": 104750 |
| }, |
| { |
| "epoch": 30.55393586005831, |
| "grad_norm": 0.4720640778541565, |
| "learning_rate": 0.00023349241540256707, |
| "loss": 3.1386, |
| "step": 104800 |
| }, |
| { |
| "epoch": 30.568513119533527, |
| "grad_norm": 0.45356231927871704, |
| "learning_rate": 0.0002333173862310385, |
| "loss": 3.1327, |
| "step": 104850 |
| }, |
| { |
| "epoch": 30.583090379008745, |
| "grad_norm": 0.4686661958694458, |
| "learning_rate": 0.0002331423570595099, |
| "loss": 3.1445, |
| "step": 104900 |
| }, |
| { |
| "epoch": 30.597667638483966, |
| "grad_norm": 0.44459545612335205, |
| "learning_rate": 0.00023296732788798133, |
| "loss": 3.1407, |
| "step": 104950 |
| }, |
| { |
| "epoch": 30.612244897959183, |
| "grad_norm": 0.45365190505981445, |
| "learning_rate": 0.0002327922987164527, |
| "loss": 3.1488, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.612244897959183, |
| "eval_accuracy": 0.37435149050061306, |
| "eval_loss": 3.5447380542755127, |
| "eval_runtime": 182.8207, |
| "eval_samples_per_second": 90.931, |
| "eval_steps_per_second": 5.683, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.626822157434404, |
| "grad_norm": 0.4460262656211853, |
| "learning_rate": 0.00023261726954492413, |
| "loss": 3.1396, |
| "step": 105050 |
| }, |
| { |
| "epoch": 30.64139941690962, |
| "grad_norm": 0.45241525769233704, |
| "learning_rate": 0.00023244224037339555, |
| "loss": 3.1427, |
| "step": 105100 |
| }, |
| { |
| "epoch": 30.65597667638484, |
| "grad_norm": 0.44525688886642456, |
| "learning_rate": 0.00023226721120186695, |
| "loss": 3.1477, |
| "step": 105150 |
| }, |
| { |
| "epoch": 30.67055393586006, |
| "grad_norm": 0.4357472360134125, |
| "learning_rate": 0.00023209218203033838, |
| "loss": 3.1524, |
| "step": 105200 |
| }, |
| { |
| "epoch": 30.685131195335277, |
| "grad_norm": 0.46180203557014465, |
| "learning_rate": 0.00023191715285880978, |
| "loss": 3.1406, |
| "step": 105250 |
| }, |
| { |
| "epoch": 30.699708454810494, |
| "grad_norm": 0.43893539905548096, |
| "learning_rate": 0.0002317421236872812, |
| "loss": 3.1644, |
| "step": 105300 |
| }, |
| { |
| "epoch": 30.714285714285715, |
| "grad_norm": 0.45540228486061096, |
| "learning_rate": 0.0002315670945157526, |
| "loss": 3.1574, |
| "step": 105350 |
| }, |
| { |
| "epoch": 30.728862973760933, |
| "grad_norm": 0.4692342281341553, |
| "learning_rate": 0.00023139206534422403, |
| "loss": 3.1624, |
| "step": 105400 |
| }, |
| { |
| "epoch": 30.74344023323615, |
| "grad_norm": 0.4671376049518585, |
| "learning_rate": 0.0002312170361726954, |
| "loss": 3.1451, |
| "step": 105450 |
| }, |
| { |
| "epoch": 30.75801749271137, |
| "grad_norm": 0.48794859647750854, |
| "learning_rate": 0.00023104200700116683, |
| "loss": 3.1566, |
| "step": 105500 |
| }, |
| { |
| "epoch": 30.77259475218659, |
| "grad_norm": 0.4582727551460266, |
| "learning_rate": 0.00023086697782963826, |
| "loss": 3.1436, |
| "step": 105550 |
| }, |
| { |
| "epoch": 30.78717201166181, |
| "grad_norm": 0.5062020421028137, |
| "learning_rate": 0.00023069194865810966, |
| "loss": 3.1498, |
| "step": 105600 |
| }, |
| { |
| "epoch": 30.801749271137027, |
| "grad_norm": 0.4618031978607178, |
| "learning_rate": 0.00023051691948658109, |
| "loss": 3.1509, |
| "step": 105650 |
| }, |
| { |
| "epoch": 30.816326530612244, |
| "grad_norm": 0.4281717538833618, |
| "learning_rate": 0.00023034189031505249, |
| "loss": 3.1481, |
| "step": 105700 |
| }, |
| { |
| "epoch": 30.830903790087465, |
| "grad_norm": 0.45748892426490784, |
| "learning_rate": 0.0002301668611435239, |
| "loss": 3.1511, |
| "step": 105750 |
| }, |
| { |
| "epoch": 30.845481049562682, |
| "grad_norm": 0.4601440131664276, |
| "learning_rate": 0.0002299918319719953, |
| "loss": 3.1632, |
| "step": 105800 |
| }, |
| { |
| "epoch": 30.8600583090379, |
| "grad_norm": 0.45072951912879944, |
| "learning_rate": 0.00022981680280046674, |
| "loss": 3.1635, |
| "step": 105850 |
| }, |
| { |
| "epoch": 30.87463556851312, |
| "grad_norm": 0.45450711250305176, |
| "learning_rate": 0.00022964177362893817, |
| "loss": 3.1546, |
| "step": 105900 |
| }, |
| { |
| "epoch": 30.889212827988338, |
| "grad_norm": 0.4641033113002777, |
| "learning_rate": 0.00022946674445740954, |
| "loss": 3.1569, |
| "step": 105950 |
| }, |
| { |
| "epoch": 30.903790087463555, |
| "grad_norm": 0.4180297553539276, |
| "learning_rate": 0.00022929171528588097, |
| "loss": 3.1591, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.903790087463555, |
| "eval_accuracy": 0.3749691578346634, |
| "eval_loss": 3.5384469032287598, |
| "eval_runtime": 182.6987, |
| "eval_samples_per_second": 90.991, |
| "eval_steps_per_second": 5.687, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.918367346938776, |
| "grad_norm": 0.4221542179584503, |
| "learning_rate": 0.00022911668611435236, |
| "loss": 3.1583, |
| "step": 106050 |
| }, |
| { |
| "epoch": 30.932944606413994, |
| "grad_norm": 0.47019174695014954, |
| "learning_rate": 0.0002289416569428238, |
| "loss": 3.1603, |
| "step": 106100 |
| }, |
| { |
| "epoch": 30.947521865889215, |
| "grad_norm": 0.43644431233406067, |
| "learning_rate": 0.0002287666277712952, |
| "loss": 3.1593, |
| "step": 106150 |
| }, |
| { |
| "epoch": 30.962099125364432, |
| "grad_norm": 0.4441497027873993, |
| "learning_rate": 0.00022859159859976662, |
| "loss": 3.1701, |
| "step": 106200 |
| }, |
| { |
| "epoch": 30.97667638483965, |
| "grad_norm": 0.43236950039863586, |
| "learning_rate": 0.00022841656942823802, |
| "loss": 3.1581, |
| "step": 106250 |
| }, |
| { |
| "epoch": 30.99125364431487, |
| "grad_norm": 0.4527025520801544, |
| "learning_rate": 0.00022824154025670944, |
| "loss": 3.1718, |
| "step": 106300 |
| }, |
| { |
| "epoch": 31.005830903790088, |
| "grad_norm": 0.4435344934463501, |
| "learning_rate": 0.00022806651108518087, |
| "loss": 3.1195, |
| "step": 106350 |
| }, |
| { |
| "epoch": 31.020408163265305, |
| "grad_norm": 0.4386453926563263, |
| "learning_rate": 0.00022789148191365224, |
| "loss": 3.072, |
| "step": 106400 |
| }, |
| { |
| "epoch": 31.034985422740526, |
| "grad_norm": 0.45795875787734985, |
| "learning_rate": 0.00022771645274212367, |
| "loss": 3.0613, |
| "step": 106450 |
| }, |
| { |
| "epoch": 31.049562682215743, |
| "grad_norm": 0.424356609582901, |
| "learning_rate": 0.00022754142357059507, |
| "loss": 3.0828, |
| "step": 106500 |
| }, |
| { |
| "epoch": 31.06413994169096, |
| "grad_norm": 0.44901180267333984, |
| "learning_rate": 0.0002273663943990665, |
| "loss": 3.0799, |
| "step": 106550 |
| }, |
| { |
| "epoch": 31.07871720116618, |
| "grad_norm": 0.4648335576057434, |
| "learning_rate": 0.0002271913652275379, |
| "loss": 3.0804, |
| "step": 106600 |
| }, |
| { |
| "epoch": 31.0932944606414, |
| "grad_norm": 0.4268786311149597, |
| "learning_rate": 0.00022701633605600932, |
| "loss": 3.0826, |
| "step": 106650 |
| }, |
| { |
| "epoch": 31.107871720116616, |
| "grad_norm": 0.5035303831100464, |
| "learning_rate": 0.00022684130688448072, |
| "loss": 3.0941, |
| "step": 106700 |
| }, |
| { |
| "epoch": 31.122448979591837, |
| "grad_norm": 0.46598684787750244, |
| "learning_rate": 0.00022666627771295215, |
| "loss": 3.0922, |
| "step": 106750 |
| }, |
| { |
| "epoch": 31.137026239067055, |
| "grad_norm": 0.4177006483078003, |
| "learning_rate": 0.00022649124854142358, |
| "loss": 3.1001, |
| "step": 106800 |
| }, |
| { |
| "epoch": 31.151603498542276, |
| "grad_norm": 0.44359204173088074, |
| "learning_rate": 0.00022631621936989495, |
| "loss": 3.0948, |
| "step": 106850 |
| }, |
| { |
| "epoch": 31.166180758017493, |
| "grad_norm": 0.4663641154766083, |
| "learning_rate": 0.00022614119019836638, |
| "loss": 3.1159, |
| "step": 106900 |
| }, |
| { |
| "epoch": 31.18075801749271, |
| "grad_norm": 0.43660664558410645, |
| "learning_rate": 0.00022596616102683778, |
| "loss": 3.0966, |
| "step": 106950 |
| }, |
| { |
| "epoch": 31.19533527696793, |
| "grad_norm": 0.45154282450675964, |
| "learning_rate": 0.0002257911318553092, |
| "loss": 3.1026, |
| "step": 107000 |
| }, |
| { |
| "epoch": 31.19533527696793, |
| "eval_accuracy": 0.3739278227408938, |
| "eval_loss": 3.558981418609619, |
| "eval_runtime": 182.4643, |
| "eval_samples_per_second": 91.108, |
| "eval_steps_per_second": 5.694, |
| "step": 107000 |
| }, |
| { |
| "epoch": 31.20991253644315, |
| "grad_norm": 0.4914431571960449, |
| "learning_rate": 0.0002256161026837806, |
| "loss": 3.1109, |
| "step": 107050 |
| }, |
| { |
| "epoch": 31.224489795918366, |
| "grad_norm": 0.4672841727733612, |
| "learning_rate": 0.00022544107351225203, |
| "loss": 3.0984, |
| "step": 107100 |
| }, |
| { |
| "epoch": 31.239067055393587, |
| "grad_norm": 0.46421849727630615, |
| "learning_rate": 0.00022526604434072346, |
| "loss": 3.1086, |
| "step": 107150 |
| }, |
| { |
| "epoch": 31.253644314868804, |
| "grad_norm": 0.4598676860332489, |
| "learning_rate": 0.00022509101516919486, |
| "loss": 3.0982, |
| "step": 107200 |
| }, |
| { |
| "epoch": 31.26822157434402, |
| "grad_norm": 0.4586013853549957, |
| "learning_rate": 0.00022491598599766628, |
| "loss": 3.0929, |
| "step": 107250 |
| }, |
| { |
| "epoch": 31.282798833819243, |
| "grad_norm": 0.44894787669181824, |
| "learning_rate": 0.00022474095682613766, |
| "loss": 3.1093, |
| "step": 107300 |
| }, |
| { |
| "epoch": 31.29737609329446, |
| "grad_norm": 0.43039682507514954, |
| "learning_rate": 0.00022456592765460908, |
| "loss": 3.1057, |
| "step": 107350 |
| }, |
| { |
| "epoch": 31.31195335276968, |
| "grad_norm": 0.45393604040145874, |
| "learning_rate": 0.00022439089848308048, |
| "loss": 3.1132, |
| "step": 107400 |
| }, |
| { |
| "epoch": 31.3265306122449, |
| "grad_norm": 0.5165916681289673, |
| "learning_rate": 0.0002242158693115519, |
| "loss": 3.1134, |
| "step": 107450 |
| }, |
| { |
| "epoch": 31.341107871720116, |
| "grad_norm": 0.4439025819301605, |
| "learning_rate": 0.0002240408401400233, |
| "loss": 3.126, |
| "step": 107500 |
| }, |
| { |
| "epoch": 31.355685131195337, |
| "grad_norm": 0.4372089207172394, |
| "learning_rate": 0.00022386581096849474, |
| "loss": 3.1231, |
| "step": 107550 |
| }, |
| { |
| "epoch": 31.370262390670554, |
| "grad_norm": 0.4908539056777954, |
| "learning_rate": 0.00022369078179696616, |
| "loss": 3.113, |
| "step": 107600 |
| }, |
| { |
| "epoch": 31.38483965014577, |
| "grad_norm": 0.47500312328338623, |
| "learning_rate": 0.00022351575262543756, |
| "loss": 3.1198, |
| "step": 107650 |
| }, |
| { |
| "epoch": 31.399416909620992, |
| "grad_norm": 0.49036237597465515, |
| "learning_rate": 0.000223340723453909, |
| "loss": 3.1199, |
| "step": 107700 |
| }, |
| { |
| "epoch": 31.41399416909621, |
| "grad_norm": 0.4451589286327362, |
| "learning_rate": 0.00022316569428238036, |
| "loss": 3.1185, |
| "step": 107750 |
| }, |
| { |
| "epoch": 31.428571428571427, |
| "grad_norm": 0.4553769528865814, |
| "learning_rate": 0.0002229906651108518, |
| "loss": 3.126, |
| "step": 107800 |
| }, |
| { |
| "epoch": 31.443148688046648, |
| "grad_norm": 0.43727388978004456, |
| "learning_rate": 0.0002228156359393232, |
| "loss": 3.1303, |
| "step": 107850 |
| }, |
| { |
| "epoch": 31.457725947521865, |
| "grad_norm": 0.46281135082244873, |
| "learning_rate": 0.00022264060676779461, |
| "loss": 3.1318, |
| "step": 107900 |
| }, |
| { |
| "epoch": 31.472303206997086, |
| "grad_norm": 0.48131096363067627, |
| "learning_rate": 0.00022246557759626601, |
| "loss": 3.1295, |
| "step": 107950 |
| }, |
| { |
| "epoch": 31.486880466472304, |
| "grad_norm": 0.4911995828151703, |
| "learning_rate": 0.00022229054842473744, |
| "loss": 3.1355, |
| "step": 108000 |
| }, |
| { |
| "epoch": 31.486880466472304, |
| "eval_accuracy": 0.3740834461858365, |
| "eval_loss": 3.5502305030822754, |
| "eval_runtime": 182.3882, |
| "eval_samples_per_second": 91.146, |
| "eval_steps_per_second": 5.697, |
| "step": 108000 |
| }, |
| { |
| "epoch": 31.50145772594752, |
| "grad_norm": 0.508841335773468, |
| "learning_rate": 0.00022211551925320887, |
| "loss": 3.121, |
| "step": 108050 |
| }, |
| { |
| "epoch": 31.516034985422742, |
| "grad_norm": 0.488435834646225, |
| "learning_rate": 0.00022194049008168027, |
| "loss": 3.1243, |
| "step": 108100 |
| }, |
| { |
| "epoch": 31.53061224489796, |
| "grad_norm": 0.46807485818862915, |
| "learning_rate": 0.0002217654609101517, |
| "loss": 3.1247, |
| "step": 108150 |
| }, |
| { |
| "epoch": 31.545189504373177, |
| "grad_norm": 0.4565454125404358, |
| "learning_rate": 0.00022159043173862307, |
| "loss": 3.1369, |
| "step": 108200 |
| }, |
| { |
| "epoch": 31.559766763848398, |
| "grad_norm": 0.47071194648742676, |
| "learning_rate": 0.0002214154025670945, |
| "loss": 3.1403, |
| "step": 108250 |
| }, |
| { |
| "epoch": 31.574344023323615, |
| "grad_norm": 0.4571170210838318, |
| "learning_rate": 0.0002212403733955659, |
| "loss": 3.1373, |
| "step": 108300 |
| }, |
| { |
| "epoch": 31.588921282798832, |
| "grad_norm": 0.46521154046058655, |
| "learning_rate": 0.00022106534422403732, |
| "loss": 3.1306, |
| "step": 108350 |
| }, |
| { |
| "epoch": 31.603498542274053, |
| "grad_norm": 0.46532467007637024, |
| "learning_rate": 0.00022089031505250875, |
| "loss": 3.1411, |
| "step": 108400 |
| }, |
| { |
| "epoch": 31.61807580174927, |
| "grad_norm": 0.46492308378219604, |
| "learning_rate": 0.00022071528588098015, |
| "loss": 3.1392, |
| "step": 108450 |
| }, |
| { |
| "epoch": 31.632653061224488, |
| "grad_norm": 0.4574034810066223, |
| "learning_rate": 0.00022054025670945157, |
| "loss": 3.1387, |
| "step": 108500 |
| }, |
| { |
| "epoch": 31.64723032069971, |
| "grad_norm": 0.4768873453140259, |
| "learning_rate": 0.00022036522753792297, |
| "loss": 3.1437, |
| "step": 108550 |
| }, |
| { |
| "epoch": 31.661807580174926, |
| "grad_norm": 0.4694845676422119, |
| "learning_rate": 0.0002201901983663944, |
| "loss": 3.1375, |
| "step": 108600 |
| }, |
| { |
| "epoch": 31.676384839650147, |
| "grad_norm": 0.4521714746952057, |
| "learning_rate": 0.00022001516919486577, |
| "loss": 3.1324, |
| "step": 108650 |
| }, |
| { |
| "epoch": 31.690962099125365, |
| "grad_norm": 0.44469496607780457, |
| "learning_rate": 0.0002198401400233372, |
| "loss": 3.1336, |
| "step": 108700 |
| }, |
| { |
| "epoch": 31.705539358600582, |
| "grad_norm": 0.43908751010894775, |
| "learning_rate": 0.0002196651108518086, |
| "loss": 3.1413, |
| "step": 108750 |
| }, |
| { |
| "epoch": 31.720116618075803, |
| "grad_norm": 0.4636845588684082, |
| "learning_rate": 0.00021949008168028003, |
| "loss": 3.1506, |
| "step": 108800 |
| }, |
| { |
| "epoch": 31.73469387755102, |
| "grad_norm": 0.45257994532585144, |
| "learning_rate": 0.00021931505250875145, |
| "loss": 3.1482, |
| "step": 108850 |
| }, |
| { |
| "epoch": 31.749271137026238, |
| "grad_norm": 0.4656467139720917, |
| "learning_rate": 0.00021914002333722285, |
| "loss": 3.1473, |
| "step": 108900 |
| }, |
| { |
| "epoch": 31.76384839650146, |
| "grad_norm": 0.4685870409011841, |
| "learning_rate": 0.00021896499416569428, |
| "loss": 3.1573, |
| "step": 108950 |
| }, |
| { |
| "epoch": 31.778425655976676, |
| "grad_norm": 0.4359848201274872, |
| "learning_rate": 0.00021878996499416568, |
| "loss": 3.1469, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.778425655976676, |
| "eval_accuracy": 0.37454784443871025, |
| "eval_loss": 3.5438437461853027, |
| "eval_runtime": 182.5841, |
| "eval_samples_per_second": 91.048, |
| "eval_steps_per_second": 5.691, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.778425655976676, |
| "step": 109000, |
| "total_flos": 2.278393618563072e+18, |
| "train_loss": 0.8399101010943771, |
| "train_runtime": 58098.688, |
| "train_samples_per_second": 236.142, |
| "train_steps_per_second": 2.952 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 11 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.278393618563072e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|