{ "best_global_step": 50000, "best_metric": 2.1760547161102295, "best_model_checkpoint": "./sky-389m-tx-project/checkpoint-50000", "epoch": 1.7269964078474715, "eval_steps": 1000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034539928156949434, "grad_norm": 3.3880717754364014, "learning_rate": 2.4750000000000002e-05, "loss": 8.5386, "step": 100 }, { "epoch": 0.006907985631389887, "grad_norm": 2.3290209770202637, "learning_rate": 4.975e-05, "loss": 6.7703, "step": 200 }, { "epoch": 0.01036197844708483, "grad_norm": 2.278391122817993, "learning_rate": 7.475e-05, "loss": 6.0778, "step": 300 }, { "epoch": 0.013815971262779773, "grad_norm": 1.8386310338974, "learning_rate": 9.975000000000001e-05, "loss": 5.7081, "step": 400 }, { "epoch": 0.017269964078474715, "grad_norm": 1.0589734315872192, "learning_rate": 0.00012475, "loss": 5.3779, "step": 500 }, { "epoch": 0.02072395689416966, "grad_norm": 1.060039758682251, "learning_rate": 0.00014975, "loss": 5.0624, "step": 600 }, { "epoch": 0.024177949709864602, "grad_norm": 1.0565826892852783, "learning_rate": 0.00017475, "loss": 4.8215, "step": 700 }, { "epoch": 0.027631942525559547, "grad_norm": 0.9314415454864502, "learning_rate": 0.00019975, "loss": 4.5897, "step": 800 }, { "epoch": 0.03108593534125449, "grad_norm": 0.9964447021484375, "learning_rate": 0.00022475000000000001, "loss": 4.4049, "step": 900 }, { "epoch": 0.03453992815694943, "grad_norm": 0.9212857484817505, "learning_rate": 0.00024975, "loss": 4.2051, "step": 1000 }, { "epoch": 0.03453992815694943, "eval_loss": 4.235040187835693, "eval_runtime": 935.7314, "eval_samples_per_second": 162.843, "eval_steps_per_second": 1.629, "step": 1000 }, { "epoch": 0.037993920972644375, "grad_norm": 0.8937363028526306, "learning_rate": 0.00027475, "loss": 4.0112, "step": 1100 }, { "epoch": 0.04144791378833932, "grad_norm": 0.8522709012031555, "learning_rate": 0.00029975000000000005, "loss": 3.8575, "step": 1200 }, { "epoch": 0.044901906604034265, "grad_norm": 0.8380929231643677, "learning_rate": 0.00032475, "loss": 3.7379, "step": 1300 }, { "epoch": 0.048355899419729204, "grad_norm": 0.704521894454956, "learning_rate": 0.00034975, "loss": 3.6267, "step": 1400 }, { "epoch": 0.05180989223542415, "grad_norm": 0.7526060938835144, "learning_rate": 0.00037475000000000003, "loss": 3.5288, "step": 1500 }, { "epoch": 0.055263885051119094, "grad_norm": 0.8213881254196167, "learning_rate": 0.00039975, "loss": 3.4769, "step": 1600 }, { "epoch": 0.05871787786681404, "grad_norm": 0.6610364317893982, "learning_rate": 0.00042475000000000005, "loss": 3.4406, "step": 1700 }, { "epoch": 0.06217187068250898, "grad_norm": 0.8810706734657288, "learning_rate": 0.00044975, "loss": 3.3477, "step": 1800 }, { "epoch": 0.06562586349820393, "grad_norm": 1.3641518354415894, "learning_rate": 0.00047475, "loss": 3.3183, "step": 1900 }, { "epoch": 0.06907985631389886, "grad_norm": 0.6155968904495239, "learning_rate": 0.0004997500000000001, "loss": 3.3016, "step": 2000 }, { "epoch": 0.06907985631389886, "eval_loss": 3.551506280899048, "eval_runtime": 937.0574, "eval_samples_per_second": 162.612, "eval_steps_per_second": 1.626, "step": 2000 }, { "epoch": 0.0725338491295938, "grad_norm": 0.5413870215415955, "learning_rate": 0.0004994375, "loss": 3.2393, "step": 2100 }, { "epoch": 0.07598784194528875, "grad_norm": 0.5018215775489807, "learning_rate": 0.0004988693181818182, "loss": 3.1889, "step": 2200 }, { "epoch": 0.0794418347609837, "grad_norm": 0.5307313203811646, "learning_rate": 0.0004983011363636364, "loss": 3.1391, "step": 2300 }, { "epoch": 0.08289582757667864, "grad_norm": 0.5371428728103638, "learning_rate": 0.0004977329545454545, "loss": 3.1102, "step": 2400 }, { "epoch": 0.08634982039237359, "grad_norm": 0.5079624652862549, "learning_rate": 0.0004971647727272727, "loss": 3.0868, "step": 2500 }, { "epoch": 0.08980381320806853, "grad_norm": 0.4562855660915375, "learning_rate": 0.000496596590909091, "loss": 3.0448, "step": 2600 }, { "epoch": 0.09325780602376348, "grad_norm": 0.4660443067550659, "learning_rate": 0.0004960284090909092, "loss": 3.0341, "step": 2700 }, { "epoch": 0.09671179883945841, "grad_norm": 0.48204541206359863, "learning_rate": 0.0004954602272727273, "loss": 2.9917, "step": 2800 }, { "epoch": 0.10016579165515335, "grad_norm": 0.43557730317115784, "learning_rate": 0.0004948920454545454, "loss": 2.9464, "step": 2900 }, { "epoch": 0.1036197844708483, "grad_norm": 0.5063506364822388, "learning_rate": 0.0004943238636363637, "loss": 2.9463, "step": 3000 }, { "epoch": 0.1036197844708483, "eval_loss": 3.2142741680145264, "eval_runtime": 937.7254, "eval_samples_per_second": 162.496, "eval_steps_per_second": 1.625, "step": 3000 }, { "epoch": 0.10707377728654324, "grad_norm": 0.4616130590438843, "learning_rate": 0.0004937556818181818, "loss": 2.9168, "step": 3100 }, { "epoch": 0.11052777010223819, "grad_norm": 0.447933554649353, "learning_rate": 0.0004931875, "loss": 2.9172, "step": 3200 }, { "epoch": 0.11398176291793313, "grad_norm": 0.4423768222332001, "learning_rate": 0.0004926193181818183, "loss": 2.8891, "step": 3300 }, { "epoch": 0.11743575573362808, "grad_norm": 0.4318563640117645, "learning_rate": 0.0004920511363636364, "loss": 2.8479, "step": 3400 }, { "epoch": 0.12088974854932302, "grad_norm": 0.41672056913375854, "learning_rate": 0.0004914829545454545, "loss": 2.8462, "step": 3500 }, { "epoch": 0.12434374136501795, "grad_norm": 0.3930817246437073, "learning_rate": 0.0004909147727272727, "loss": 2.8219, "step": 3600 }, { "epoch": 0.1277977341807129, "grad_norm": 0.4133651852607727, "learning_rate": 0.0004903465909090909, "loss": 2.7957, "step": 3700 }, { "epoch": 0.13125172699640786, "grad_norm": 0.40811586380004883, "learning_rate": 0.0004897784090909091, "loss": 2.7879, "step": 3800 }, { "epoch": 0.1347057198121028, "grad_norm": 0.42227643728256226, "learning_rate": 0.0004892102272727273, "loss": 2.7716, "step": 3900 }, { "epoch": 0.13815971262779772, "grad_norm": 0.41413313150405884, "learning_rate": 0.0004886420454545455, "loss": 2.7606, "step": 4000 }, { "epoch": 0.13815971262779772, "eval_loss": 3.061166286468506, "eval_runtime": 936.0728, "eval_samples_per_second": 162.783, "eval_steps_per_second": 1.628, "step": 4000 }, { "epoch": 0.14161370544349267, "grad_norm": 0.40972092747688293, "learning_rate": 0.00048807386363636365, "loss": 2.7386, "step": 4100 }, { "epoch": 0.1450676982591876, "grad_norm": 0.4020697772502899, "learning_rate": 0.00048750568181818183, "loss": 2.7522, "step": 4200 }, { "epoch": 0.14852169107488256, "grad_norm": 0.40231621265411377, "learning_rate": 0.0004869375, "loss": 2.7442, "step": 4300 }, { "epoch": 0.1519756838905775, "grad_norm": 0.455773264169693, "learning_rate": 0.0004863693181818182, "loss": 2.7146, "step": 4400 }, { "epoch": 0.15542967670627245, "grad_norm": 0.38691282272338867, "learning_rate": 0.00048580113636363637, "loss": 2.6924, "step": 4500 }, { "epoch": 0.1588836695219674, "grad_norm": 0.3897066116333008, "learning_rate": 0.00048523295454545455, "loss": 2.6964, "step": 4600 }, { "epoch": 0.16233766233766234, "grad_norm": 0.3786475360393524, "learning_rate": 0.00048466477272727273, "loss": 2.6566, "step": 4700 }, { "epoch": 0.16579165515335728, "grad_norm": 0.3838929235935211, "learning_rate": 0.0004840965909090909, "loss": 2.6634, "step": 4800 }, { "epoch": 0.16924564796905223, "grad_norm": 0.3646841049194336, "learning_rate": 0.0004835284090909091, "loss": 2.6708, "step": 4900 }, { "epoch": 0.17269964078474717, "grad_norm": 0.37178680300712585, "learning_rate": 0.0004829602272727273, "loss": 2.6389, "step": 5000 }, { "epoch": 0.17269964078474717, "eval_loss": 2.940995216369629, "eval_runtime": 935.4231, "eval_samples_per_second": 162.896, "eval_steps_per_second": 1.629, "step": 5000 }, { "epoch": 0.17615363360044212, "grad_norm": 0.37742722034454346, "learning_rate": 0.00048239204545454545, "loss": 2.644, "step": 5100 }, { "epoch": 0.17960762641613706, "grad_norm": 0.3702583611011505, "learning_rate": 0.00048182386363636363, "loss": 2.624, "step": 5200 }, { "epoch": 0.183061619231832, "grad_norm": 0.4044618308544159, "learning_rate": 0.0004812556818181818, "loss": 2.6197, "step": 5300 }, { "epoch": 0.18651561204752695, "grad_norm": 0.3829458951950073, "learning_rate": 0.0004806875, "loss": 2.614, "step": 5400 }, { "epoch": 0.1899696048632219, "grad_norm": 0.3829841911792755, "learning_rate": 0.0004801193181818182, "loss": 2.6118, "step": 5500 }, { "epoch": 0.19342359767891681, "grad_norm": 0.3528871238231659, "learning_rate": 0.0004795511363636364, "loss": 2.6041, "step": 5600 }, { "epoch": 0.19687759049461176, "grad_norm": 0.3476055860519409, "learning_rate": 0.00047898295454545454, "loss": 2.5908, "step": 5700 }, { "epoch": 0.2003315833103067, "grad_norm": 0.3490158021450043, "learning_rate": 0.0004784147727272727, "loss": 2.569, "step": 5800 }, { "epoch": 0.20378557612600165, "grad_norm": 0.3507535457611084, "learning_rate": 0.00047784659090909095, "loss": 2.5502, "step": 5900 }, { "epoch": 0.2072395689416966, "grad_norm": 0.37472763657569885, "learning_rate": 0.0004772784090909091, "loss": 2.5656, "step": 6000 }, { "epoch": 0.2072395689416966, "eval_loss": 2.869264602661133, "eval_runtime": 927.5758, "eval_samples_per_second": 164.274, "eval_steps_per_second": 1.027, "step": 6000 }, { "epoch": 0.21069356175739154, "grad_norm": 0.34653300046920776, "learning_rate": 0.00047671022727272726, "loss": 2.5509, "step": 6100 }, { "epoch": 0.21414755457308648, "grad_norm": 0.3335779011249542, "learning_rate": 0.0004761420454545455, "loss": 2.5421, "step": 6200 }, { "epoch": 0.21760154738878143, "grad_norm": 0.37146443128585815, "learning_rate": 0.0004755738636363636, "loss": 2.5438, "step": 6300 }, { "epoch": 0.22105554020447638, "grad_norm": 0.33024120330810547, "learning_rate": 0.0004750056818181818, "loss": 2.5318, "step": 6400 }, { "epoch": 0.22450953302017132, "grad_norm": 0.3545812666416168, "learning_rate": 0.00047443750000000003, "loss": 2.5167, "step": 6500 }, { "epoch": 0.22796352583586627, "grad_norm": 0.3502351641654968, "learning_rate": 0.00047386931818181816, "loss": 2.5247, "step": 6600 }, { "epoch": 0.2314175186515612, "grad_norm": 0.35102933645248413, "learning_rate": 0.00047330113636363634, "loss": 2.5271, "step": 6700 }, { "epoch": 0.23487151146725616, "grad_norm": 0.34355252981185913, "learning_rate": 0.0004727329545454546, "loss": 2.536, "step": 6800 }, { "epoch": 0.2383255042829511, "grad_norm": 0.3270651400089264, "learning_rate": 0.00047216477272727275, "loss": 2.5081, "step": 6900 }, { "epoch": 0.24177949709864605, "grad_norm": 0.35053566098213196, "learning_rate": 0.0004715965909090909, "loss": 2.4945, "step": 7000 }, { "epoch": 0.24177949709864605, "eval_loss": 2.804372549057007, "eval_runtime": 927.2887, "eval_samples_per_second": 164.325, "eval_steps_per_second": 1.028, "step": 7000 }, { "epoch": 0.245233489914341, "grad_norm": 0.3321439325809479, "learning_rate": 0.0004710284090909091, "loss": 2.482, "step": 7100 }, { "epoch": 0.2486874827300359, "grad_norm": 0.3228578567504883, "learning_rate": 0.0004704602272727273, "loss": 2.4787, "step": 7200 }, { "epoch": 0.25214147554573085, "grad_norm": 0.3319440186023712, "learning_rate": 0.0004698920454545454, "loss": 2.4704, "step": 7300 }, { "epoch": 0.2555954683614258, "grad_norm": 0.34676915407180786, "learning_rate": 0.00046932386363636366, "loss": 2.479, "step": 7400 }, { "epoch": 0.25904946117712074, "grad_norm": 0.3456803560256958, "learning_rate": 0.00046875568181818184, "loss": 2.462, "step": 7500 }, { "epoch": 0.2625034539928157, "grad_norm": 0.330388605594635, "learning_rate": 0.00046818749999999996, "loss": 2.4638, "step": 7600 }, { "epoch": 0.26595744680851063, "grad_norm": 0.3278537690639496, "learning_rate": 0.0004676193181818182, "loss": 2.456, "step": 7700 }, { "epoch": 0.2694114396242056, "grad_norm": 0.331632137298584, "learning_rate": 0.0004670511363636364, "loss": 2.4459, "step": 7800 }, { "epoch": 0.2728654324399005, "grad_norm": 0.34204795956611633, "learning_rate": 0.0004664829545454545, "loss": 2.4545, "step": 7900 }, { "epoch": 0.27631942525559544, "grad_norm": 0.33582791686058044, "learning_rate": 0.00046591477272727274, "loss": 2.4377, "step": 8000 }, { "epoch": 0.27631942525559544, "eval_loss": 2.753157138824463, "eval_runtime": 927.4163, "eval_samples_per_second": 164.303, "eval_steps_per_second": 1.028, "step": 8000 }, { "epoch": 0.2797734180712904, "grad_norm": 0.3626213073730469, "learning_rate": 0.0004653465909090909, "loss": 2.4395, "step": 8100 }, { "epoch": 0.28322741088698533, "grad_norm": 0.33439400792121887, "learning_rate": 0.00046477840909090915, "loss": 2.4267, "step": 8200 }, { "epoch": 0.2866814037026803, "grad_norm": 0.31855249404907227, "learning_rate": 0.0004642102272727273, "loss": 2.4349, "step": 8300 }, { "epoch": 0.2901353965183752, "grad_norm": 0.3519601821899414, "learning_rate": 0.00046364204545454546, "loss": 2.4248, "step": 8400 }, { "epoch": 0.2935893893340702, "grad_norm": 0.31838154792785645, "learning_rate": 0.0004630738636363637, "loss": 2.3968, "step": 8500 }, { "epoch": 0.2970433821497651, "grad_norm": 0.3294484317302704, "learning_rate": 0.0004625056818181818, "loss": 2.4162, "step": 8600 }, { "epoch": 0.3004973749654601, "grad_norm": 0.31714752316474915, "learning_rate": 0.0004619375, "loss": 2.4073, "step": 8700 }, { "epoch": 0.303951367781155, "grad_norm": 0.32918691635131836, "learning_rate": 0.00046136931818181823, "loss": 2.4229, "step": 8800 }, { "epoch": 0.30740536059685, "grad_norm": 0.3097570538520813, "learning_rate": 0.00046080113636363636, "loss": 2.3966, "step": 8900 }, { "epoch": 0.3108593534125449, "grad_norm": 0.3247508406639099, "learning_rate": 0.00046023295454545454, "loss": 2.3925, "step": 9000 }, { "epoch": 0.3108593534125449, "eval_loss": 2.705599308013916, "eval_runtime": 927.4293, "eval_samples_per_second": 164.3, "eval_steps_per_second": 1.028, "step": 9000 }, { "epoch": 0.31431334622823986, "grad_norm": 0.3189142048358917, "learning_rate": 0.0004596647727272728, "loss": 2.3932, "step": 9100 }, { "epoch": 0.3177673390439348, "grad_norm": 0.3028543293476105, "learning_rate": 0.0004590965909090909, "loss": 2.3787, "step": 9200 }, { "epoch": 0.32122133185962976, "grad_norm": 0.3109678030014038, "learning_rate": 0.0004585284090909091, "loss": 2.3665, "step": 9300 }, { "epoch": 0.3246753246753247, "grad_norm": 0.31394320726394653, "learning_rate": 0.0004579602272727273, "loss": 2.3722, "step": 9400 }, { "epoch": 0.32812931749101965, "grad_norm": 0.3214563727378845, "learning_rate": 0.0004573920454545455, "loss": 2.3586, "step": 9500 }, { "epoch": 0.33158331030671456, "grad_norm": 0.33052927255630493, "learning_rate": 0.0004568238636363636, "loss": 2.3838, "step": 9600 }, { "epoch": 0.3350373031224095, "grad_norm": 0.3511188328266144, "learning_rate": 0.00045625568181818186, "loss": 2.3588, "step": 9700 }, { "epoch": 0.33849129593810445, "grad_norm": 0.31076422333717346, "learning_rate": 0.00045568750000000004, "loss": 2.3552, "step": 9800 }, { "epoch": 0.34194528875379937, "grad_norm": 0.32571229338645935, "learning_rate": 0.00045511931818181816, "loss": 2.3496, "step": 9900 }, { "epoch": 0.34539928156949434, "grad_norm": 0.3375560939311981, "learning_rate": 0.0004545511363636364, "loss": 2.3526, "step": 10000 }, { "epoch": 0.34539928156949434, "eval_loss": 2.6650397777557373, "eval_runtime": 927.3482, "eval_samples_per_second": 164.315, "eval_steps_per_second": 1.028, "step": 10000 }, { "epoch": 0.34885327438518926, "grad_norm": 0.3159004747867584, "learning_rate": 0.0004539829545454546, "loss": 2.3606, "step": 10100 }, { "epoch": 0.35230726720088423, "grad_norm": 0.32001519203186035, "learning_rate": 0.0004534147727272727, "loss": 2.37, "step": 10200 }, { "epoch": 0.35576126001657915, "grad_norm": 0.328933447599411, "learning_rate": 0.00045284659090909094, "loss": 2.3515, "step": 10300 }, { "epoch": 0.3592152528322741, "grad_norm": 0.3076813817024231, "learning_rate": 0.0004522784090909091, "loss": 2.3276, "step": 10400 }, { "epoch": 0.36266924564796904, "grad_norm": 0.3153810501098633, "learning_rate": 0.00045171022727272725, "loss": 2.3373, "step": 10500 }, { "epoch": 0.366123238463664, "grad_norm": 0.32247108221054077, "learning_rate": 0.0004511420454545455, "loss": 2.3335, "step": 10600 }, { "epoch": 0.36957723127935893, "grad_norm": 0.3074076771736145, "learning_rate": 0.00045057386363636366, "loss": 2.325, "step": 10700 }, { "epoch": 0.3730312240950539, "grad_norm": 0.31907033920288086, "learning_rate": 0.0004500056818181818, "loss": 2.3155, "step": 10800 }, { "epoch": 0.3764852169107488, "grad_norm": 0.32912886142730713, "learning_rate": 0.0004494375, "loss": 2.324, "step": 10900 }, { "epoch": 0.3799392097264438, "grad_norm": 0.3003767430782318, "learning_rate": 0.0004488693181818182, "loss": 2.3222, "step": 11000 }, { "epoch": 0.3799392097264438, "eval_loss": 2.633434772491455, "eval_runtime": 927.254, "eval_samples_per_second": 164.331, "eval_steps_per_second": 1.028, "step": 11000 }, { "epoch": 0.3833932025421387, "grad_norm": 0.3144666254520416, "learning_rate": 0.0004483011363636364, "loss": 2.3251, "step": 11100 }, { "epoch": 0.38684719535783363, "grad_norm": 0.3284156322479248, "learning_rate": 0.00044773295454545456, "loss": 2.3033, "step": 11200 }, { "epoch": 0.3903011881735286, "grad_norm": 0.3226972222328186, "learning_rate": 0.00044716477272727274, "loss": 2.296, "step": 11300 }, { "epoch": 0.3937551809892235, "grad_norm": 0.34044623374938965, "learning_rate": 0.0004465965909090909, "loss": 2.3198, "step": 11400 }, { "epoch": 0.3972091738049185, "grad_norm": 0.3101319372653961, "learning_rate": 0.0004460284090909091, "loss": 2.3107, "step": 11500 }, { "epoch": 0.4006631666206134, "grad_norm": 0.3044012486934662, "learning_rate": 0.0004454602272727273, "loss": 2.2984, "step": 11600 }, { "epoch": 0.4041171594363084, "grad_norm": 0.3155890107154846, "learning_rate": 0.00044489204545454546, "loss": 2.2968, "step": 11700 }, { "epoch": 0.4075711522520033, "grad_norm": 0.33918723464012146, "learning_rate": 0.00044432386363636364, "loss": 2.2707, "step": 11800 }, { "epoch": 0.41102514506769827, "grad_norm": 0.30243411660194397, "learning_rate": 0.0004437556818181818, "loss": 2.2979, "step": 11900 }, { "epoch": 0.4144791378833932, "grad_norm": 0.3046514391899109, "learning_rate": 0.0004431875, "loss": 2.2809, "step": 12000 }, { "epoch": 0.4144791378833932, "eval_loss": 2.61051344871521, "eval_runtime": 927.0521, "eval_samples_per_second": 164.367, "eval_steps_per_second": 1.028, "step": 12000 }, { "epoch": 0.41793313069908816, "grad_norm": 0.32584163546562195, "learning_rate": 0.0004426193181818182, "loss": 2.2876, "step": 12100 }, { "epoch": 0.4213871235147831, "grad_norm": 0.34489238262176514, "learning_rate": 0.00044205113636363637, "loss": 2.298, "step": 12200 }, { "epoch": 0.42484111633047805, "grad_norm": 0.30355241894721985, "learning_rate": 0.00044148295454545455, "loss": 2.2767, "step": 12300 }, { "epoch": 0.42829510914617297, "grad_norm": 0.3140780031681061, "learning_rate": 0.0004409147727272728, "loss": 2.2779, "step": 12400 }, { "epoch": 0.43174910196186794, "grad_norm": 0.31298449635505676, "learning_rate": 0.0004403465909090909, "loss": 2.2831, "step": 12500 }, { "epoch": 0.43520309477756286, "grad_norm": 0.32630786299705505, "learning_rate": 0.0004397784090909091, "loss": 2.2698, "step": 12600 }, { "epoch": 0.43865708759325783, "grad_norm": 0.303371399641037, "learning_rate": 0.0004392102272727273, "loss": 2.2767, "step": 12700 }, { "epoch": 0.44211108040895275, "grad_norm": 0.30070436000823975, "learning_rate": 0.00043864204545454545, "loss": 2.2449, "step": 12800 }, { "epoch": 0.44556507322464767, "grad_norm": 0.2887287139892578, "learning_rate": 0.00043807386363636363, "loss": 2.2688, "step": 12900 }, { "epoch": 0.44901906604034264, "grad_norm": 0.306916743516922, "learning_rate": 0.00043750568181818186, "loss": 2.2557, "step": 13000 }, { "epoch": 0.44901906604034264, "eval_loss": 2.5854439735412598, "eval_runtime": 926.6658, "eval_samples_per_second": 164.436, "eval_steps_per_second": 1.028, "step": 13000 }, { "epoch": 0.45247305885603756, "grad_norm": 0.34850597381591797, "learning_rate": 0.0004369375, "loss": 2.2423, "step": 13100 }, { "epoch": 0.45592705167173253, "grad_norm": 0.35393500328063965, "learning_rate": 0.00043636931818181817, "loss": 2.2543, "step": 13200 }, { "epoch": 0.45938104448742745, "grad_norm": 0.3059336543083191, "learning_rate": 0.0004358011363636364, "loss": 2.2516, "step": 13300 }, { "epoch": 0.4628350373031224, "grad_norm": 0.3357197344303131, "learning_rate": 0.00043523295454545453, "loss": 2.2328, "step": 13400 }, { "epoch": 0.46628903011881734, "grad_norm": 0.31849172711372375, "learning_rate": 0.0004346647727272727, "loss": 2.2424, "step": 13500 }, { "epoch": 0.4697430229345123, "grad_norm": 0.31968438625335693, "learning_rate": 0.00043409659090909094, "loss": 2.2228, "step": 13600 }, { "epoch": 0.47319701575020723, "grad_norm": 0.3293677568435669, "learning_rate": 0.0004335284090909091, "loss": 2.2555, "step": 13700 }, { "epoch": 0.4766510085659022, "grad_norm": 0.3031880855560303, "learning_rate": 0.00043296022727272725, "loss": 2.2387, "step": 13800 }, { "epoch": 0.4801050013815971, "grad_norm": 0.2914179563522339, "learning_rate": 0.0004323920454545455, "loss": 2.2494, "step": 13900 }, { "epoch": 0.4835589941972921, "grad_norm": 0.3345280587673187, "learning_rate": 0.00043182386363636367, "loss": 2.2322, "step": 14000 }, { "epoch": 0.4835589941972921, "eval_loss": 2.5606906414031982, "eval_runtime": 929.2705, "eval_samples_per_second": 163.975, "eval_steps_per_second": 1.026, "step": 14000 }, { "epoch": 0.487012987012987, "grad_norm": 0.3165434002876282, "learning_rate": 0.0004312556818181818, "loss": 2.2266, "step": 14100 }, { "epoch": 0.490466979828682, "grad_norm": 0.30577775835990906, "learning_rate": 0.0004306875, "loss": 2.2401, "step": 14200 }, { "epoch": 0.4939209726443769, "grad_norm": 0.2920292019844055, "learning_rate": 0.0004301193181818182, "loss": 2.2203, "step": 14300 }, { "epoch": 0.4973749654600718, "grad_norm": 0.32168078422546387, "learning_rate": 0.00042955113636363633, "loss": 2.2064, "step": 14400 }, { "epoch": 0.5008289582757668, "grad_norm": 0.31879886984825134, "learning_rate": 0.00042898295454545457, "loss": 2.219, "step": 14500 }, { "epoch": 0.5042829510914617, "grad_norm": 0.2906869351863861, "learning_rate": 0.00042841477272727275, "loss": 2.2055, "step": 14600 }, { "epoch": 0.5077369439071566, "grad_norm": 0.3648407757282257, "learning_rate": 0.0004278465909090909, "loss": 2.2157, "step": 14700 }, { "epoch": 0.5111909367228517, "grad_norm": 0.30823054909706116, "learning_rate": 0.0004272784090909091, "loss": 2.2158, "step": 14800 }, { "epoch": 0.5146449295385466, "grad_norm": 0.3004588782787323, "learning_rate": 0.0004267102272727273, "loss": 2.2009, "step": 14900 }, { "epoch": 0.5180989223542415, "grad_norm": 0.29552149772644043, "learning_rate": 0.00042614204545454547, "loss": 2.2194, "step": 15000 }, { "epoch": 0.5180989223542415, "eval_loss": 2.537440538406372, "eval_runtime": 928.9697, "eval_samples_per_second": 164.028, "eval_steps_per_second": 1.026, "step": 15000 }, { "epoch": 0.5215529151699364, "grad_norm": 0.3077145516872406, "learning_rate": 0.00042557386363636365, "loss": 2.199, "step": 15100 }, { "epoch": 0.5250069079856314, "grad_norm": 0.32205095887184143, "learning_rate": 0.00042500568181818183, "loss": 2.2045, "step": 15200 }, { "epoch": 0.5284609008013263, "grad_norm": 0.30157867074012756, "learning_rate": 0.0004244375, "loss": 2.1926, "step": 15300 }, { "epoch": 0.5319148936170213, "grad_norm": 0.35868486762046814, "learning_rate": 0.0004238693181818182, "loss": 2.1911, "step": 15400 }, { "epoch": 0.5353688864327162, "grad_norm": 0.3132970631122589, "learning_rate": 0.00042330113636363637, "loss": 2.193, "step": 15500 }, { "epoch": 0.5388228792484112, "grad_norm": 0.31356823444366455, "learning_rate": 0.00042273295454545455, "loss": 2.1959, "step": 15600 }, { "epoch": 0.5422768720641061, "grad_norm": 0.31471192836761475, "learning_rate": 0.00042216477272727273, "loss": 2.2069, "step": 15700 }, { "epoch": 0.545730864879801, "grad_norm": 0.33163174986839294, "learning_rate": 0.0004215965909090909, "loss": 2.1929, "step": 15800 }, { "epoch": 0.549184857695496, "grad_norm": 0.31774455308914185, "learning_rate": 0.0004210284090909091, "loss": 2.1816, "step": 15900 }, { "epoch": 0.5526388505111909, "grad_norm": 0.30572381615638733, "learning_rate": 0.00042046022727272727, "loss": 2.206, "step": 16000 }, { "epoch": 0.5526388505111909, "eval_loss": 2.52417254447937, "eval_runtime": 928.7275, "eval_samples_per_second": 164.071, "eval_steps_per_second": 1.026, "step": 16000 }, { "epoch": 0.5560928433268859, "grad_norm": 0.3196762502193451, "learning_rate": 0.00041989204545454545, "loss": 2.1801, "step": 16100 }, { "epoch": 0.5595468361425808, "grad_norm": 0.3148038685321808, "learning_rate": 0.00041932386363636363, "loss": 2.1722, "step": 16200 }, { "epoch": 0.5630008289582757, "grad_norm": 0.32507434487342834, "learning_rate": 0.0004187556818181818, "loss": 2.1836, "step": 16300 }, { "epoch": 0.5664548217739707, "grad_norm": 0.3227043151855469, "learning_rate": 0.0004181875, "loss": 2.1794, "step": 16400 }, { "epoch": 0.5699088145896657, "grad_norm": 0.3271748721599579, "learning_rate": 0.0004176193181818182, "loss": 2.1786, "step": 16500 }, { "epoch": 0.5733628074053606, "grad_norm": 0.31076040863990784, "learning_rate": 0.0004170511363636364, "loss": 2.1616, "step": 16600 }, { "epoch": 0.5768168002210555, "grad_norm": 0.32442960143089294, "learning_rate": 0.00041648295454545453, "loss": 2.1642, "step": 16700 }, { "epoch": 0.5802707930367504, "grad_norm": 0.2945985794067383, "learning_rate": 0.0004159147727272727, "loss": 2.1641, "step": 16800 }, { "epoch": 0.5837247858524455, "grad_norm": 0.32005414366722107, "learning_rate": 0.00041534659090909095, "loss": 2.1968, "step": 16900 }, { "epoch": 0.5871787786681404, "grad_norm": 0.31035128235816956, "learning_rate": 0.0004147784090909091, "loss": 2.1735, "step": 17000 }, { "epoch": 0.5871787786681404, "eval_loss": 2.5003573894500732, "eval_runtime": 929.6579, "eval_samples_per_second": 163.907, "eval_steps_per_second": 1.025, "step": 17000 }, { "epoch": 0.5906327714838353, "grad_norm": 0.3103092908859253, "learning_rate": 0.00041421022727272726, "loss": 2.1625, "step": 17100 }, { "epoch": 0.5940867642995302, "grad_norm": 0.3217906057834625, "learning_rate": 0.0004136420454545455, "loss": 2.1485, "step": 17200 }, { "epoch": 0.5975407571152253, "grad_norm": 0.2988424301147461, "learning_rate": 0.0004130738636363636, "loss": 2.1628, "step": 17300 }, { "epoch": 0.6009947499309202, "grad_norm": 0.3058546185493469, "learning_rate": 0.0004125056818181818, "loss": 2.1701, "step": 17400 }, { "epoch": 0.6044487427466151, "grad_norm": 0.3056589961051941, "learning_rate": 0.00041193750000000003, "loss": 2.1515, "step": 17500 }, { "epoch": 0.60790273556231, "grad_norm": 0.31840789318084717, "learning_rate": 0.00041136931818181816, "loss": 2.1536, "step": 17600 }, { "epoch": 0.6113567283780049, "grad_norm": 0.3044828772544861, "learning_rate": 0.00041080113636363634, "loss": 2.162, "step": 17700 }, { "epoch": 0.6148107211937, "grad_norm": 0.30973371863365173, "learning_rate": 0.00041023295454545457, "loss": 2.1498, "step": 17800 }, { "epoch": 0.6182647140093949, "grad_norm": 0.30947718024253845, "learning_rate": 0.00040966477272727275, "loss": 2.1435, "step": 17900 }, { "epoch": 0.6217187068250898, "grad_norm": 0.34587281942367554, "learning_rate": 0.0004090965909090909, "loss": 2.1504, "step": 18000 }, { "epoch": 0.6217187068250898, "eval_loss": 2.484160900115967, "eval_runtime": 927.1891, "eval_samples_per_second": 164.343, "eval_steps_per_second": 1.028, "step": 18000 }, { "epoch": 0.6251726996407847, "grad_norm": 0.30945053696632385, "learning_rate": 0.0004085284090909091, "loss": 2.1545, "step": 18100 }, { "epoch": 0.6286266924564797, "grad_norm": 0.3018719255924225, "learning_rate": 0.0004079602272727273, "loss": 2.1439, "step": 18200 }, { "epoch": 0.6320806852721746, "grad_norm": 0.3113386332988739, "learning_rate": 0.0004073920454545454, "loss": 2.1225, "step": 18300 }, { "epoch": 0.6355346780878696, "grad_norm": 0.29737088084220886, "learning_rate": 0.00040682386363636365, "loss": 2.1286, "step": 18400 }, { "epoch": 0.6389886709035645, "grad_norm": 0.31960177421569824, "learning_rate": 0.00040625568181818183, "loss": 2.1249, "step": 18500 }, { "epoch": 0.6424426637192595, "grad_norm": 0.3072162866592407, "learning_rate": 0.00040568749999999996, "loss": 2.1348, "step": 18600 }, { "epoch": 0.6458966565349544, "grad_norm": 0.3196597397327423, "learning_rate": 0.0004051193181818182, "loss": 2.1408, "step": 18700 }, { "epoch": 0.6493506493506493, "grad_norm": 0.3315812051296234, "learning_rate": 0.0004045511363636364, "loss": 2.1439, "step": 18800 }, { "epoch": 0.6528046421663443, "grad_norm": 0.2933200001716614, "learning_rate": 0.0004039829545454545, "loss": 2.1465, "step": 18900 }, { "epoch": 0.6562586349820393, "grad_norm": 0.33558085560798645, "learning_rate": 0.00040341477272727274, "loss": 2.1416, "step": 19000 }, { "epoch": 0.6562586349820393, "eval_loss": 2.473646640777588, "eval_runtime": 926.9255, "eval_samples_per_second": 164.39, "eval_steps_per_second": 1.028, "step": 19000 }, { "epoch": 0.6597126277977342, "grad_norm": 0.2992997169494629, "learning_rate": 0.0004028465909090909, "loss": 2.1386, "step": 19100 }, { "epoch": 0.6631666206134291, "grad_norm": 0.3051714599132538, "learning_rate": 0.00040227840909090915, "loss": 2.1225, "step": 19200 }, { "epoch": 0.666620613429124, "grad_norm": 0.31724849343299866, "learning_rate": 0.0004017102272727273, "loss": 2.1174, "step": 19300 }, { "epoch": 0.670074606244819, "grad_norm": 0.2937643826007843, "learning_rate": 0.00040114204545454546, "loss": 2.1279, "step": 19400 }, { "epoch": 0.673528599060514, "grad_norm": 0.31908687949180603, "learning_rate": 0.0004005738636363637, "loss": 2.1151, "step": 19500 }, { "epoch": 0.6769825918762089, "grad_norm": 0.31399762630462646, "learning_rate": 0.0004000056818181818, "loss": 2.1378, "step": 19600 }, { "epoch": 0.6804365846919038, "grad_norm": 0.3157575726509094, "learning_rate": 0.0003994375, "loss": 2.1149, "step": 19700 }, { "epoch": 0.6838905775075987, "grad_norm": 0.32018882036209106, "learning_rate": 0.00039886931818181823, "loss": 2.0993, "step": 19800 }, { "epoch": 0.6873445703232938, "grad_norm": 0.31708574295043945, "learning_rate": 0.00039830113636363636, "loss": 2.1132, "step": 19900 }, { "epoch": 0.6907985631389887, "grad_norm": 0.2904827892780304, "learning_rate": 0.00039773295454545454, "loss": 2.1088, "step": 20000 }, { "epoch": 0.6907985631389887, "eval_loss": 2.4520211219787598, "eval_runtime": 926.891, "eval_samples_per_second": 164.396, "eval_steps_per_second": 1.028, "step": 20000 }, { "epoch": 0.6942525559546836, "grad_norm": 0.3298169672489166, "learning_rate": 0.0003971647727272728, "loss": 2.1177, "step": 20100 }, { "epoch": 0.6977065487703785, "grad_norm": 0.291166752576828, "learning_rate": 0.0003965965909090909, "loss": 2.0954, "step": 20200 }, { "epoch": 0.7011605415860735, "grad_norm": 0.3211086094379425, "learning_rate": 0.0003960284090909091, "loss": 2.1209, "step": 20300 }, { "epoch": 0.7046145344017685, "grad_norm": 0.3161545395851135, "learning_rate": 0.0003954602272727273, "loss": 2.1149, "step": 20400 }, { "epoch": 0.7080685272174634, "grad_norm": 0.3262562155723572, "learning_rate": 0.0003948920454545455, "loss": 2.1204, "step": 20500 }, { "epoch": 0.7115225200331583, "grad_norm": 0.3347005546092987, "learning_rate": 0.0003943238636363636, "loss": 2.104, "step": 20600 }, { "epoch": 0.7149765128488533, "grad_norm": 0.30474451184272766, "learning_rate": 0.00039375568181818186, "loss": 2.0955, "step": 20700 }, { "epoch": 0.7184305056645482, "grad_norm": 0.32672184705734253, "learning_rate": 0.00039318750000000004, "loss": 2.0998, "step": 20800 }, { "epoch": 0.7218844984802432, "grad_norm": 0.3041098713874817, "learning_rate": 0.00039261931818181816, "loss": 2.0897, "step": 20900 }, { "epoch": 0.7253384912959381, "grad_norm": 0.351904034614563, "learning_rate": 0.0003920511363636364, "loss": 2.0925, "step": 21000 }, { "epoch": 0.7253384912959381, "eval_loss": 2.4404454231262207, "eval_runtime": 927.0383, "eval_samples_per_second": 164.37, "eval_steps_per_second": 1.028, "step": 21000 }, { "epoch": 0.728792484111633, "grad_norm": 0.34308210015296936, "learning_rate": 0.0003914829545454546, "loss": 2.1039, "step": 21100 }, { "epoch": 0.732246476927328, "grad_norm": 0.3298318088054657, "learning_rate": 0.0003909147727272727, "loss": 2.0774, "step": 21200 }, { "epoch": 0.735700469743023, "grad_norm": 0.3102123737335205, "learning_rate": 0.00039034659090909094, "loss": 2.1111, "step": 21300 }, { "epoch": 0.7391544625587179, "grad_norm": 0.3186514973640442, "learning_rate": 0.0003897784090909091, "loss": 2.084, "step": 21400 }, { "epoch": 0.7426084553744128, "grad_norm": 0.31114721298217773, "learning_rate": 0.00038921022727272724, "loss": 2.1037, "step": 21500 }, { "epoch": 0.7460624481901078, "grad_norm": 0.330563485622406, "learning_rate": 0.0003886420454545455, "loss": 2.0831, "step": 21600 }, { "epoch": 0.7495164410058027, "grad_norm": 0.3088129460811615, "learning_rate": 0.00038807386363636366, "loss": 2.0914, "step": 21700 }, { "epoch": 0.7529704338214976, "grad_norm": 0.28733545541763306, "learning_rate": 0.0003875056818181818, "loss": 2.0955, "step": 21800 }, { "epoch": 0.7564244266371926, "grad_norm": 0.3190239667892456, "learning_rate": 0.0003869375, "loss": 2.0828, "step": 21900 }, { "epoch": 0.7598784194528876, "grad_norm": 0.3163771331310272, "learning_rate": 0.0003863693181818182, "loss": 2.0786, "step": 22000 }, { "epoch": 0.7598784194528876, "eval_loss": 2.4309139251708984, "eval_runtime": 926.6419, "eval_samples_per_second": 164.44, "eval_steps_per_second": 1.028, "step": 22000 }, { "epoch": 0.7633324122685825, "grad_norm": 0.2819238603115082, "learning_rate": 0.0003858011363636364, "loss": 2.092, "step": 22100 }, { "epoch": 0.7667864050842774, "grad_norm": 0.31991979479789734, "learning_rate": 0.00038523295454545456, "loss": 2.0628, "step": 22200 }, { "epoch": 0.7702403978999723, "grad_norm": 0.3094194233417511, "learning_rate": 0.00038466477272727274, "loss": 2.0826, "step": 22300 }, { "epoch": 0.7736943907156673, "grad_norm": 0.30959707498550415, "learning_rate": 0.0003840965909090909, "loss": 2.0858, "step": 22400 }, { "epoch": 0.7771483835313623, "grad_norm": 0.30589380860328674, "learning_rate": 0.0003835284090909091, "loss": 2.0864, "step": 22500 }, { "epoch": 0.7806023763470572, "grad_norm": 0.3400673270225525, "learning_rate": 0.0003829602272727273, "loss": 2.069, "step": 22600 }, { "epoch": 0.7840563691627521, "grad_norm": 0.3428845703601837, "learning_rate": 0.00038239204545454546, "loss": 2.0622, "step": 22700 }, { "epoch": 0.787510361978447, "grad_norm": 0.3274592459201813, "learning_rate": 0.00038182386363636364, "loss": 2.0714, "step": 22800 }, { "epoch": 0.7909643547941421, "grad_norm": 0.3281017243862152, "learning_rate": 0.0003812556818181818, "loss": 2.0856, "step": 22900 }, { "epoch": 0.794418347609837, "grad_norm": 0.32381513714790344, "learning_rate": 0.0003806875, "loss": 2.0687, "step": 23000 }, { "epoch": 0.794418347609837, "eval_loss": 2.416405439376831, "eval_runtime": 926.9677, "eval_samples_per_second": 164.382, "eval_steps_per_second": 1.028, "step": 23000 }, { "epoch": 0.7978723404255319, "grad_norm": 0.31997501850128174, "learning_rate": 0.0003801193181818182, "loss": 2.0923, "step": 23100 }, { "epoch": 0.8013263332412268, "grad_norm": 0.315775603055954, "learning_rate": 0.00037955113636363636, "loss": 2.0578, "step": 23200 }, { "epoch": 0.8047803260569218, "grad_norm": 0.3135242462158203, "learning_rate": 0.00037898295454545454, "loss": 2.0604, "step": 23300 }, { "epoch": 0.8082343188726168, "grad_norm": 0.33324697613716125, "learning_rate": 0.0003784147727272728, "loss": 2.0776, "step": 23400 }, { "epoch": 0.8116883116883117, "grad_norm": 0.3114740252494812, "learning_rate": 0.0003778465909090909, "loss": 2.0679, "step": 23500 }, { "epoch": 0.8151423045040066, "grad_norm": 0.37432342767715454, "learning_rate": 0.0003772784090909091, "loss": 2.0685, "step": 23600 }, { "epoch": 0.8185962973197016, "grad_norm": 0.31538712978363037, "learning_rate": 0.0003767102272727273, "loss": 2.0687, "step": 23700 }, { "epoch": 0.8220502901353965, "grad_norm": 0.3598659336566925, "learning_rate": 0.00037614204545454545, "loss": 2.0909, "step": 23800 }, { "epoch": 0.8255042829510915, "grad_norm": 0.3034459948539734, "learning_rate": 0.0003755738636363636, "loss": 2.0588, "step": 23900 }, { "epoch": 0.8289582757667864, "grad_norm": 0.3221229016780853, "learning_rate": 0.00037500568181818186, "loss": 2.0661, "step": 24000 }, { "epoch": 0.8289582757667864, "eval_loss": 2.4008617401123047, "eval_runtime": 927.2556, "eval_samples_per_second": 164.331, "eval_steps_per_second": 1.028, "step": 24000 }, { "epoch": 0.8324122685824813, "grad_norm": 0.3049459755420685, "learning_rate": 0.0003744375, "loss": 2.0428, "step": 24100 }, { "epoch": 0.8358662613981763, "grad_norm": 0.3034842908382416, "learning_rate": 0.00037386931818181817, "loss": 2.0639, "step": 24200 }, { "epoch": 0.8393202542138712, "grad_norm": 0.3170601427555084, "learning_rate": 0.0003733011363636364, "loss": 2.0606, "step": 24300 }, { "epoch": 0.8427742470295662, "grad_norm": 0.3232339918613434, "learning_rate": 0.00037273295454545453, "loss": 2.0394, "step": 24400 }, { "epoch": 0.8462282398452611, "grad_norm": 0.3366962671279907, "learning_rate": 0.0003721647727272727, "loss": 2.0415, "step": 24500 }, { "epoch": 0.8496822326609561, "grad_norm": 0.3091275095939636, "learning_rate": 0.00037159659090909094, "loss": 2.0789, "step": 24600 }, { "epoch": 0.853136225476651, "grad_norm": 0.3144051432609558, "learning_rate": 0.0003710284090909091, "loss": 2.059, "step": 24700 }, { "epoch": 0.8565902182923459, "grad_norm": 0.3365747332572937, "learning_rate": 0.00037046022727272725, "loss": 2.0388, "step": 24800 }, { "epoch": 0.8600442111080409, "grad_norm": 0.2965666949748993, "learning_rate": 0.0003698920454545455, "loss": 2.0576, "step": 24900 }, { "epoch": 0.8634982039237359, "grad_norm": 0.3322639465332031, "learning_rate": 0.00036932386363636366, "loss": 2.0633, "step": 25000 }, { "epoch": 0.8634982039237359, "eval_loss": 2.392946243286133, "eval_runtime": 926.5204, "eval_samples_per_second": 164.462, "eval_steps_per_second": 1.029, "step": 25000 }, { "epoch": 0.8669521967394308, "grad_norm": 0.3184923827648163, "learning_rate": 0.0003687556818181818, "loss": 2.0442, "step": 25100 }, { "epoch": 0.8704061895551257, "grad_norm": 0.30526450276374817, "learning_rate": 0.0003681875, "loss": 2.0364, "step": 25200 }, { "epoch": 0.8738601823708206, "grad_norm": 0.3035339117050171, "learning_rate": 0.0003676193181818182, "loss": 2.0399, "step": 25300 }, { "epoch": 0.8773141751865157, "grad_norm": 0.3300335705280304, "learning_rate": 0.00036705113636363633, "loss": 2.0388, "step": 25400 }, { "epoch": 0.8807681680022106, "grad_norm": 0.33707037568092346, "learning_rate": 0.00036648295454545457, "loss": 2.0364, "step": 25500 }, { "epoch": 0.8842221608179055, "grad_norm": 0.3057771623134613, "learning_rate": 0.00036591477272727275, "loss": 2.0377, "step": 25600 }, { "epoch": 0.8876761536336004, "grad_norm": 0.33993765711784363, "learning_rate": 0.00036534659090909087, "loss": 2.0485, "step": 25700 }, { "epoch": 0.8911301464492953, "grad_norm": 0.3075715899467468, "learning_rate": 0.0003647784090909091, "loss": 2.0256, "step": 25800 }, { "epoch": 0.8945841392649904, "grad_norm": 0.30490240454673767, "learning_rate": 0.0003642102272727273, "loss": 2.0489, "step": 25900 }, { "epoch": 0.8980381320806853, "grad_norm": 0.3403315246105194, "learning_rate": 0.00036364204545454547, "loss": 2.0476, "step": 26000 }, { "epoch": 0.8980381320806853, "eval_loss": 2.382169008255005, "eval_runtime": 932.2661, "eval_samples_per_second": 163.448, "eval_steps_per_second": 1.635, "step": 26000 }, { "epoch": 0.9014921248963802, "grad_norm": 0.31369808316230774, "learning_rate": 0.00036307386363636365, "loss": 2.0265, "step": 26100 }, { "epoch": 0.9049461177120751, "grad_norm": 0.30494198203086853, "learning_rate": 0.00036250568181818183, "loss": 2.0328, "step": 26200 }, { "epoch": 0.9084001105277701, "grad_norm": 0.2981790006160736, "learning_rate": 0.0003619375, "loss": 2.0196, "step": 26300 }, { "epoch": 0.9118541033434651, "grad_norm": 0.3235887587070465, "learning_rate": 0.0003613693181818182, "loss": 2.0224, "step": 26400 }, { "epoch": 0.91530809615916, "grad_norm": 0.32602986693382263, "learning_rate": 0.00036080113636363637, "loss": 2.0169, "step": 26500 }, { "epoch": 0.9187620889748549, "grad_norm": 0.3355056643486023, "learning_rate": 0.00036023295454545455, "loss": 2.0338, "step": 26600 }, { "epoch": 0.9222160817905499, "grad_norm": 0.3180111348628998, "learning_rate": 0.00035966477272727273, "loss": 2.0297, "step": 26700 }, { "epoch": 0.9256700746062448, "grad_norm": 0.2988349199295044, "learning_rate": 0.0003590965909090909, "loss": 2.0189, "step": 26800 }, { "epoch": 0.9291240674219398, "grad_norm": 0.30824485421180725, "learning_rate": 0.0003585284090909091, "loss": 2.0086, "step": 26900 }, { "epoch": 0.9325780602376347, "grad_norm": 0.33140483498573303, "learning_rate": 0.00035796022727272727, "loss": 2.0127, "step": 27000 }, { "epoch": 0.9325780602376347, "eval_loss": 2.3799469470977783, "eval_runtime": 931.9535, "eval_samples_per_second": 163.503, "eval_steps_per_second": 1.635, "step": 27000 }, { "epoch": 0.9360320530533297, "grad_norm": 0.31175485253334045, "learning_rate": 0.00035739204545454545, "loss": 2.027, "step": 27100 }, { "epoch": 0.9394860458690246, "grad_norm": 0.3109052777290344, "learning_rate": 0.00035682386363636363, "loss": 2.029, "step": 27200 }, { "epoch": 0.9429400386847195, "grad_norm": 0.3299388885498047, "learning_rate": 0.0003562556818181818, "loss": 2.0194, "step": 27300 }, { "epoch": 0.9463940315004145, "grad_norm": 0.35121017694473267, "learning_rate": 0.0003556875, "loss": 2.0158, "step": 27400 }, { "epoch": 0.9498480243161094, "grad_norm": 0.3052006959915161, "learning_rate": 0.00035511931818181817, "loss": 2.0109, "step": 27500 }, { "epoch": 0.9533020171318044, "grad_norm": 0.3126027584075928, "learning_rate": 0.0003545511363636364, "loss": 2.0215, "step": 27600 }, { "epoch": 0.9567560099474993, "grad_norm": 0.32444655895233154, "learning_rate": 0.00035398295454545453, "loss": 2.0108, "step": 27700 }, { "epoch": 0.9602100027631942, "grad_norm": 0.31381282210350037, "learning_rate": 0.0003534147727272727, "loss": 2.0151, "step": 27800 }, { "epoch": 0.9636639955788892, "grad_norm": 0.3093770444393158, "learning_rate": 0.00035284659090909095, "loss": 1.9959, "step": 27900 }, { "epoch": 0.9671179883945842, "grad_norm": 0.3137684762477875, "learning_rate": 0.0003522784090909091, "loss": 2.0223, "step": 28000 }, { "epoch": 0.9671179883945842, "eval_loss": 2.3616411685943604, "eval_runtime": 936.0723, "eval_samples_per_second": 162.783, "eval_steps_per_second": 1.628, "step": 28000 }, { "epoch": 0.9705719812102791, "grad_norm": 0.3130528926849365, "learning_rate": 0.00035171022727272725, "loss": 2.0078, "step": 28100 }, { "epoch": 0.974025974025974, "grad_norm": 0.33664995431900024, "learning_rate": 0.0003511420454545455, "loss": 2.0087, "step": 28200 }, { "epoch": 0.9774799668416689, "grad_norm": 0.32277122139930725, "learning_rate": 0.0003505738636363636, "loss": 2.0106, "step": 28300 }, { "epoch": 0.980933959657364, "grad_norm": 0.33459389209747314, "learning_rate": 0.0003500056818181818, "loss": 2.019, "step": 28400 }, { "epoch": 0.9843879524730589, "grad_norm": 0.31769075989723206, "learning_rate": 0.00034943750000000003, "loss": 2.0105, "step": 28500 }, { "epoch": 0.9878419452887538, "grad_norm": 0.3090764582157135, "learning_rate": 0.00034886931818181816, "loss": 2.0121, "step": 28600 }, { "epoch": 0.9912959381044487, "grad_norm": 0.3254571557044983, "learning_rate": 0.00034830113636363634, "loss": 2.0069, "step": 28700 }, { "epoch": 0.9947499309201436, "grad_norm": 0.3087945878505707, "learning_rate": 0.00034773295454545457, "loss": 1.9956, "step": 28800 }, { "epoch": 0.9982039237358387, "grad_norm": 0.2959256172180176, "learning_rate": 0.00034716477272727275, "loss": 2.0202, "step": 28900 }, { "epoch": 1.0016579165515336, "grad_norm": 0.3626255691051483, "learning_rate": 0.0003465965909090909, "loss": 1.9733, "step": 29000 }, { "epoch": 1.0016579165515336, "eval_loss": 2.3518831729888916, "eval_runtime": 933.6764, "eval_samples_per_second": 163.201, "eval_steps_per_second": 1.632, "step": 29000 }, { "epoch": 1.0051119093672285, "grad_norm": 0.3299137353897095, "learning_rate": 0.0003460284090909091, "loss": 1.9406, "step": 29100 }, { "epoch": 1.0085659021829234, "grad_norm": 0.3189757168292999, "learning_rate": 0.0003454602272727273, "loss": 1.9547, "step": 29200 }, { "epoch": 1.0120198949986183, "grad_norm": 0.33895236253738403, "learning_rate": 0.0003448920454545454, "loss": 1.9462, "step": 29300 }, { "epoch": 1.0154738878143132, "grad_norm": 0.3329538106918335, "learning_rate": 0.00034432386363636365, "loss": 1.9445, "step": 29400 }, { "epoch": 1.0189278806300084, "grad_norm": 0.33972305059432983, "learning_rate": 0.00034375568181818183, "loss": 1.9482, "step": 29500 }, { "epoch": 1.0223818734457033, "grad_norm": 0.3170960545539856, "learning_rate": 0.00034318749999999996, "loss": 1.9322, "step": 29600 }, { "epoch": 1.0258358662613982, "grad_norm": 0.3435528576374054, "learning_rate": 0.0003426193181818182, "loss": 1.9651, "step": 29700 }, { "epoch": 1.0292898590770931, "grad_norm": 0.3118680715560913, "learning_rate": 0.0003420511363636364, "loss": 1.9553, "step": 29800 }, { "epoch": 1.032743851892788, "grad_norm": 0.30952584743499756, "learning_rate": 0.0003414829545454545, "loss": 1.9594, "step": 29900 }, { "epoch": 1.036197844708483, "grad_norm": 0.3205563724040985, "learning_rate": 0.00034091477272727274, "loss": 1.951, "step": 30000 }, { "epoch": 1.036197844708483, "eval_loss": 2.3421385288238525, "eval_runtime": 931.9003, "eval_samples_per_second": 163.512, "eval_steps_per_second": 1.635, "step": 30000 }, { "epoch": 1.039651837524178, "grad_norm": 0.3193325400352478, "learning_rate": 0.0003403465909090909, "loss": 1.9781, "step": 30100 }, { "epoch": 1.0431058303398728, "grad_norm": 0.3476419448852539, "learning_rate": 0.00033977840909090915, "loss": 1.9804, "step": 30200 }, { "epoch": 1.046559823155568, "grad_norm": 0.334945946931839, "learning_rate": 0.0003392102272727273, "loss": 1.9956, "step": 30300 }, { "epoch": 1.0500138159712629, "grad_norm": 0.3205523192882538, "learning_rate": 0.00033864204545454546, "loss": 1.9738, "step": 30400 }, { "epoch": 1.0534678087869578, "grad_norm": 0.3324650824069977, "learning_rate": 0.0003380738636363637, "loss": 1.9851, "step": 30500 }, { "epoch": 1.0569218016026527, "grad_norm": 0.3181789815425873, "learning_rate": 0.0003375056818181818, "loss": 1.9993, "step": 30600 }, { "epoch": 1.0603757944183476, "grad_norm": 0.3182109594345093, "learning_rate": 0.0003369375, "loss": 1.9808, "step": 30700 }, { "epoch": 1.0638297872340425, "grad_norm": 0.3040473163127899, "learning_rate": 0.00033636931818181823, "loss": 1.9697, "step": 30800 }, { "epoch": 1.0672837800497375, "grad_norm": 0.3187369108200073, "learning_rate": 0.00033580113636363636, "loss": 1.9668, "step": 30900 }, { "epoch": 1.0707377728654324, "grad_norm": 0.31757599115371704, "learning_rate": 0.00033523295454545454, "loss": 1.9797, "step": 31000 }, { "epoch": 1.0707377728654324, "eval_loss": 2.334416151046753, "eval_runtime": 932.4289, "eval_samples_per_second": 163.419, "eval_steps_per_second": 1.634, "step": 31000 }, { "epoch": 1.0741917656811273, "grad_norm": 0.3234330713748932, "learning_rate": 0.0003346647727272728, "loss": 1.9646, "step": 31100 }, { "epoch": 1.0776457584968224, "grad_norm": 0.346343457698822, "learning_rate": 0.0003340965909090909, "loss": 1.9633, "step": 31200 }, { "epoch": 1.0810997513125173, "grad_norm": 0.33652421832084656, "learning_rate": 0.0003335284090909091, "loss": 1.9635, "step": 31300 }, { "epoch": 1.0845537441282123, "grad_norm": 0.3355984091758728, "learning_rate": 0.0003329602272727273, "loss": 1.9714, "step": 31400 }, { "epoch": 1.0880077369439072, "grad_norm": 0.3155532479286194, "learning_rate": 0.0003323920454545455, "loss": 1.9579, "step": 31500 }, { "epoch": 1.091461729759602, "grad_norm": 0.3124435842037201, "learning_rate": 0.0003318238636363636, "loss": 1.9896, "step": 31600 }, { "epoch": 1.094915722575297, "grad_norm": 0.3473125100135803, "learning_rate": 0.00033125568181818185, "loss": 1.9604, "step": 31700 }, { "epoch": 1.098369715390992, "grad_norm": 0.33051636815071106, "learning_rate": 0.00033068750000000004, "loss": 1.9703, "step": 31800 }, { "epoch": 1.1018237082066868, "grad_norm": 0.3092711865901947, "learning_rate": 0.00033011931818181816, "loss": 1.9583, "step": 31900 }, { "epoch": 1.105277701022382, "grad_norm": 0.32419732213020325, "learning_rate": 0.0003295511363636364, "loss": 1.9603, "step": 32000 }, { "epoch": 1.105277701022382, "eval_loss": 2.3255245685577393, "eval_runtime": 932.6931, "eval_samples_per_second": 163.373, "eval_steps_per_second": 1.634, "step": 32000 }, { "epoch": 1.108731693838077, "grad_norm": 0.332787424325943, "learning_rate": 0.0003289829545454546, "loss": 1.992, "step": 32100 }, { "epoch": 1.1121856866537718, "grad_norm": 0.3273712992668152, "learning_rate": 0.0003284147727272727, "loss": 1.9632, "step": 32200 }, { "epoch": 1.1156396794694667, "grad_norm": 0.32147789001464844, "learning_rate": 0.00032784659090909094, "loss": 1.9838, "step": 32300 }, { "epoch": 1.1190936722851617, "grad_norm": 0.3235771358013153, "learning_rate": 0.0003272784090909091, "loss": 1.9594, "step": 32400 }, { "epoch": 1.1225476651008566, "grad_norm": 0.31604549288749695, "learning_rate": 0.00032671022727272724, "loss": 1.9716, "step": 32500 }, { "epoch": 1.1260016579165515, "grad_norm": 0.3200394809246063, "learning_rate": 0.0003261420454545455, "loss": 1.9598, "step": 32600 }, { "epoch": 1.1294556507322464, "grad_norm": 0.31569093465805054, "learning_rate": 0.00032557386363636366, "loss": 1.9598, "step": 32700 }, { "epoch": 1.1329096435479413, "grad_norm": 0.3108920753002167, "learning_rate": 0.0003250056818181818, "loss": 1.9333, "step": 32800 }, { "epoch": 1.1363636363636362, "grad_norm": 0.31714916229248047, "learning_rate": 0.0003244375, "loss": 1.9665, "step": 32900 }, { "epoch": 1.1398176291793314, "grad_norm": 0.3428919017314911, "learning_rate": 0.0003238693181818182, "loss": 1.9367, "step": 33000 }, { "epoch": 1.1398176291793314, "eval_loss": 2.314099073410034, "eval_runtime": 933.8501, "eval_samples_per_second": 163.171, "eval_steps_per_second": 1.632, "step": 33000 }, { "epoch": 1.1432716219950263, "grad_norm": 0.31503021717071533, "learning_rate": 0.0003233011363636364, "loss": 1.952, "step": 33100 }, { "epoch": 1.1467256148107212, "grad_norm": 0.3151177763938904, "learning_rate": 0.00032273295454545456, "loss": 1.9711, "step": 33200 }, { "epoch": 1.1501796076264161, "grad_norm": 0.33299991488456726, "learning_rate": 0.00032216477272727274, "loss": 1.966, "step": 33300 }, { "epoch": 1.153633600442111, "grad_norm": 0.35912394523620605, "learning_rate": 0.0003215965909090909, "loss": 1.9345, "step": 33400 }, { "epoch": 1.157087593257806, "grad_norm": 0.3316855728626251, "learning_rate": 0.0003210284090909091, "loss": 1.9473, "step": 33500 }, { "epoch": 1.1605415860735009, "grad_norm": 0.32025349140167236, "learning_rate": 0.0003204602272727273, "loss": 1.9512, "step": 33600 }, { "epoch": 1.163995578889196, "grad_norm": 0.31566309928894043, "learning_rate": 0.00031989204545454546, "loss": 1.9451, "step": 33700 }, { "epoch": 1.167449571704891, "grad_norm": 0.32200607657432556, "learning_rate": 0.00031932386363636364, "loss": 1.9382, "step": 33800 }, { "epoch": 1.1709035645205859, "grad_norm": 0.3362364172935486, "learning_rate": 0.0003187556818181818, "loss": 1.9504, "step": 33900 }, { "epoch": 1.1743575573362808, "grad_norm": 0.3156588077545166, "learning_rate": 0.0003181875, "loss": 1.9488, "step": 34000 }, { "epoch": 1.1743575573362808, "eval_loss": 2.308772563934326, "eval_runtime": 932.5965, "eval_samples_per_second": 163.39, "eval_steps_per_second": 1.634, "step": 34000 }, { "epoch": 1.1778115501519757, "grad_norm": 0.3278816342353821, "learning_rate": 0.0003176193181818182, "loss": 1.9547, "step": 34100 }, { "epoch": 1.1812655429676706, "grad_norm": 0.3398403227329254, "learning_rate": 0.00031705113636363636, "loss": 1.9293, "step": 34200 }, { "epoch": 1.1847195357833655, "grad_norm": 0.34434807300567627, "learning_rate": 0.00031648295454545454, "loss": 1.9497, "step": 34300 }, { "epoch": 1.1881735285990604, "grad_norm": 0.33737897872924805, "learning_rate": 0.0003159147727272728, "loss": 1.9471, "step": 34400 }, { "epoch": 1.1916275214147554, "grad_norm": 0.3157757520675659, "learning_rate": 0.0003153465909090909, "loss": 1.9395, "step": 34500 }, { "epoch": 1.1950815142304503, "grad_norm": 0.3554360866546631, "learning_rate": 0.0003147784090909091, "loss": 1.9589, "step": 34600 }, { "epoch": 1.1985355070461454, "grad_norm": 0.31714192032814026, "learning_rate": 0.0003142102272727273, "loss": 1.9382, "step": 34700 }, { "epoch": 1.2019894998618403, "grad_norm": 0.3395540416240692, "learning_rate": 0.00031364204545454545, "loss": 1.9245, "step": 34800 }, { "epoch": 1.2054434926775353, "grad_norm": 0.38380250334739685, "learning_rate": 0.0003130738636363636, "loss": 1.9379, "step": 34900 }, { "epoch": 1.2088974854932302, "grad_norm": 0.3237415552139282, "learning_rate": 0.00031250568181818186, "loss": 1.9433, "step": 35000 }, { "epoch": 1.2088974854932302, "eval_loss": 2.299807548522949, "eval_runtime": 932.2028, "eval_samples_per_second": 163.459, "eval_steps_per_second": 1.635, "step": 35000 }, { "epoch": 1.212351478308925, "grad_norm": 0.3568110764026642, "learning_rate": 0.0003119375, "loss": 1.9359, "step": 35100 }, { "epoch": 1.21580547112462, "grad_norm": 0.3228346109390259, "learning_rate": 0.00031136931818181817, "loss": 1.9398, "step": 35200 }, { "epoch": 1.219259463940315, "grad_norm": 0.4409060478210449, "learning_rate": 0.0003108011363636364, "loss": 1.9271, "step": 35300 }, { "epoch": 1.22271345675601, "grad_norm": 0.3323960602283478, "learning_rate": 0.0003102329545454545, "loss": 1.9351, "step": 35400 }, { "epoch": 1.226167449571705, "grad_norm": 0.33286628127098083, "learning_rate": 0.0003096647727272727, "loss": 1.9261, "step": 35500 }, { "epoch": 1.2296214423874, "grad_norm": 0.32433241605758667, "learning_rate": 0.00030909659090909094, "loss": 1.9235, "step": 35600 }, { "epoch": 1.2330754352030948, "grad_norm": 0.33505016565322876, "learning_rate": 0.0003085284090909091, "loss": 1.9463, "step": 35700 }, { "epoch": 1.2365294280187897, "grad_norm": 0.33028197288513184, "learning_rate": 0.00030796022727272725, "loss": 1.9425, "step": 35800 }, { "epoch": 1.2399834208344847, "grad_norm": 0.32460519671440125, "learning_rate": 0.0003073920454545455, "loss": 1.9237, "step": 35900 }, { "epoch": 1.2434374136501796, "grad_norm": 0.34961310029029846, "learning_rate": 0.00030682386363636366, "loss": 1.927, "step": 36000 }, { "epoch": 1.2434374136501796, "eval_loss": 2.2926623821258545, "eval_runtime": 933.7737, "eval_samples_per_second": 163.184, "eval_steps_per_second": 1.632, "step": 36000 }, { "epoch": 1.2468914064658745, "grad_norm": 0.3421266973018646, "learning_rate": 0.0003062556818181818, "loss": 1.9172, "step": 36100 }, { "epoch": 1.2503453992815694, "grad_norm": 0.31496691703796387, "learning_rate": 0.0003056875, "loss": 1.9283, "step": 36200 }, { "epoch": 1.2537993920972643, "grad_norm": 0.3333700895309448, "learning_rate": 0.0003051193181818182, "loss": 1.9083, "step": 36300 }, { "epoch": 1.2572533849129595, "grad_norm": 0.33785733580589294, "learning_rate": 0.00030455113636363633, "loss": 1.9364, "step": 36400 }, { "epoch": 1.2607073777286544, "grad_norm": 0.3140362799167633, "learning_rate": 0.00030398295454545456, "loss": 1.9202, "step": 36500 }, { "epoch": 1.2641613705443493, "grad_norm": 0.332356721162796, "learning_rate": 0.00030341477272727275, "loss": 1.9219, "step": 36600 }, { "epoch": 1.2676153633600442, "grad_norm": 0.30988287925720215, "learning_rate": 0.00030284659090909087, "loss": 1.9247, "step": 36700 }, { "epoch": 1.2710693561757391, "grad_norm": 0.3257978856563568, "learning_rate": 0.0003022784090909091, "loss": 1.9274, "step": 36800 }, { "epoch": 1.274523348991434, "grad_norm": 0.3108922243118286, "learning_rate": 0.0003017102272727273, "loss": 1.9182, "step": 36900 }, { "epoch": 1.277977341807129, "grad_norm": 0.32838690280914307, "learning_rate": 0.00030114204545454547, "loss": 1.921, "step": 37000 }, { "epoch": 1.277977341807129, "eval_loss": 2.28013014793396, "eval_runtime": 932.9674, "eval_samples_per_second": 163.325, "eval_steps_per_second": 1.633, "step": 37000 }, { "epoch": 1.281431334622824, "grad_norm": 0.33043205738067627, "learning_rate": 0.00030057386363636365, "loss": 1.9282, "step": 37100 }, { "epoch": 1.284885327438519, "grad_norm": 0.3355056047439575, "learning_rate": 0.00030000568181818183, "loss": 1.9146, "step": 37200 }, { "epoch": 1.288339320254214, "grad_norm": 0.34499314427375793, "learning_rate": 0.0002994375, "loss": 1.9121, "step": 37300 }, { "epoch": 1.2917933130699089, "grad_norm": 0.33857813477516174, "learning_rate": 0.0002988693181818182, "loss": 1.906, "step": 37400 }, { "epoch": 1.2952473058856038, "grad_norm": 0.34451091289520264, "learning_rate": 0.00029830113636363637, "loss": 1.9069, "step": 37500 }, { "epoch": 1.2987012987012987, "grad_norm": 0.31819987297058105, "learning_rate": 0.00029773295454545455, "loss": 1.905, "step": 37600 }, { "epoch": 1.3021552915169936, "grad_norm": 0.32892873883247375, "learning_rate": 0.00029716477272727273, "loss": 1.9358, "step": 37700 }, { "epoch": 1.3056092843326885, "grad_norm": 0.3139948844909668, "learning_rate": 0.0002965965909090909, "loss": 1.917, "step": 37800 }, { "epoch": 1.3090632771483834, "grad_norm": 0.3358207046985626, "learning_rate": 0.0002960284090909091, "loss": 1.8979, "step": 37900 }, { "epoch": 1.3125172699640784, "grad_norm": 0.3274485468864441, "learning_rate": 0.00029546022727272727, "loss": 1.9147, "step": 38000 }, { "epoch": 1.3125172699640784, "eval_loss": 2.2716429233551025, "eval_runtime": 932.4022, "eval_samples_per_second": 163.424, "eval_steps_per_second": 1.634, "step": 38000 }, { "epoch": 1.3159712627797735, "grad_norm": 0.3326353430747986, "learning_rate": 0.00029489204545454545, "loss": 1.9151, "step": 38100 }, { "epoch": 1.3194252555954684, "grad_norm": 0.33048099279403687, "learning_rate": 0.00029432386363636363, "loss": 1.9003, "step": 38200 }, { "epoch": 1.3228792484111633, "grad_norm": 0.3198449909687042, "learning_rate": 0.0002937556818181818, "loss": 1.9012, "step": 38300 }, { "epoch": 1.3263332412268583, "grad_norm": 0.3347759246826172, "learning_rate": 0.0002931875, "loss": 1.889, "step": 38400 }, { "epoch": 1.3297872340425532, "grad_norm": 0.344235360622406, "learning_rate": 0.00029261931818181817, "loss": 1.9096, "step": 38500 }, { "epoch": 1.333241226858248, "grad_norm": 0.34197336435317993, "learning_rate": 0.0002920511363636364, "loss": 1.9083, "step": 38600 }, { "epoch": 1.336695219673943, "grad_norm": 0.3257678747177124, "learning_rate": 0.00029148295454545453, "loss": 1.9007, "step": 38700 }, { "epoch": 1.3401492124896381, "grad_norm": 0.3299179971218109, "learning_rate": 0.0002909147727272727, "loss": 1.8992, "step": 38800 }, { "epoch": 1.343603205305333, "grad_norm": 0.32206007838249207, "learning_rate": 0.00029034659090909095, "loss": 1.8853, "step": 38900 }, { "epoch": 1.347057198121028, "grad_norm": 0.3281271159648895, "learning_rate": 0.0002897784090909091, "loss": 1.9075, "step": 39000 }, { "epoch": 1.347057198121028, "eval_loss": 2.266144275665283, "eval_runtime": 933.0379, "eval_samples_per_second": 163.313, "eval_steps_per_second": 1.633, "step": 39000 }, { "epoch": 1.350511190936723, "grad_norm": 0.32982590794563293, "learning_rate": 0.00028921022727272725, "loss": 1.9255, "step": 39100 }, { "epoch": 1.3539651837524178, "grad_norm": 0.33906838297843933, "learning_rate": 0.0002886420454545455, "loss": 1.9119, "step": 39200 }, { "epoch": 1.3574191765681127, "grad_norm": 0.32768332958221436, "learning_rate": 0.0002880738636363636, "loss": 1.8838, "step": 39300 }, { "epoch": 1.3608731693838076, "grad_norm": 0.3550179600715637, "learning_rate": 0.0002875056818181818, "loss": 1.8889, "step": 39400 }, { "epoch": 1.3643271621995026, "grad_norm": 0.32649099826812744, "learning_rate": 0.00028693750000000003, "loss": 1.8983, "step": 39500 }, { "epoch": 1.3677811550151975, "grad_norm": 0.33756542205810547, "learning_rate": 0.00028636931818181816, "loss": 1.8982, "step": 39600 }, { "epoch": 1.3712351478308924, "grad_norm": 0.3554450571537018, "learning_rate": 0.00028580113636363634, "loss": 1.8831, "step": 39700 }, { "epoch": 1.3746891406465875, "grad_norm": 0.3348751962184906, "learning_rate": 0.00028523295454545457, "loss": 1.9022, "step": 39800 }, { "epoch": 1.3781431334622825, "grad_norm": 0.3384929895401001, "learning_rate": 0.00028466477272727275, "loss": 1.8973, "step": 39900 }, { "epoch": 1.3815971262779774, "grad_norm": 0.3346748352050781, "learning_rate": 0.0002840965909090909, "loss": 1.897, "step": 40000 }, { "epoch": 1.3815971262779774, "eval_loss": 2.258094072341919, "eval_runtime": 932.2639, "eval_samples_per_second": 163.448, "eval_steps_per_second": 1.635, "step": 40000 }, { "epoch": 1.3850511190936723, "grad_norm": 0.3488174378871918, "learning_rate": 0.0002835284090909091, "loss": 1.899, "step": 40100 }, { "epoch": 1.3885051119093672, "grad_norm": 0.357048898935318, "learning_rate": 0.0002829602272727273, "loss": 1.8874, "step": 40200 }, { "epoch": 1.3919591047250621, "grad_norm": 0.34619608521461487, "learning_rate": 0.0002823920454545454, "loss": 1.8971, "step": 40300 }, { "epoch": 1.395413097540757, "grad_norm": 0.3450053930282593, "learning_rate": 0.00028182386363636365, "loss": 1.8951, "step": 40400 }, { "epoch": 1.3988670903564522, "grad_norm": 0.3244158923625946, "learning_rate": 0.00028125568181818183, "loss": 1.887, "step": 40500 }, { "epoch": 1.402321083172147, "grad_norm": 0.36656075716018677, "learning_rate": 0.00028068749999999996, "loss": 1.8961, "step": 40600 }, { "epoch": 1.405775075987842, "grad_norm": 0.3427944481372833, "learning_rate": 0.0002801193181818182, "loss": 1.8801, "step": 40700 }, { "epoch": 1.409229068803537, "grad_norm": 0.3511246144771576, "learning_rate": 0.0002795511363636364, "loss": 1.8856, "step": 40800 }, { "epoch": 1.4126830616192319, "grad_norm": 0.34178775548934937, "learning_rate": 0.0002789829545454545, "loss": 1.8888, "step": 40900 }, { "epoch": 1.4161370544349268, "grad_norm": 0.35453692078590393, "learning_rate": 0.00027841477272727273, "loss": 1.8867, "step": 41000 }, { "epoch": 1.4161370544349268, "eval_loss": 2.2483203411102295, "eval_runtime": 932.6422, "eval_samples_per_second": 163.382, "eval_steps_per_second": 1.634, "step": 41000 }, { "epoch": 1.4195910472506217, "grad_norm": 0.38095447421073914, "learning_rate": 0.0002778465909090909, "loss": 1.8847, "step": 41100 }, { "epoch": 1.4230450400663166, "grad_norm": 0.3299073576927185, "learning_rate": 0.00027727840909090915, "loss": 1.8848, "step": 41200 }, { "epoch": 1.4264990328820115, "grad_norm": 0.3188841640949249, "learning_rate": 0.0002767102272727273, "loss": 1.9009, "step": 41300 }, { "epoch": 1.4299530256977064, "grad_norm": 0.3500712811946869, "learning_rate": 0.00027614204545454546, "loss": 1.885, "step": 41400 }, { "epoch": 1.4334070185134014, "grad_norm": 0.34655386209487915, "learning_rate": 0.0002755738636363637, "loss": 1.8862, "step": 41500 }, { "epoch": 1.4368610113290965, "grad_norm": 0.34666162729263306, "learning_rate": 0.0002750056818181818, "loss": 1.8859, "step": 41600 }, { "epoch": 1.4403150041447914, "grad_norm": 0.3630838692188263, "learning_rate": 0.0002744375, "loss": 1.8796, "step": 41700 }, { "epoch": 1.4437689969604863, "grad_norm": 0.40710654854774475, "learning_rate": 0.00027386931818181823, "loss": 1.8822, "step": 41800 }, { "epoch": 1.4472229897761812, "grad_norm": 0.33801448345184326, "learning_rate": 0.00027330113636363636, "loss": 1.8788, "step": 41900 }, { "epoch": 1.4506769825918762, "grad_norm": 0.3448280692100525, "learning_rate": 0.00027273295454545454, "loss": 1.8685, "step": 42000 }, { "epoch": 1.4506769825918762, "eval_loss": 2.24458909034729, "eval_runtime": 932.8306, "eval_samples_per_second": 163.349, "eval_steps_per_second": 1.634, "step": 42000 }, { "epoch": 1.454130975407571, "grad_norm": 0.35361775755882263, "learning_rate": 0.00027216477272727277, "loss": 1.8657, "step": 42100 }, { "epoch": 1.4575849682232662, "grad_norm": 0.3468896448612213, "learning_rate": 0.0002715965909090909, "loss": 1.8701, "step": 42200 }, { "epoch": 1.4610389610389611, "grad_norm": 0.3501305878162384, "learning_rate": 0.0002710284090909091, "loss": 1.8729, "step": 42300 }, { "epoch": 1.464492953854656, "grad_norm": 0.3370625078678131, "learning_rate": 0.0002704602272727273, "loss": 1.8723, "step": 42400 }, { "epoch": 1.467946946670351, "grad_norm": 0.33096930384635925, "learning_rate": 0.0002698920454545455, "loss": 1.8642, "step": 42500 }, { "epoch": 1.471400939486046, "grad_norm": 0.3265809118747711, "learning_rate": 0.0002693238636363636, "loss": 1.8757, "step": 42600 }, { "epoch": 1.4748549323017408, "grad_norm": 0.3586813509464264, "learning_rate": 0.00026875568181818185, "loss": 1.8639, "step": 42700 }, { "epoch": 1.4783089251174357, "grad_norm": 0.3498245179653168, "learning_rate": 0.00026818750000000003, "loss": 1.888, "step": 42800 }, { "epoch": 1.4817629179331306, "grad_norm": 0.34165388345718384, "learning_rate": 0.00026761931818181816, "loss": 1.8644, "step": 42900 }, { "epoch": 1.4852169107488256, "grad_norm": 0.32099393010139465, "learning_rate": 0.0002670511363636364, "loss": 1.8747, "step": 43000 }, { "epoch": 1.4852169107488256, "eval_loss": 2.237309455871582, "eval_runtime": 932.7872, "eval_samples_per_second": 163.357, "eval_steps_per_second": 1.634, "step": 43000 }, { "epoch": 1.4886709035645205, "grad_norm": 0.4318270981311798, "learning_rate": 0.0002664829545454546, "loss": 1.8684, "step": 43100 }, { "epoch": 1.4921248963802154, "grad_norm": 0.34946203231811523, "learning_rate": 0.0002659147727272727, "loss": 1.8813, "step": 43200 }, { "epoch": 1.4955788891959105, "grad_norm": 0.33623960614204407, "learning_rate": 0.00026534659090909094, "loss": 1.8566, "step": 43300 }, { "epoch": 1.4990328820116054, "grad_norm": 0.3431924283504486, "learning_rate": 0.0002647784090909091, "loss": 1.8555, "step": 43400 }, { "epoch": 1.5024868748273004, "grad_norm": 0.3669569492340088, "learning_rate": 0.00026421022727272724, "loss": 1.8656, "step": 43500 }, { "epoch": 1.5059408676429953, "grad_norm": 0.3411414623260498, "learning_rate": 0.0002636420454545455, "loss": 1.8533, "step": 43600 }, { "epoch": 1.5093948604586902, "grad_norm": 0.348023384809494, "learning_rate": 0.00026307386363636366, "loss": 1.8583, "step": 43700 }, { "epoch": 1.5128488532743853, "grad_norm": 0.3822565972805023, "learning_rate": 0.0002625056818181818, "loss": 1.8669, "step": 43800 }, { "epoch": 1.5163028460900803, "grad_norm": 0.34821194410324097, "learning_rate": 0.0002619375, "loss": 1.8513, "step": 43900 }, { "epoch": 1.5197568389057752, "grad_norm": 0.35662829875946045, "learning_rate": 0.0002613693181818182, "loss": 1.8699, "step": 44000 }, { "epoch": 1.5197568389057752, "eval_loss": 2.2242226600646973, "eval_runtime": 933.1522, "eval_samples_per_second": 163.293, "eval_steps_per_second": 1.633, "step": 44000 }, { "epoch": 1.52321083172147, "grad_norm": 0.34279394149780273, "learning_rate": 0.0002608011363636364, "loss": 1.8583, "step": 44100 }, { "epoch": 1.526664824537165, "grad_norm": 0.35233989357948303, "learning_rate": 0.00026023295454545456, "loss": 1.8434, "step": 44200 }, { "epoch": 1.53011881735286, "grad_norm": 0.34149396419525146, "learning_rate": 0.00025966477272727274, "loss": 1.8593, "step": 44300 }, { "epoch": 1.5335728101685548, "grad_norm": 0.35298213362693787, "learning_rate": 0.0002590965909090909, "loss": 1.8439, "step": 44400 }, { "epoch": 1.5370268029842498, "grad_norm": 0.3766247630119324, "learning_rate": 0.0002585284090909091, "loss": 1.8645, "step": 44500 }, { "epoch": 1.5404807957999447, "grad_norm": 0.3492392301559448, "learning_rate": 0.0002579602272727273, "loss": 1.8551, "step": 44600 }, { "epoch": 1.5439347886156396, "grad_norm": 0.324101060628891, "learning_rate": 0.00025739204545454546, "loss": 1.8657, "step": 44700 }, { "epoch": 1.5473887814313345, "grad_norm": 0.3346399664878845, "learning_rate": 0.00025682386363636364, "loss": 1.8483, "step": 44800 }, { "epoch": 1.5508427742470294, "grad_norm": 0.35447120666503906, "learning_rate": 0.0002562556818181818, "loss": 1.8424, "step": 44900 }, { "epoch": 1.5542967670627243, "grad_norm": 0.3583132326602936, "learning_rate": 0.0002556875, "loss": 1.8619, "step": 45000 }, { "epoch": 1.5542967670627243, "eval_loss": 2.2166972160339355, "eval_runtime": 933.428, "eval_samples_per_second": 163.245, "eval_steps_per_second": 1.633, "step": 45000 }, { "epoch": 1.5577507598784195, "grad_norm": 0.34049317240715027, "learning_rate": 0.0002551193181818182, "loss": 1.8577, "step": 45100 }, { "epoch": 1.5612047526941144, "grad_norm": 0.3376822769641876, "learning_rate": 0.00025455113636363636, "loss": 1.8448, "step": 45200 }, { "epoch": 1.5646587455098093, "grad_norm": 0.3559693396091461, "learning_rate": 0.00025398295454545454, "loss": 1.8366, "step": 45300 }, { "epoch": 1.5681127383255042, "grad_norm": 0.34435904026031494, "learning_rate": 0.0002534147727272728, "loss": 1.8485, "step": 45400 }, { "epoch": 1.5715667311411994, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002528465909090909, "loss": 1.8516, "step": 45500 }, { "epoch": 1.5750207239568943, "grad_norm": 0.34272322058677673, "learning_rate": 0.0002522784090909091, "loss": 1.8296, "step": 45600 }, { "epoch": 1.5784747167725892, "grad_norm": 0.36497625708580017, "learning_rate": 0.0002517102272727273, "loss": 1.8255, "step": 45700 }, { "epoch": 1.5819287095882841, "grad_norm": 0.31943902373313904, "learning_rate": 0.00025114204545454544, "loss": 1.8657, "step": 45800 }, { "epoch": 1.585382702403979, "grad_norm": 0.3567992150783539, "learning_rate": 0.0002505738636363636, "loss": 1.8727, "step": 45900 }, { "epoch": 1.588836695219674, "grad_norm": 0.3523275554180145, "learning_rate": 0.00025000568181818186, "loss": 1.8348, "step": 46000 }, { "epoch": 1.588836695219674, "eval_loss": 2.211845874786377, "eval_runtime": 932.6736, "eval_samples_per_second": 163.377, "eval_steps_per_second": 1.634, "step": 46000 }, { "epoch": 1.5922906880353689, "grad_norm": 0.3533009886741638, "learning_rate": 0.0002494375, "loss": 1.8324, "step": 46100 }, { "epoch": 1.5957446808510638, "grad_norm": 0.35436585545539856, "learning_rate": 0.00024886931818181817, "loss": 1.8329, "step": 46200 }, { "epoch": 1.5991986736667587, "grad_norm": 0.35463017225265503, "learning_rate": 0.0002483011363636364, "loss": 1.848, "step": 46300 }, { "epoch": 1.6026526664824536, "grad_norm": 0.33948197960853577, "learning_rate": 0.0002477329545454546, "loss": 1.8416, "step": 46400 }, { "epoch": 1.6061066592981486, "grad_norm": 0.3487997353076935, "learning_rate": 0.0002471647727272727, "loss": 1.8331, "step": 46500 }, { "epoch": 1.6095606521138435, "grad_norm": 0.3553692698478699, "learning_rate": 0.00024659659090909094, "loss": 1.8443, "step": 46600 }, { "epoch": 1.6130146449295384, "grad_norm": 0.3699355721473694, "learning_rate": 0.0002460284090909091, "loss": 1.8396, "step": 46700 }, { "epoch": 1.6164686377452335, "grad_norm": 0.33341851830482483, "learning_rate": 0.00024546022727272725, "loss": 1.8266, "step": 46800 }, { "epoch": 1.6199226305609284, "grad_norm": 0.3703523874282837, "learning_rate": 0.0002448920454545455, "loss": 1.8356, "step": 46900 }, { "epoch": 1.6233766233766234, "grad_norm": 0.34331998229026794, "learning_rate": 0.00024432386363636366, "loss": 1.8506, "step": 47000 }, { "epoch": 1.6233766233766234, "eval_loss": 2.201261281967163, "eval_runtime": 932.8465, "eval_samples_per_second": 163.346, "eval_steps_per_second": 1.634, "step": 47000 }, { "epoch": 1.6268306161923183, "grad_norm": 0.3524048924446106, "learning_rate": 0.00024375568181818184, "loss": 1.8276, "step": 47100 }, { "epoch": 1.6302846090080134, "grad_norm": 0.6397112607955933, "learning_rate": 0.0002431875, "loss": 1.8358, "step": 47200 }, { "epoch": 1.6337386018237083, "grad_norm": 0.3624354600906372, "learning_rate": 0.00024261931818181818, "loss": 1.819, "step": 47300 }, { "epoch": 1.6371925946394033, "grad_norm": 0.3678456246852875, "learning_rate": 0.00024205113636363638, "loss": 1.8151, "step": 47400 }, { "epoch": 1.6406465874550982, "grad_norm": 0.38248035311698914, "learning_rate": 0.00024148295454545454, "loss": 1.8303, "step": 47500 }, { "epoch": 1.644100580270793, "grad_norm": 0.36703070998191833, "learning_rate": 0.00024091477272727272, "loss": 1.8375, "step": 47600 }, { "epoch": 1.647554573086488, "grad_norm": 0.34606924653053284, "learning_rate": 0.00024034659090909092, "loss": 1.8261, "step": 47700 }, { "epoch": 1.651008565902183, "grad_norm": 0.35459455847740173, "learning_rate": 0.00023977840909090908, "loss": 1.8541, "step": 47800 }, { "epoch": 1.6544625587178778, "grad_norm": 0.35106080770492554, "learning_rate": 0.00023921022727272728, "loss": 1.8434, "step": 47900 }, { "epoch": 1.6579165515335728, "grad_norm": 0.3380804657936096, "learning_rate": 0.00023864204545454547, "loss": 1.8323, "step": 48000 }, { "epoch": 1.6579165515335728, "eval_loss": 2.1915159225463867, "eval_runtime": 932.8372, "eval_samples_per_second": 163.348, "eval_steps_per_second": 1.634, "step": 48000 }, { "epoch": 1.6613705443492677, "grad_norm": 0.36180025339126587, "learning_rate": 0.00023807386363636362, "loss": 1.8347, "step": 48100 }, { "epoch": 1.6648245371649626, "grad_norm": 0.33836793899536133, "learning_rate": 0.00023750568181818183, "loss": 1.8169, "step": 48200 }, { "epoch": 1.6682785299806575, "grad_norm": 0.34874165058135986, "learning_rate": 0.0002369375, "loss": 1.8206, "step": 48300 }, { "epoch": 1.6717325227963524, "grad_norm": 0.3255716562271118, "learning_rate": 0.0002363693181818182, "loss": 1.8319, "step": 48400 }, { "epoch": 1.6751865156120476, "grad_norm": 0.3886810839176178, "learning_rate": 0.00023580113636363637, "loss": 1.8208, "step": 48500 }, { "epoch": 1.6786405084277425, "grad_norm": 0.38673707842826843, "learning_rate": 0.00023523295454545455, "loss": 1.8294, "step": 48600 }, { "epoch": 1.6820945012434374, "grad_norm": 0.3884912431240082, "learning_rate": 0.00023466477272727273, "loss": 1.8137, "step": 48700 }, { "epoch": 1.6855484940591323, "grad_norm": 0.35155996680259705, "learning_rate": 0.0002340965909090909, "loss": 1.8135, "step": 48800 }, { "epoch": 1.6890024868748275, "grad_norm": 0.34583061933517456, "learning_rate": 0.0002335284090909091, "loss": 1.8125, "step": 48900 }, { "epoch": 1.6924564796905224, "grad_norm": 0.3412420451641083, "learning_rate": 0.00023296022727272727, "loss": 1.8238, "step": 49000 }, { "epoch": 1.6924564796905224, "eval_loss": 2.1860053539276123, "eval_runtime": 932.5574, "eval_samples_per_second": 163.397, "eval_steps_per_second": 1.634, "step": 49000 }, { "epoch": 1.6959104725062173, "grad_norm": 0.36108842492103577, "learning_rate": 0.00023239204545454545, "loss": 1.8195, "step": 49100 }, { "epoch": 1.6993644653219122, "grad_norm": 0.3617706000804901, "learning_rate": 0.00023182386363636366, "loss": 1.8032, "step": 49200 }, { "epoch": 1.7028184581376071, "grad_norm": 0.36145681142807007, "learning_rate": 0.00023125568181818184, "loss": 1.8441, "step": 49300 }, { "epoch": 1.706272450953302, "grad_norm": 0.3923262059688568, "learning_rate": 0.0002306875, "loss": 1.8136, "step": 49400 }, { "epoch": 1.709726443768997, "grad_norm": 0.3287799656391144, "learning_rate": 0.0002301193181818182, "loss": 1.8211, "step": 49500 }, { "epoch": 1.7131804365846919, "grad_norm": 0.35752880573272705, "learning_rate": 0.00022955113636363638, "loss": 1.8108, "step": 49600 }, { "epoch": 1.7166344294003868, "grad_norm": 0.3737923204898834, "learning_rate": 0.00022898295454545456, "loss": 1.8033, "step": 49700 }, { "epoch": 1.7200884222160817, "grad_norm": 0.374796599149704, "learning_rate": 0.00022841477272727274, "loss": 1.8097, "step": 49800 }, { "epoch": 1.7235424150317766, "grad_norm": 0.386203408241272, "learning_rate": 0.00022784659090909092, "loss": 1.811, "step": 49900 }, { "epoch": 1.7269964078474715, "grad_norm": 0.3648054003715515, "learning_rate": 0.0002272784090909091, "loss": 1.8061, "step": 50000 }, { "epoch": 1.7269964078474715, "eval_loss": 2.1760547161102295, "eval_runtime": 932.5357, "eval_samples_per_second": 163.401, "eval_steps_per_second": 1.634, "step": 50000 } ], "logging_steps": 100, "max_steps": 90000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.64606111435273e+18, "train_batch_size": 100, "trial_name": null, "trial_params": null }