| { |
| "best_global_step": 352, |
| "best_metric": 1.5587613582611084, |
| "best_model_checkpoint": "./my_model/checkpoint-352", |
| "epoch": 64.0, |
| "eval_steps": 500, |
| "global_step": 1024, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.611812502145767, |
| "epoch": 0.06451612903225806, |
| "grad_norm": 2.948519468307495, |
| "learning_rate": 0.0, |
| "loss": 5.2626, |
| "mean_token_accuracy": 0.2785332165658474, |
| "num_tokens": 1354.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 2.4764878584278955, |
| "epoch": 0.6451612903225806, |
| "grad_norm": 2.852177858352661, |
| "learning_rate": 5.625e-07, |
| "loss": 5.0479, |
| "mean_token_accuracy": 0.30352623243298793, |
| "num_tokens": 13915.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 2.549605812345232, |
| "eval_loss": 5.082220554351807, |
| "eval_mean_token_accuracy": 0.29234256382499424, |
| "eval_num_tokens": 21527.0, |
| "eval_runtime": 0.8908, |
| "eval_samples_per_second": 61.741, |
| "eval_steps_per_second": 15.716, |
| "step": 16 |
| }, |
| { |
| "entropy": 2.4721337352928363, |
| "epoch": 1.2580645161290323, |
| "grad_norm": 2.8321847915649414, |
| "learning_rate": 1.1875e-06, |
| "loss": 5.033, |
| "mean_token_accuracy": 0.30711135052536664, |
| "num_tokens": 27177.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 2.514200139045715, |
| "epoch": 1.903225806451613, |
| "grad_norm": 3.025498151779175, |
| "learning_rate": 1.8125e-06, |
| "loss": 5.0773, |
| "mean_token_accuracy": 0.2981846956536174, |
| "num_tokens": 40986.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_entropy": 2.5709889445986067, |
| "eval_loss": 5.047023773193359, |
| "eval_mean_token_accuracy": 0.2919592655130795, |
| "eval_num_tokens": 43054.0, |
| "eval_runtime": 0.8868, |
| "eval_samples_per_second": 62.018, |
| "eval_steps_per_second": 15.786, |
| "step": 32 |
| }, |
| { |
| "entropy": 2.541774250959095, |
| "epoch": 2.5161290322580645, |
| "grad_norm": 3.4038965702056885, |
| "learning_rate": 2.4375000000000004e-06, |
| "loss": 5.0438, |
| "mean_token_accuracy": 0.2972054022707437, |
| "num_tokens": 54010.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_entropy": 2.5724382059914723, |
| "eval_loss": 4.9238505363464355, |
| "eval_mean_token_accuracy": 0.2935441700475557, |
| "eval_num_tokens": 64581.0, |
| "eval_runtime": 0.997, |
| "eval_samples_per_second": 55.165, |
| "eval_steps_per_second": 14.042, |
| "step": 48 |
| }, |
| { |
| "entropy": 2.5254996732661597, |
| "epoch": 3.129032258064516, |
| "grad_norm": 4.6959710121154785, |
| "learning_rate": 3.0625e-06, |
| "loss": 4.8754, |
| "mean_token_accuracy": 0.3122231556396735, |
| "num_tokens": 67318.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 2.532456985116005, |
| "epoch": 3.774193548387097, |
| "grad_norm": 1.8910281658172607, |
| "learning_rate": 3.6875e-06, |
| "loss": 4.8683, |
| "mean_token_accuracy": 0.31472160052508114, |
| "num_tokens": 81243.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_entropy": 2.6716178315026418, |
| "eval_loss": 4.745687484741211, |
| "eval_mean_token_accuracy": 0.3186415561607906, |
| "eval_num_tokens": 86108.0, |
| "eval_runtime": 0.8831, |
| "eval_samples_per_second": 62.282, |
| "eval_steps_per_second": 15.854, |
| "step": 64 |
| }, |
| { |
| "entropy": 2.6316533935697457, |
| "epoch": 4.387096774193548, |
| "grad_norm": 1.985713243484497, |
| "learning_rate": 4.3125e-06, |
| "loss": 4.7158, |
| "mean_token_accuracy": 0.32339329899925934, |
| "num_tokens": 94546.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 2.6654595481721977, |
| "epoch": 5.0, |
| "grad_norm": 1.8635355234146118, |
| "learning_rate": 4.937500000000001e-06, |
| "loss": 4.4521, |
| "mean_token_accuracy": 0.3342058395868854, |
| "num_tokens": 107635.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_entropy": 2.7309968301228116, |
| "eval_loss": 4.356206893920898, |
| "eval_mean_token_accuracy": 0.3469075581857136, |
| "eval_num_tokens": 107635.0, |
| "eval_runtime": 1.4048, |
| "eval_samples_per_second": 39.153, |
| "eval_steps_per_second": 9.966, |
| "step": 80 |
| }, |
| { |
| "entropy": 2.6752349376678466, |
| "epoch": 5.645161290322581, |
| "grad_norm": 1.7081493139266968, |
| "learning_rate": 5.5625000000000005e-06, |
| "loss": 4.1996, |
| "mean_token_accuracy": 0.3722452763468027, |
| "num_tokens": 121563.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_entropy": 2.802973576954433, |
| "eval_loss": 3.9146478176116943, |
| "eval_mean_token_accuracy": 0.38693774597985403, |
| "eval_num_tokens": 129162.0, |
| "eval_runtime": 0.9423, |
| "eval_samples_per_second": 58.371, |
| "eval_steps_per_second": 14.858, |
| "step": 96 |
| }, |
| { |
| "entropy": 2.770364958988993, |
| "epoch": 6.258064516129032, |
| "grad_norm": 1.6673827171325684, |
| "learning_rate": 6.1875000000000005e-06, |
| "loss": 3.9392, |
| "mean_token_accuracy": 0.387496774133883, |
| "num_tokens": 134741.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 2.7224088311195374, |
| "epoch": 6.903225806451613, |
| "grad_norm": 1.527048110961914, |
| "learning_rate": 6.8125e-06, |
| "loss": 3.62, |
| "mean_token_accuracy": 0.45071578361094, |
| "num_tokens": 148587.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_entropy": 2.6817347833088467, |
| "eval_loss": 3.47971248626709, |
| "eval_mean_token_accuracy": 0.487923339009285, |
| "eval_num_tokens": 150689.0, |
| "eval_runtime": 0.9499, |
| "eval_samples_per_second": 57.9, |
| "eval_steps_per_second": 14.738, |
| "step": 112 |
| }, |
| { |
| "entropy": 2.5822339403001884, |
| "epoch": 7.516129032258064, |
| "grad_norm": 1.3976553678512573, |
| "learning_rate": 7.4375e-06, |
| "loss": 3.3133, |
| "mean_token_accuracy": 0.5055951774120331, |
| "num_tokens": 161826.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_entropy": 2.662975311279297, |
| "eval_loss": 3.109734296798706, |
| "eval_mean_token_accuracy": 0.5159200259617397, |
| "eval_num_tokens": 172216.0, |
| "eval_runtime": 0.9281, |
| "eval_samples_per_second": 59.26, |
| "eval_steps_per_second": 15.084, |
| "step": 128 |
| }, |
| { |
| "entropy": 2.563361422011727, |
| "epoch": 8.129032258064516, |
| "grad_norm": 1.7696324586868286, |
| "learning_rate": 8.062500000000001e-06, |
| "loss": 3.1155, |
| "mean_token_accuracy": 0.5187091541133428, |
| "num_tokens": 175017.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 2.588775309920311, |
| "epoch": 8.774193548387096, |
| "grad_norm": 2.261418342590332, |
| "learning_rate": 8.6875e-06, |
| "loss": 2.8235, |
| "mean_token_accuracy": 0.5475961033254861, |
| "num_tokens": 188936.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_entropy": 2.5937297514506747, |
| "eval_loss": 2.6096889972686768, |
| "eval_mean_token_accuracy": 0.5660783967801503, |
| "eval_num_tokens": 193743.0, |
| "eval_runtime": 0.9053, |
| "eval_samples_per_second": 60.757, |
| "eval_steps_per_second": 15.465, |
| "step": 144 |
| }, |
| { |
| "entropy": 2.5060042989881417, |
| "epoch": 9.387096774193548, |
| "grad_norm": 1.5702177286148071, |
| "learning_rate": 9.312500000000001e-06, |
| "loss": 2.5439, |
| "mean_token_accuracy": 0.5805619108049493, |
| "num_tokens": 202001.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 2.201874099279705, |
| "epoch": 10.0, |
| "grad_norm": 1.8405438661575317, |
| "learning_rate": 9.937500000000001e-06, |
| "loss": 2.3102, |
| "mean_token_accuracy": 0.6006814374735481, |
| "num_tokens": 215270.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_entropy": 2.1728043726512363, |
| "eval_loss": 2.3522286415100098, |
| "eval_mean_token_accuracy": 0.5939501779420036, |
| "eval_num_tokens": 215270.0, |
| "eval_runtime": 0.9322, |
| "eval_samples_per_second": 58.999, |
| "eval_steps_per_second": 15.018, |
| "step": 160 |
| }, |
| { |
| "entropy": 2.1090281650424005, |
| "epoch": 10.64516129032258, |
| "grad_norm": 1.3122938871383667, |
| "learning_rate": 1.0562500000000001e-05, |
| "loss": 2.2236, |
| "mean_token_accuracy": 0.605573232471943, |
| "num_tokens": 229135.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_entropy": 2.126331014292581, |
| "eval_loss": 2.2020554542541504, |
| "eval_mean_token_accuracy": 0.6053547646318164, |
| "eval_num_tokens": 236797.0, |
| "eval_runtime": 0.9054, |
| "eval_samples_per_second": 60.745, |
| "eval_steps_per_second": 15.462, |
| "step": 176 |
| }, |
| { |
| "entropy": 2.053142737401159, |
| "epoch": 11.258064516129032, |
| "grad_norm": 1.5178934335708618, |
| "learning_rate": 1.1187500000000001e-05, |
| "loss": 2.0915, |
| "mean_token_accuracy": 0.61878864937707, |
| "num_tokens": 242377.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 2.013620141148567, |
| "epoch": 11.903225806451612, |
| "grad_norm": 1.4182125329971313, |
| "learning_rate": 1.1812499999999999e-05, |
| "loss": 2.0478, |
| "mean_token_accuracy": 0.6269605554640293, |
| "num_tokens": 256298.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_entropy": 2.0988540819713046, |
| "eval_loss": 2.093829393386841, |
| "eval_mean_token_accuracy": 0.6164226361683437, |
| "eval_num_tokens": 258324.0, |
| "eval_runtime": 0.9948, |
| "eval_samples_per_second": 55.289, |
| "eval_steps_per_second": 14.074, |
| "step": 192 |
| }, |
| { |
| "entropy": 1.9682148710677498, |
| "epoch": 12.516129032258064, |
| "grad_norm": 1.3009895086288452, |
| "learning_rate": 1.24375e-05, |
| "loss": 1.9508, |
| "mean_token_accuracy": 0.6344620632497888, |
| "num_tokens": 269514.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_entropy": 2.022765415055411, |
| "eval_loss": 1.9986889362335205, |
| "eval_mean_token_accuracy": 0.6248509841305869, |
| "eval_num_tokens": 279851.0, |
| "eval_runtime": 1.3989, |
| "eval_samples_per_second": 39.316, |
| "eval_steps_per_second": 10.008, |
| "step": 208 |
| }, |
| { |
| "entropy": 1.9475462483732324, |
| "epoch": 13.129032258064516, |
| "grad_norm": 1.2914632558822632, |
| "learning_rate": 1.3062499999999999e-05, |
| "loss": 1.9446, |
| "mean_token_accuracy": 0.6389191699655432, |
| "num_tokens": 282652.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.9222171217203141, |
| "epoch": 13.774193548387096, |
| "grad_norm": 1.631095290184021, |
| "learning_rate": 1.36875e-05, |
| "loss": 1.8472, |
| "mean_token_accuracy": 0.6466637052595615, |
| "num_tokens": 296522.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_entropy": 1.9563691530908858, |
| "eval_loss": 1.9220991134643555, |
| "eval_mean_token_accuracy": 0.6309446564742497, |
| "eval_num_tokens": 301378.0, |
| "eval_runtime": 1.0668, |
| "eval_samples_per_second": 51.555, |
| "eval_steps_per_second": 13.123, |
| "step": 224 |
| }, |
| { |
| "entropy": 1.870294423479783, |
| "epoch": 14.387096774193548, |
| "grad_norm": 1.3715276718139648, |
| "learning_rate": 1.43125e-05, |
| "loss": 1.7688, |
| "mean_token_accuracy": 0.6547230929136276, |
| "num_tokens": 309834.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.8683158645504399, |
| "epoch": 15.0, |
| "grad_norm": 1.9479233026504517, |
| "learning_rate": 1.4937500000000002e-05, |
| "loss": 1.7425, |
| "mean_token_accuracy": 0.6565716470542707, |
| "num_tokens": 322905.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_entropy": 1.9176117862973894, |
| "eval_loss": 1.8483814001083374, |
| "eval_mean_token_accuracy": 0.6409520549433572, |
| "eval_num_tokens": 322905.0, |
| "eval_runtime": 0.9202, |
| "eval_samples_per_second": 59.77, |
| "eval_steps_per_second": 15.214, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.8178010761737824, |
| "epoch": 15.64516129032258, |
| "grad_norm": 1.6151540279388428, |
| "learning_rate": 1.5562500000000002e-05, |
| "loss": 1.6598, |
| "mean_token_accuracy": 0.6694157928228378, |
| "num_tokens": 336780.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_entropy": 1.8427454914365495, |
| "eval_loss": 1.7848949432373047, |
| "eval_mean_token_accuracy": 0.6483822464942932, |
| "eval_num_tokens": 344432.0, |
| "eval_runtime": 0.8965, |
| "eval_samples_per_second": 61.351, |
| "eval_steps_per_second": 15.617, |
| "step": 256 |
| }, |
| { |
| "entropy": 1.7828364309511686, |
| "epoch": 16.258064516129032, |
| "grad_norm": 1.4990966320037842, |
| "learning_rate": 1.61875e-05, |
| "loss": 1.5925, |
| "mean_token_accuracy": 0.6848364127309698, |
| "num_tokens": 349950.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.7749755203723907, |
| "epoch": 16.903225806451612, |
| "grad_norm": 1.6659446954727173, |
| "learning_rate": 1.6812500000000002e-05, |
| "loss": 1.5674, |
| "mean_token_accuracy": 0.7018352136015892, |
| "num_tokens": 363887.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_entropy": 1.7996527978352137, |
| "eval_loss": 1.7261488437652588, |
| "eval_mean_token_accuracy": 0.6748419829777309, |
| "eval_num_tokens": 365959.0, |
| "eval_runtime": 0.956, |
| "eval_samples_per_second": 57.533, |
| "eval_steps_per_second": 14.645, |
| "step": 272 |
| }, |
| { |
| "entropy": 1.7126218849106838, |
| "epoch": 17.516129032258064, |
| "grad_norm": 1.7774029970169067, |
| "learning_rate": 1.74375e-05, |
| "loss": 1.4737, |
| "mean_token_accuracy": 0.7113534455236635, |
| "num_tokens": 377051.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_entropy": 1.7618773494447981, |
| "eval_loss": 1.6715681552886963, |
| "eval_mean_token_accuracy": 0.6734440326690674, |
| "eval_num_tokens": 387486.0, |
| "eval_runtime": 1.3598, |
| "eval_samples_per_second": 40.446, |
| "eval_steps_per_second": 10.295, |
| "step": 288 |
| }, |
| { |
| "entropy": 1.684084642874567, |
| "epoch": 18.129032258064516, |
| "grad_norm": 1.8488755226135254, |
| "learning_rate": 1.8062500000000002e-05, |
| "loss": 1.4353, |
| "mean_token_accuracy": 0.717556736186931, |
| "num_tokens": 390288.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.6123981848359108, |
| "epoch": 18.774193548387096, |
| "grad_norm": 1.94077467918396, |
| "learning_rate": 1.8687500000000004e-05, |
| "loss": 1.347, |
| "mean_token_accuracy": 0.7245998069643974, |
| "num_tokens": 404246.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_entropy": 1.6447282092911857, |
| "eval_loss": 1.6219120025634766, |
| "eval_mean_token_accuracy": 0.6779105194977352, |
| "eval_num_tokens": 409013.0, |
| "eval_runtime": 0.9117, |
| "eval_samples_per_second": 60.326, |
| "eval_steps_per_second": 15.356, |
| "step": 304 |
| }, |
| { |
| "entropy": 1.5541185391576666, |
| "epoch": 19.387096774193548, |
| "grad_norm": 2.0378854274749756, |
| "learning_rate": 1.93125e-05, |
| "loss": 1.2785, |
| "mean_token_accuracy": 0.7347154123218436, |
| "num_tokens": 417317.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.403356123911707, |
| "epoch": 20.0, |
| "grad_norm": 2.347236394882202, |
| "learning_rate": 1.99375e-05, |
| "loss": 1.219, |
| "mean_token_accuracy": 0.7410024349626742, |
| "num_tokens": 430540.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_entropy": 1.4916774034500122, |
| "eval_loss": 1.587111473083496, |
| "eval_mean_token_accuracy": 0.6795690613133567, |
| "eval_num_tokens": 430540.0, |
| "eval_runtime": 0.9048, |
| "eval_samples_per_second": 60.786, |
| "eval_steps_per_second": 15.473, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.338417048752308, |
| "epoch": 20.64516129032258, |
| "grad_norm": 2.188633441925049, |
| "learning_rate": 2.0562500000000002e-05, |
| "loss": 1.1363, |
| "mean_token_accuracy": 0.7564118377864361, |
| "num_tokens": 444500.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 21.0, |
| "eval_entropy": 1.3439774853842599, |
| "eval_loss": 1.5705065727233887, |
| "eval_mean_token_accuracy": 0.6734923720359802, |
| "eval_num_tokens": 452067.0, |
| "eval_runtime": 1.3705, |
| "eval_samples_per_second": 40.131, |
| "eval_steps_per_second": 10.215, |
| "step": 336 |
| }, |
| { |
| "entropy": 1.2597325475592362, |
| "epoch": 21.258064516129032, |
| "grad_norm": 2.0218992233276367, |
| "learning_rate": 2.1187500000000003e-05, |
| "loss": 1.0859, |
| "mean_token_accuracy": 0.7610587077705484, |
| "num_tokens": 457711.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.1984408333897592, |
| "epoch": 21.903225806451612, |
| "grad_norm": 2.203165292739868, |
| "learning_rate": 2.18125e-05, |
| "loss": 1.0145, |
| "mean_token_accuracy": 0.7772165350615978, |
| "num_tokens": 471531.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 22.0, |
| "eval_entropy": 1.2664960197040014, |
| "eval_loss": 1.5587613582611084, |
| "eval_mean_token_accuracy": 0.6787797468049186, |
| "eval_num_tokens": 473594.0, |
| "eval_runtime": 0.9351, |
| "eval_samples_per_second": 58.82, |
| "eval_steps_per_second": 14.972, |
| "step": 352 |
| }, |
| { |
| "entropy": 1.1226187560119127, |
| "epoch": 22.516129032258064, |
| "grad_norm": 2.4278323650360107, |
| "learning_rate": 2.24375e-05, |
| "loss": 0.9255, |
| "mean_token_accuracy": 0.7932525535947398, |
| "num_tokens": 484678.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 23.0, |
| "eval_entropy": 1.2237448862620763, |
| "eval_loss": 1.579514741897583, |
| "eval_mean_token_accuracy": 0.6716454710279193, |
| "eval_num_tokens": 495121.0, |
| "eval_runtime": 1.3763, |
| "eval_samples_per_second": 39.963, |
| "eval_steps_per_second": 10.172, |
| "step": 368 |
| }, |
| { |
| "entropy": 1.0918044400842566, |
| "epoch": 23.129032258064516, |
| "grad_norm": 2.5345664024353027, |
| "learning_rate": 2.30625e-05, |
| "loss": 0.8737, |
| "mean_token_accuracy": 0.7978831976652145, |
| "num_tokens": 497974.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.006336173415184, |
| "epoch": 23.774193548387096, |
| "grad_norm": 2.779085874557495, |
| "learning_rate": 2.36875e-05, |
| "loss": 0.7646, |
| "mean_token_accuracy": 0.8216063916683197, |
| "num_tokens": 511856.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 24.0, |
| "eval_entropy": 1.1918106589998518, |
| "eval_loss": 1.5942119359970093, |
| "eval_mean_token_accuracy": 0.6677229617323194, |
| "eval_num_tokens": 516648.0, |
| "eval_runtime": 0.9097, |
| "eval_samples_per_second": 60.46, |
| "eval_steps_per_second": 15.39, |
| "step": 384 |
| }, |
| { |
| "entropy": 1.0034341443526118, |
| "epoch": 24.387096774193548, |
| "grad_norm": 2.9064760208129883, |
| "learning_rate": 2.43125e-05, |
| "loss": 0.7059, |
| "mean_token_accuracy": 0.8363859167224482, |
| "num_tokens": 524969.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.9153783635089272, |
| "epoch": 25.0, |
| "grad_norm": 3.5367863178253174, |
| "learning_rate": 2.4937500000000003e-05, |
| "loss": 0.6268, |
| "mean_token_accuracy": 0.8444738458645972, |
| "num_tokens": 538175.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_entropy": 1.1169007931436812, |
| "eval_loss": 1.666494369506836, |
| "eval_mean_token_accuracy": 0.6655990694250379, |
| "eval_num_tokens": 538175.0, |
| "eval_runtime": 0.9258, |
| "eval_samples_per_second": 59.408, |
| "eval_steps_per_second": 15.122, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8221864104270935, |
| "epoch": 25.64516129032258, |
| "grad_norm": 2.6190507411956787, |
| "learning_rate": 2.55625e-05, |
| "loss": 0.5171, |
| "mean_token_accuracy": 0.8675303012132645, |
| "num_tokens": 552002.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 26.0, |
| "eval_entropy": 1.0611292464392525, |
| "eval_loss": 1.6921919584274292, |
| "eval_mean_token_accuracy": 0.65951726266316, |
| "eval_num_tokens": 559702.0, |
| "eval_runtime": 1.0329, |
| "eval_samples_per_second": 53.25, |
| "eval_steps_per_second": 13.555, |
| "step": 416 |
| }, |
| { |
| "entropy": 0.8153277470877296, |
| "epoch": 26.258064516129032, |
| "grad_norm": 2.4259774684906006, |
| "learning_rate": 2.6187500000000003e-05, |
| "loss": 0.4676, |
| "mean_token_accuracy": 0.8747850750621996, |
| "num_tokens": 565255.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.7013958178460598, |
| "epoch": 26.903225806451612, |
| "grad_norm": 2.4626195430755615, |
| "learning_rate": 2.68125e-05, |
| "loss": 0.3867, |
| "mean_token_accuracy": 0.8833703070878982, |
| "num_tokens": 579163.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 27.0, |
| "eval_entropy": 0.9333125693457467, |
| "eval_loss": 1.7267862558364868, |
| "eval_mean_token_accuracy": 0.658582159451076, |
| "eval_num_tokens": 581229.0, |
| "eval_runtime": 0.9233, |
| "eval_samples_per_second": 59.571, |
| "eval_steps_per_second": 15.163, |
| "step": 432 |
| }, |
| { |
| "entropy": 0.5856798651971316, |
| "epoch": 27.516129032258064, |
| "grad_norm": 2.773681402206421, |
| "learning_rate": 2.74375e-05, |
| "loss": 0.3408, |
| "mean_token_accuracy": 0.8944279308381834, |
| "num_tokens": 592320.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 28.0, |
| "eval_entropy": 0.831269736800875, |
| "eval_loss": 1.8276340961456299, |
| "eval_mean_token_accuracy": 0.6552059480122158, |
| "eval_num_tokens": 602756.0, |
| "eval_runtime": 0.9803, |
| "eval_samples_per_second": 56.107, |
| "eval_steps_per_second": 14.282, |
| "step": 448 |
| }, |
| { |
| "entropy": 0.5249183703409998, |
| "epoch": 28.129032258064516, |
| "grad_norm": 1.7010736465454102, |
| "learning_rate": 2.80625e-05, |
| "loss": 0.3165, |
| "mean_token_accuracy": 0.9029051528165215, |
| "num_tokens": 605474.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.4536327484995127, |
| "epoch": 28.774193548387096, |
| "grad_norm": 2.132962226867676, |
| "learning_rate": 2.86875e-05, |
| "loss": 0.2865, |
| "mean_token_accuracy": 0.9036249771714211, |
| "num_tokens": 619457.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 29.0, |
| "eval_entropy": 0.7956289521285466, |
| "eval_loss": 1.9158949851989746, |
| "eval_mean_token_accuracy": 0.6526114770344326, |
| "eval_num_tokens": 624283.0, |
| "eval_runtime": 0.9267, |
| "eval_samples_per_second": 59.35, |
| "eval_steps_per_second": 15.107, |
| "step": 464 |
| }, |
| { |
| "entropy": 0.4277557734596102, |
| "epoch": 29.387096774193548, |
| "grad_norm": 1.9701188802719116, |
| "learning_rate": 2.9312500000000004e-05, |
| "loss": 0.2736, |
| "mean_token_accuracy": 0.9080228907497305, |
| "num_tokens": 632664.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.3978642586030458, |
| "epoch": 30.0, |
| "grad_norm": 2.9814867973327637, |
| "learning_rate": 2.9937500000000003e-05, |
| "loss": 0.2796, |
| "mean_token_accuracy": 0.9062309014169794, |
| "num_tokens": 645810.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 30.0, |
| "eval_entropy": 0.7981852037566048, |
| "eval_loss": 2.003068447113037, |
| "eval_mean_token_accuracy": 0.6490011853831155, |
| "eval_num_tokens": 645810.0, |
| "eval_runtime": 0.9368, |
| "eval_samples_per_second": 58.708, |
| "eval_steps_per_second": 14.944, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.37885100245475767, |
| "epoch": 30.64516129032258, |
| "grad_norm": 2.112239360809326, |
| "learning_rate": 3.05625e-05, |
| "loss": 0.2518, |
| "mean_token_accuracy": 0.9144969284534454, |
| "num_tokens": 659784.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 31.0, |
| "eval_entropy": 0.7598817561353955, |
| "eval_loss": 2.0737624168395996, |
| "eval_mean_token_accuracy": 0.6507022082805634, |
| "eval_num_tokens": 667337.0, |
| "eval_runtime": 0.8916, |
| "eval_samples_per_second": 61.689, |
| "eval_steps_per_second": 15.703, |
| "step": 496 |
| }, |
| { |
| "entropy": 0.3670440645594346, |
| "epoch": 31.258064516129032, |
| "grad_norm": 1.9339967966079712, |
| "learning_rate": 3.1187500000000006e-05, |
| "loss": 0.2544, |
| "mean_token_accuracy": 0.9085111869008917, |
| "num_tokens": 672944.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.34540521949529646, |
| "epoch": 31.903225806451612, |
| "grad_norm": 2.0099146366119385, |
| "learning_rate": 3.18125e-05, |
| "loss": 0.2468, |
| "mean_token_accuracy": 0.9127625226974487, |
| "num_tokens": 686848.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 32.0, |
| "eval_entropy": 0.6959893958909171, |
| "eval_loss": 2.104912519454956, |
| "eval_mean_token_accuracy": 0.6566033831664494, |
| "eval_num_tokens": 688864.0, |
| "eval_runtime": 1.2103, |
| "eval_samples_per_second": 45.445, |
| "eval_steps_per_second": 11.568, |
| "step": 512 |
| }, |
| { |
| "entropy": 0.3452399529908833, |
| "epoch": 32.516129032258064, |
| "grad_norm": 1.8694605827331543, |
| "learning_rate": 3.24375e-05, |
| "loss": 0.2461, |
| "mean_token_accuracy": 0.9124428755358646, |
| "num_tokens": 699838.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 33.0, |
| "eval_entropy": 0.6757666979517255, |
| "eval_loss": 2.1449453830718994, |
| "eval_mean_token_accuracy": 0.6545567682811192, |
| "eval_num_tokens": 710391.0, |
| "eval_runtime": 0.912, |
| "eval_samples_per_second": 60.306, |
| "eval_steps_per_second": 15.351, |
| "step": 528 |
| }, |
| { |
| "entropy": 0.3346626260562947, |
| "epoch": 33.12903225806452, |
| "grad_norm": 1.2991915941238403, |
| "learning_rate": 3.3062500000000004e-05, |
| "loss": 0.2393, |
| "mean_token_accuracy": 0.9142316736673054, |
| "num_tokens": 713076.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.3058926550671458, |
| "epoch": 33.774193548387096, |
| "grad_norm": 2.250917911529541, |
| "learning_rate": 3.36875e-05, |
| "loss": 0.2366, |
| "mean_token_accuracy": 0.9110181450843811, |
| "num_tokens": 726913.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 34.0, |
| "eval_entropy": 0.713920282466071, |
| "eval_loss": 2.044567823410034, |
| "eval_mean_token_accuracy": 0.6525738835334778, |
| "eval_num_tokens": 731918.0, |
| "eval_runtime": 0.8817, |
| "eval_samples_per_second": 62.379, |
| "eval_steps_per_second": 15.878, |
| "step": 544 |
| }, |
| { |
| "entropy": 0.3304567150771618, |
| "epoch": 34.38709677419355, |
| "grad_norm": 1.6248284578323364, |
| "learning_rate": 3.43125e-05, |
| "loss": 0.234, |
| "mean_token_accuracy": 0.9141417553550318, |
| "num_tokens": 740119.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.2964809501641675, |
| "epoch": 35.0, |
| "grad_norm": 2.199978828430176, |
| "learning_rate": 3.49375e-05, |
| "loss": 0.2354, |
| "mean_token_accuracy": 0.9153264400206114, |
| "num_tokens": 753445.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 35.0, |
| "eval_entropy": 0.6718730543340955, |
| "eval_loss": 2.131523847579956, |
| "eval_mean_token_accuracy": 0.6514393900121961, |
| "eval_num_tokens": 753445.0, |
| "eval_runtime": 0.919, |
| "eval_samples_per_second": 59.849, |
| "eval_steps_per_second": 15.234, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.29936634581536054, |
| "epoch": 35.645161290322584, |
| "grad_norm": 1.9858863353729248, |
| "learning_rate": 3.5562500000000004e-05, |
| "loss": 0.2233, |
| "mean_token_accuracy": 0.9170688688755035, |
| "num_tokens": 767352.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 36.0, |
| "eval_entropy": 0.6986751215798515, |
| "eval_loss": 2.072589874267578, |
| "eval_mean_token_accuracy": 0.655285677739552, |
| "eval_num_tokens": 774972.0, |
| "eval_runtime": 0.9265, |
| "eval_samples_per_second": 59.365, |
| "eval_steps_per_second": 15.111, |
| "step": 576 |
| }, |
| { |
| "entropy": 0.30756315883052976, |
| "epoch": 36.25806451612903, |
| "grad_norm": 1.2706995010375977, |
| "learning_rate": 3.61875e-05, |
| "loss": 0.2278, |
| "mean_token_accuracy": 0.9169254663743471, |
| "num_tokens": 780722.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.2909585501998663, |
| "epoch": 36.903225806451616, |
| "grad_norm": 2.095874786376953, |
| "learning_rate": 3.68125e-05, |
| "loss": 0.2266, |
| "mean_token_accuracy": 0.9121941901743412, |
| "num_tokens": 794511.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 37.0, |
| "eval_entropy": 0.636478283575603, |
| "eval_loss": 2.166316270828247, |
| "eval_mean_token_accuracy": 0.6568594745227269, |
| "eval_num_tokens": 796499.0, |
| "eval_runtime": 0.9013, |
| "eval_samples_per_second": 61.02, |
| "eval_steps_per_second": 15.532, |
| "step": 592 |
| }, |
| { |
| "entropy": 0.28608485017167895, |
| "epoch": 37.516129032258064, |
| "grad_norm": 2.4893622398376465, |
| "learning_rate": 3.74375e-05, |
| "loss": 0.218, |
| "mean_token_accuracy": 0.9165164412636506, |
| "num_tokens": 807681.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 38.0, |
| "eval_entropy": 0.6342436969280243, |
| "eval_loss": 2.156416177749634, |
| "eval_mean_token_accuracy": 0.6568004020622799, |
| "eval_num_tokens": 818026.0, |
| "eval_runtime": 0.9354, |
| "eval_samples_per_second": 58.798, |
| "eval_steps_per_second": 14.967, |
| "step": 608 |
| }, |
| { |
| "entropy": 0.2943071307320344, |
| "epoch": 38.12903225806452, |
| "grad_norm": 1.960349202156067, |
| "learning_rate": 3.8062500000000004e-05, |
| "loss": 0.2245, |
| "mean_token_accuracy": 0.9143017130462747, |
| "num_tokens": 820826.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.26704031582921745, |
| "epoch": 38.774193548387096, |
| "grad_norm": 1.1493830680847168, |
| "learning_rate": 3.8687500000000005e-05, |
| "loss": 0.2165, |
| "mean_token_accuracy": 0.9143977962434292, |
| "num_tokens": 834709.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 39.0, |
| "eval_entropy": 0.6424140781164169, |
| "eval_loss": 2.2105581760406494, |
| "eval_mean_token_accuracy": 0.6563994671617236, |
| "eval_num_tokens": 839553.0, |
| "eval_runtime": 0.9226, |
| "eval_samples_per_second": 59.612, |
| "eval_steps_per_second": 15.174, |
| "step": 624 |
| }, |
| { |
| "entropy": 0.27404083448805305, |
| "epoch": 39.38709677419355, |
| "grad_norm": 1.7598483562469482, |
| "learning_rate": 3.93125e-05, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9152095623706517, |
| "num_tokens": 847976.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.2730099213750739, |
| "epoch": 40.0, |
| "grad_norm": 1.9577555656433105, |
| "learning_rate": 3.99375e-05, |
| "loss": 0.2216, |
| "mean_token_accuracy": 0.9135262981841439, |
| "num_tokens": 861080.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 40.0, |
| "eval_entropy": 0.623557556952749, |
| "eval_loss": 2.177314043045044, |
| "eval_mean_token_accuracy": 0.6597372846943992, |
| "eval_num_tokens": 861080.0, |
| "eval_runtime": 0.9132, |
| "eval_samples_per_second": 60.227, |
| "eval_steps_per_second": 15.33, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.26116420738399027, |
| "epoch": 40.645161290322584, |
| "grad_norm": 1.4962230920791626, |
| "learning_rate": 4.0562500000000003e-05, |
| "loss": 0.2104, |
| "mean_token_accuracy": 0.9174227572977542, |
| "num_tokens": 874945.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 41.0, |
| "eval_entropy": 0.6280922591686249, |
| "eval_loss": 2.182685136795044, |
| "eval_mean_token_accuracy": 0.6488148740359715, |
| "eval_num_tokens": 882607.0, |
| "eval_runtime": 0.9382, |
| "eval_samples_per_second": 58.625, |
| "eval_steps_per_second": 14.923, |
| "step": 656 |
| }, |
| { |
| "entropy": 0.27049236548574346, |
| "epoch": 41.25806451612903, |
| "grad_norm": 1.5585705041885376, |
| "learning_rate": 4.11875e-05, |
| "loss": 0.2172, |
| "mean_token_accuracy": 0.9116643212343517, |
| "num_tokens": 888188.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.25881535150110724, |
| "epoch": 41.903225806451616, |
| "grad_norm": 1.7957926988601685, |
| "learning_rate": 4.181250000000001e-05, |
| "loss": 0.2171, |
| "mean_token_accuracy": 0.9126927703619003, |
| "num_tokens": 902034.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 42.0, |
| "eval_entropy": 0.6354757377079555, |
| "eval_loss": 2.1949057579040527, |
| "eval_mean_token_accuracy": 0.651188816343035, |
| "eval_num_tokens": 904134.0, |
| "eval_runtime": 0.8942, |
| "eval_samples_per_second": 61.51, |
| "eval_steps_per_second": 15.657, |
| "step": 672 |
| }, |
| { |
| "entropy": 0.2616837781510855, |
| "epoch": 42.516129032258064, |
| "grad_norm": 1.9319422245025635, |
| "learning_rate": 4.24375e-05, |
| "loss": 0.2111, |
| "mean_token_accuracy": 0.9157685621788627, |
| "num_tokens": 915303.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 43.0, |
| "eval_entropy": 0.615446959223066, |
| "eval_loss": 2.2043023109436035, |
| "eval_mean_token_accuracy": 0.6558100581169128, |
| "eval_num_tokens": 925661.0, |
| "eval_runtime": 0.9252, |
| "eval_samples_per_second": 59.445, |
| "eval_steps_per_second": 15.131, |
| "step": 688 |
| }, |
| { |
| "entropy": 0.25998294510339437, |
| "epoch": 43.12903225806452, |
| "grad_norm": 2.184018850326538, |
| "learning_rate": 4.30625e-05, |
| "loss": 0.2175, |
| "mean_token_accuracy": 0.914588484324907, |
| "num_tokens": 928440.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.25247995406389234, |
| "epoch": 43.774193548387096, |
| "grad_norm": 2.9504449367523193, |
| "learning_rate": 4.3687500000000005e-05, |
| "loss": 0.216, |
| "mean_token_accuracy": 0.9170804493129253, |
| "num_tokens": 942357.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 44.0, |
| "eval_entropy": 0.6387277500970023, |
| "eval_loss": 2.154686212539673, |
| "eval_mean_token_accuracy": 0.6561333068779537, |
| "eval_num_tokens": 947188.0, |
| "eval_runtime": 0.912, |
| "eval_samples_per_second": 60.307, |
| "eval_steps_per_second": 15.351, |
| "step": 704 |
| }, |
| { |
| "entropy": 0.26556668744275447, |
| "epoch": 44.38709677419355, |
| "grad_norm": 2.3016467094421387, |
| "learning_rate": 4.43125e-05, |
| "loss": 0.2129, |
| "mean_token_accuracy": 0.9151790534195147, |
| "num_tokens": 955541.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.25153220680199173, |
| "epoch": 45.0, |
| "grad_norm": 1.5553311109542847, |
| "learning_rate": 4.49375e-05, |
| "loss": 0.2197, |
| "mean_token_accuracy": 0.912982240319252, |
| "num_tokens": 968715.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 45.0, |
| "eval_entropy": 0.6276453903743199, |
| "eval_loss": 2.186168670654297, |
| "eval_mean_token_accuracy": 0.6584120520523616, |
| "eval_num_tokens": 968715.0, |
| "eval_runtime": 0.9005, |
| "eval_samples_per_second": 61.074, |
| "eval_steps_per_second": 15.546, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.2620750930160284, |
| "epoch": 45.645161290322584, |
| "grad_norm": 2.157158136367798, |
| "learning_rate": 4.55625e-05, |
| "loss": 0.2042, |
| "mean_token_accuracy": 0.9174154184758663, |
| "num_tokens": 982594.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 46.0, |
| "eval_entropy": 0.6001809579985482, |
| "eval_loss": 2.269158124923706, |
| "eval_mean_token_accuracy": 0.6551194148404258, |
| "eval_num_tokens": 990242.0, |
| "eval_runtime": 0.9149, |
| "eval_samples_per_second": 60.114, |
| "eval_steps_per_second": 15.302, |
| "step": 736 |
| }, |
| { |
| "entropy": 0.24646662116834991, |
| "epoch": 46.25806451612903, |
| "grad_norm": 0.8333325982093811, |
| "learning_rate": 4.61875e-05, |
| "loss": 0.2186, |
| "mean_token_accuracy": 0.9161424346660313, |
| "num_tokens": 995750.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.25320138819515703, |
| "epoch": 46.903225806451616, |
| "grad_norm": 1.5136183500289917, |
| "learning_rate": 4.6812500000000006e-05, |
| "loss": 0.212, |
| "mean_token_accuracy": 0.9146950207650661, |
| "num_tokens": 1009725.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 47.0, |
| "eval_entropy": 0.6296457903725761, |
| "eval_loss": 2.1639528274536133, |
| "eval_mean_token_accuracy": 0.6583362604890551, |
| "eval_num_tokens": 1011769.0, |
| "eval_runtime": 0.8933, |
| "eval_samples_per_second": 61.57, |
| "eval_steps_per_second": 15.672, |
| "step": 752 |
| }, |
| { |
| "entropy": 0.25277749547048617, |
| "epoch": 47.516129032258064, |
| "grad_norm": 2.703397512435913, |
| "learning_rate": 4.74375e-05, |
| "loss": 0.2107, |
| "mean_token_accuracy": 0.9154670128696843, |
| "num_tokens": 1022922.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 48.0, |
| "eval_entropy": 0.6431450226477214, |
| "eval_loss": 2.113975763320923, |
| "eval_mean_token_accuracy": 0.6571915745735168, |
| "eval_num_tokens": 1033296.0, |
| "eval_runtime": 1.4606, |
| "eval_samples_per_second": 37.656, |
| "eval_steps_per_second": 9.585, |
| "step": 768 |
| }, |
| { |
| "entropy": 0.2505563164227887, |
| "epoch": 48.12903225806452, |
| "grad_norm": 1.7070248126983643, |
| "learning_rate": 4.80625e-05, |
| "loss": 0.2181, |
| "mean_token_accuracy": 0.9125990342152747, |
| "num_tokens": 1036124.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.2494984647259116, |
| "epoch": 48.774193548387096, |
| "grad_norm": 2.354995012283325, |
| "learning_rate": 4.8687500000000004e-05, |
| "loss": 0.2069, |
| "mean_token_accuracy": 0.9165523618459701, |
| "num_tokens": 1050032.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 49.0, |
| "eval_entropy": 0.603933504649571, |
| "eval_loss": 2.2200510501861572, |
| "eval_mean_token_accuracy": 0.6548148649079459, |
| "eval_num_tokens": 1054823.0, |
| "eval_runtime": 0.9123, |
| "eval_samples_per_second": 60.286, |
| "eval_steps_per_second": 15.346, |
| "step": 784 |
| }, |
| { |
| "entropy": 0.24296215019728007, |
| "epoch": 49.38709677419355, |
| "grad_norm": 1.5552977323532104, |
| "learning_rate": 4.93125e-05, |
| "loss": 0.2083, |
| "mean_token_accuracy": 0.9156087629104915, |
| "num_tokens": 1063159.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.24393935266293978, |
| "epoch": 50.0, |
| "grad_norm": 1.544976830482483, |
| "learning_rate": 4.99375e-05, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9146833968789954, |
| "num_tokens": 1076350.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_entropy": 0.6158908797161919, |
| "eval_loss": 2.2002458572387695, |
| "eval_mean_token_accuracy": 0.6551659618105207, |
| "eval_num_tokens": 1076350.0, |
| "eval_runtime": 0.9028, |
| "eval_samples_per_second": 60.923, |
| "eval_steps_per_second": 15.508, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.24434087462723256, |
| "epoch": 50.645161290322584, |
| "grad_norm": 1.5835460424423218, |
| "learning_rate": 5.05625e-05, |
| "loss": 0.2053, |
| "mean_token_accuracy": 0.9157114021480084, |
| "num_tokens": 1090170.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 51.0, |
| "eval_entropy": 0.6001614396061216, |
| "eval_loss": 2.134579658508301, |
| "eval_mean_token_accuracy": 0.6620945462158748, |
| "eval_num_tokens": 1097877.0, |
| "eval_runtime": 0.9182, |
| "eval_samples_per_second": 59.901, |
| "eval_steps_per_second": 15.248, |
| "step": 816 |
| }, |
| { |
| "entropy": 0.2386562437995484, |
| "epoch": 51.25806451612903, |
| "grad_norm": 0.8457896709442139, |
| "learning_rate": 5.11875e-05, |
| "loss": 0.2112, |
| "mean_token_accuracy": 0.9135481056414152, |
| "num_tokens": 1103447.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.23914105109870434, |
| "epoch": 51.903225806451616, |
| "grad_norm": 1.2757948637008667, |
| "learning_rate": 5.18125e-05, |
| "loss": 0.2121, |
| "mean_token_accuracy": 0.917195787280798, |
| "num_tokens": 1117426.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 52.0, |
| "eval_entropy": 0.6030709551913398, |
| "eval_loss": 2.1421921253204346, |
| "eval_mean_token_accuracy": 0.6576450892857143, |
| "eval_num_tokens": 1119404.0, |
| "eval_runtime": 0.9157, |
| "eval_samples_per_second": 60.065, |
| "eval_steps_per_second": 15.289, |
| "step": 832 |
| }, |
| { |
| "entropy": 0.23953668361431674, |
| "epoch": 52.516129032258064, |
| "grad_norm": 0.7424585223197937, |
| "learning_rate": 5.243750000000001e-05, |
| "loss": 0.206, |
| "mean_token_accuracy": 0.9173323300323988, |
| "num_tokens": 1130592.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 53.0, |
| "eval_entropy": 0.6143183495317187, |
| "eval_loss": 2.136685609817505, |
| "eval_mean_token_accuracy": 0.661886956010546, |
| "eval_num_tokens": 1140931.0, |
| "eval_runtime": 0.9082, |
| "eval_samples_per_second": 60.559, |
| "eval_steps_per_second": 15.415, |
| "step": 848 |
| }, |
| { |
| "entropy": 0.24182377598787608, |
| "epoch": 53.12903225806452, |
| "grad_norm": 1.3732322454452515, |
| "learning_rate": 5.30625e-05, |
| "loss": 0.2121, |
| "mean_token_accuracy": 0.9144565549335981, |
| "num_tokens": 1143685.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.22879959754645823, |
| "epoch": 53.774193548387096, |
| "grad_norm": 2.147244453430176, |
| "learning_rate": 5.3687500000000004e-05, |
| "loss": 0.2054, |
| "mean_token_accuracy": 0.9151782430708408, |
| "num_tokens": 1157611.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 54.0, |
| "eval_entropy": 0.6321406619889396, |
| "eval_loss": 2.204887628555298, |
| "eval_mean_token_accuracy": 0.6532182906355176, |
| "eval_num_tokens": 1162458.0, |
| "eval_runtime": 0.9208, |
| "eval_samples_per_second": 59.732, |
| "eval_steps_per_second": 15.205, |
| "step": 864 |
| }, |
| { |
| "entropy": 0.23761214551172757, |
| "epoch": 54.38709677419355, |
| "grad_norm": 1.3003839254379272, |
| "learning_rate": 5.43125e-05, |
| "loss": 0.2035, |
| "mean_token_accuracy": 0.9193885200902036, |
| "num_tokens": 1170903.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.2340365965899668, |
| "epoch": 55.0, |
| "grad_norm": 2.1319892406463623, |
| "learning_rate": 5.49375e-05, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.9123364067391345, |
| "num_tokens": 1183985.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 55.0, |
| "eval_entropy": 0.5765226589781898, |
| "eval_loss": 2.259164333343506, |
| "eval_mean_token_accuracy": 0.6540640251977103, |
| "eval_num_tokens": 1183985.0, |
| "eval_runtime": 1.3389, |
| "eval_samples_per_second": 41.077, |
| "eval_steps_per_second": 10.456, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.22173939775675536, |
| "epoch": 55.645161290322584, |
| "grad_norm": 0.9774155616760254, |
| "learning_rate": 5.556250000000001e-05, |
| "loss": 0.2028, |
| "mean_token_accuracy": 0.9162920407950879, |
| "num_tokens": 1197803.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 56.0, |
| "eval_entropy": 0.6301779236112323, |
| "eval_loss": 2.164349317550659, |
| "eval_mean_token_accuracy": 0.6591020779950278, |
| "eval_num_tokens": 1205512.0, |
| "eval_runtime": 0.8858, |
| "eval_samples_per_second": 62.094, |
| "eval_steps_per_second": 15.806, |
| "step": 896 |
| }, |
| { |
| "entropy": 0.24067558975596176, |
| "epoch": 56.25806451612903, |
| "grad_norm": 0.6777637004852295, |
| "learning_rate": 5.6187500000000004e-05, |
| "loss": 0.2047, |
| "mean_token_accuracy": 0.9156502038240433, |
| "num_tokens": 1211040.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.22286444082856177, |
| "epoch": 56.903225806451616, |
| "grad_norm": 1.1683127880096436, |
| "learning_rate": 5.68125e-05, |
| "loss": 0.2092, |
| "mean_token_accuracy": 0.9164877288043499, |
| "num_tokens": 1224951.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 57.0, |
| "eval_entropy": 0.6137891731091908, |
| "eval_loss": 2.166926622390747, |
| "eval_mean_token_accuracy": 0.655881038733891, |
| "eval_num_tokens": 1227039.0, |
| "eval_runtime": 0.9144, |
| "eval_samples_per_second": 60.149, |
| "eval_steps_per_second": 15.311, |
| "step": 912 |
| }, |
| { |
| "entropy": 0.22742715889686033, |
| "epoch": 57.516129032258064, |
| "grad_norm": 1.949874997138977, |
| "learning_rate": 5.74375e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9188078069373181, |
| "num_tokens": 1238175.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 58.0, |
| "eval_entropy": 0.6465144370283399, |
| "eval_loss": 2.1403896808624268, |
| "eval_mean_token_accuracy": 0.6535507994038718, |
| "eval_num_tokens": 1248566.0, |
| "eval_runtime": 0.9857, |
| "eval_samples_per_second": 55.798, |
| "eval_steps_per_second": 14.203, |
| "step": 928 |
| }, |
| { |
| "entropy": 0.2348696542413611, |
| "epoch": 58.12903225806452, |
| "grad_norm": 0.7200958728790283, |
| "learning_rate": 5.8062499999999995e-05, |
| "loss": 0.2062, |
| "mean_token_accuracy": 0.9138364462476027, |
| "num_tokens": 1251370.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.23166095688939095, |
| "epoch": 58.774193548387096, |
| "grad_norm": 2.3401575088500977, |
| "learning_rate": 5.8687500000000003e-05, |
| "loss": 0.2022, |
| "mean_token_accuracy": 0.9133141487836838, |
| "num_tokens": 1265252.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 59.0, |
| "eval_entropy": 0.5926203238112586, |
| "eval_loss": 2.1573755741119385, |
| "eval_mean_token_accuracy": 0.6642243266105652, |
| "eval_num_tokens": 1270093.0, |
| "eval_runtime": 1.3512, |
| "eval_samples_per_second": 40.706, |
| "eval_steps_per_second": 10.362, |
| "step": 944 |
| }, |
| { |
| "entropy": 0.228854532304563, |
| "epoch": 59.38709677419355, |
| "grad_norm": 1.2065060138702393, |
| "learning_rate": 5.9312500000000005e-05, |
| "loss": 0.204, |
| "mean_token_accuracy": 0.915436535289413, |
| "num_tokens": 1278422.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.2283171534930405, |
| "epoch": 60.0, |
| "grad_norm": 1.6328575611114502, |
| "learning_rate": 5.99375e-05, |
| "loss": 0.2106, |
| "mean_token_accuracy": 0.9148200319001549, |
| "num_tokens": 1291620.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 60.0, |
| "eval_entropy": 0.6066372990608215, |
| "eval_loss": 2.2142343521118164, |
| "eval_mean_token_accuracy": 0.663521830524717, |
| "eval_num_tokens": 1291620.0, |
| "eval_runtime": 0.918, |
| "eval_samples_per_second": 59.916, |
| "eval_steps_per_second": 15.251, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.22587883714586496, |
| "epoch": 60.645161290322584, |
| "grad_norm": 1.1729897260665894, |
| "learning_rate": 6.05625e-05, |
| "loss": 0.2004, |
| "mean_token_accuracy": 0.918365728110075, |
| "num_tokens": 1305660.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 61.0, |
| "eval_entropy": 0.6117608717509678, |
| "eval_loss": 2.096813678741455, |
| "eval_mean_token_accuracy": 0.6604258716106415, |
| "eval_num_tokens": 1313147.0, |
| "eval_runtime": 0.9077, |
| "eval_samples_per_second": 60.594, |
| "eval_steps_per_second": 15.424, |
| "step": 976 |
| }, |
| { |
| "entropy": 0.23531277536561615, |
| "epoch": 61.25806451612903, |
| "grad_norm": 0.6755152344703674, |
| "learning_rate": 6.11875e-05, |
| "loss": 0.2082, |
| "mean_token_accuracy": 0.9143172777012775, |
| "num_tokens": 1318734.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.2311376605182886, |
| "epoch": 61.903225806451616, |
| "grad_norm": 1.2641390562057495, |
| "learning_rate": 6.18125e-05, |
| "loss": 0.2046, |
| "mean_token_accuracy": 0.9152424365282059, |
| "num_tokens": 1332705.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 62.0, |
| "eval_entropy": 0.614239479814257, |
| "eval_loss": 2.1853811740875244, |
| "eval_mean_token_accuracy": 0.6579888761043549, |
| "eval_num_tokens": 1334674.0, |
| "eval_runtime": 0.912, |
| "eval_samples_per_second": 60.308, |
| "eval_steps_per_second": 15.351, |
| "step": 992 |
| }, |
| { |
| "entropy": 0.22623609006404877, |
| "epoch": 62.516129032258064, |
| "grad_norm": 0.73882657289505, |
| "learning_rate": 6.24375e-05, |
| "loss": 0.196, |
| "mean_token_accuracy": 0.9183140248060226, |
| "num_tokens": 1345954.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 63.0, |
| "eval_entropy": 0.5942062480109078, |
| "eval_loss": 2.2710092067718506, |
| "eval_mean_token_accuracy": 0.6534528051103864, |
| "eval_num_tokens": 1356201.0, |
| "eval_runtime": 0.9816, |
| "eval_samples_per_second": 56.034, |
| "eval_steps_per_second": 14.263, |
| "step": 1008 |
| }, |
| { |
| "entropy": 0.23058188216466652, |
| "epoch": 63.12903225806452, |
| "grad_norm": 1.617550253868103, |
| "learning_rate": 6.306250000000001e-05, |
| "loss": 0.2061, |
| "mean_token_accuracy": 0.9149901968868155, |
| "num_tokens": 1358961.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.2263046816922724, |
| "epoch": 63.774193548387096, |
| "grad_norm": 1.7583829164505005, |
| "learning_rate": 6.36875e-05, |
| "loss": 0.1997, |
| "mean_token_accuracy": 0.9183584488928318, |
| "num_tokens": 1372891.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 64.0, |
| "eval_entropy": 0.5932143756321498, |
| "eval_loss": 2.1655139923095703, |
| "eval_mean_token_accuracy": 0.659838148525783, |
| "eval_num_tokens": 1377728.0, |
| "eval_runtime": 0.8899, |
| "eval_samples_per_second": 61.807, |
| "eval_steps_per_second": 15.733, |
| "step": 1024 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 16000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1000, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.765547267067904e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|