| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 250, |
| "global_step": 530, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.825, |
| "epoch": 0.01890359168241966, |
| "grad_norm": 157.0, |
| "learning_rate": 5.925925925925926e-06, |
| "loss": 2.9876, |
| "mean_token_accuracy": 0.6893173575401306, |
| "num_input_tokens_seen": 115216, |
| "num_tokens": 114489.0, |
| "step": 5, |
| "train_runtime": 4.4379, |
| "train_tokens_per_second": 25961.602 |
| }, |
| { |
| "entropy": 0.8890625, |
| "epoch": 0.03780718336483932, |
| "grad_norm": 24.5, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.6662, |
| "mean_token_accuracy": 0.8326915562152862, |
| "num_input_tokens_seen": 230592, |
| "num_tokens": 229109.0, |
| "step": 10, |
| "train_runtime": 7.7641, |
| "train_tokens_per_second": 29699.859 |
| }, |
| { |
| "entropy": 1.02890625, |
| "epoch": 0.05671077504725898, |
| "grad_norm": 18.625, |
| "learning_rate": 2.074074074074074e-05, |
| "loss": 0.6027, |
| "mean_token_accuracy": 0.8529165983200073, |
| "num_input_tokens_seen": 345600, |
| "num_tokens": 343545.0, |
| "step": 15, |
| "train_runtime": 13.2817, |
| "train_tokens_per_second": 26020.836 |
| }, |
| { |
| "entropy": 1.1421875, |
| "epoch": 0.07561436672967864, |
| "grad_norm": 19.125, |
| "learning_rate": 2.814814814814815e-05, |
| "loss": 0.4447, |
| "mean_token_accuracy": 0.881816154718399, |
| "num_input_tokens_seen": 461282, |
| "num_tokens": 458335.0, |
| "step": 20, |
| "train_runtime": 16.8766, |
| "train_tokens_per_second": 27332.56 |
| }, |
| { |
| "entropy": 1.21640625, |
| "epoch": 0.0945179584120983, |
| "grad_norm": 21.75, |
| "learning_rate": 3.555555555555555e-05, |
| "loss": 0.3989, |
| "mean_token_accuracy": 0.8929793298244476, |
| "num_input_tokens_seen": 576346, |
| "num_tokens": 572824.0, |
| "step": 25, |
| "train_runtime": 22.5256, |
| "train_tokens_per_second": 25586.277 |
| }, |
| { |
| "entropy": 1.26484375, |
| "epoch": 0.11342155009451796, |
| "grad_norm": 13.625, |
| "learning_rate": 3.999843966403289e-05, |
| "loss": 0.4872, |
| "mean_token_accuracy": 0.8781549751758575, |
| "num_input_tokens_seen": 691188, |
| "num_tokens": 687152.0, |
| "step": 30, |
| "train_runtime": 26.0077, |
| "train_tokens_per_second": 26576.234 |
| }, |
| { |
| "entropy": 1.28515625, |
| "epoch": 0.1323251417769376, |
| "grad_norm": 17.375, |
| "learning_rate": 3.99808886803243e-05, |
| "loss": 0.28, |
| "mean_token_accuracy": 0.9074305832386017, |
| "num_input_tokens_seen": 806740, |
| "num_tokens": 801973.0, |
| "step": 35, |
| "train_runtime": 29.7225, |
| "train_tokens_per_second": 27142.403 |
| }, |
| { |
| "entropy": 1.2984375, |
| "epoch": 0.15122873345935728, |
| "grad_norm": 12.625, |
| "learning_rate": 3.994385346473689e-05, |
| "loss": 0.356, |
| "mean_token_accuracy": 0.9146295249462127, |
| "num_input_tokens_seen": 921796, |
| "num_tokens": 916426.0, |
| "step": 40, |
| "train_runtime": 34.6767, |
| "train_tokens_per_second": 26582.553 |
| }, |
| { |
| "entropy": 1.56953125, |
| "epoch": 0.17013232514177692, |
| "grad_norm": 11.4375, |
| "learning_rate": 3.9887370131917e-05, |
| "loss": 0.3933, |
| "mean_token_accuracy": 0.9064954161643982, |
| "num_input_tokens_seen": 1036824, |
| "num_tokens": 1030824.0, |
| "step": 45, |
| "train_runtime": 38.0075, |
| "train_tokens_per_second": 27279.472 |
| }, |
| { |
| "entropy": 1.7515625, |
| "epoch": 0.1890359168241966, |
| "grad_norm": 15.1875, |
| "learning_rate": 3.981149376121427e-05, |
| "loss": 0.2873, |
| "mean_token_accuracy": 0.9260397672653198, |
| "num_input_tokens_seen": 1152356, |
| "num_tokens": 1145500.0, |
| "step": 50, |
| "train_runtime": 43.1792, |
| "train_tokens_per_second": 26687.759 |
| }, |
| { |
| "entropy": 1.75859375, |
| "epoch": 0.20793950850661624, |
| "grad_norm": 8.5625, |
| "learning_rate": 3.97162983429714e-05, |
| "loss": 0.3322, |
| "mean_token_accuracy": 0.9256749033927918, |
| "num_input_tokens_seen": 1267634, |
| "num_tokens": 1260057.0, |
| "step": 55, |
| "train_runtime": 46.6166, |
| "train_tokens_per_second": 27192.755 |
| }, |
| { |
| "entropy": 1.734375, |
| "epoch": 0.22684310018903592, |
| "grad_norm": 11.6875, |
| "learning_rate": 3.960187670637294e-05, |
| "loss": 0.2865, |
| "mean_token_accuracy": 0.9282522916793823, |
| "num_input_tokens_seen": 1383494, |
| "num_tokens": 1374973.0, |
| "step": 60, |
| "train_runtime": 50.5007, |
| "train_tokens_per_second": 27395.522 |
| }, |
| { |
| "entropy": 1.70859375, |
| "epoch": 0.24574669187145556, |
| "grad_norm": 9.75, |
| "learning_rate": 3.946834042892355e-05, |
| "loss": 0.2277, |
| "mean_token_accuracy": 0.9320353448390961, |
| "num_input_tokens_seen": 1499052, |
| "num_tokens": 1489683.0, |
| "step": 65, |
| "train_runtime": 55.2907, |
| "train_tokens_per_second": 27112.206 |
| }, |
| { |
| "entropy": 1.6578125, |
| "epoch": 0.2646502835538752, |
| "grad_norm": 10.125, |
| "learning_rate": 3.931581972764386e-05, |
| "loss": 0.2733, |
| "mean_token_accuracy": 0.9363594233989716, |
| "num_input_tokens_seen": 1614146, |
| "num_tokens": 1604106.0, |
| "step": 70, |
| "train_runtime": 58.5263, |
| "train_tokens_per_second": 27579.832 |
| }, |
| { |
| "entropy": 1.57109375, |
| "epoch": 0.2835538752362949, |
| "grad_norm": 6.34375, |
| "learning_rate": 3.91444633320903e-05, |
| "loss": 0.2164, |
| "mean_token_accuracy": 0.9349239528179168, |
| "num_input_tokens_seen": 1729362, |
| "num_tokens": 1718632.0, |
| "step": 75, |
| "train_runtime": 63.5177, |
| "train_tokens_per_second": 27226.454 |
| }, |
| { |
| "entropy": 1.57734375, |
| "epoch": 0.30245746691871456, |
| "grad_norm": 10.875, |
| "learning_rate": 3.8954438339322366e-05, |
| "loss": 0.2173, |
| "mean_token_accuracy": 0.9350460767745972, |
| "num_input_tokens_seen": 1844444, |
| "num_tokens": 1833068.0, |
| "step": 80, |
| "train_runtime": 66.8194, |
| "train_tokens_per_second": 27603.402 |
| }, |
| { |
| "entropy": 1.62421875, |
| "epoch": 0.32136105860113423, |
| "grad_norm": 11.0, |
| "learning_rate": 3.874593005095909e-05, |
| "loss": 0.2337, |
| "mean_token_accuracy": 0.929820317029953, |
| "num_input_tokens_seen": 1959682, |
| "num_tokens": 1947640.0, |
| "step": 85, |
| "train_runtime": 70.4744, |
| "train_tokens_per_second": 27806.99 |
| }, |
| { |
| "entropy": 1.71171875, |
| "epoch": 0.34026465028355385, |
| "grad_norm": 8.625, |
| "learning_rate": 3.851914179248333e-05, |
| "loss": 0.2156, |
| "mean_token_accuracy": 0.9308744966983795, |
| "num_input_tokens_seen": 2075138, |
| "num_tokens": 2062310.0, |
| "step": 90, |
| "train_runtime": 75.9345, |
| "train_tokens_per_second": 27327.991 |
| }, |
| { |
| "entropy": 1.86875, |
| "epoch": 0.3591682419659735, |
| "grad_norm": 13.5625, |
| "learning_rate": 3.82742947149703e-05, |
| "loss": 0.2718, |
| "mean_token_accuracy": 0.9264281988143921, |
| "num_input_tokens_seen": 2190160, |
| "num_tokens": 2176716.0, |
| "step": 95, |
| "train_runtime": 79.4416, |
| "train_tokens_per_second": 27569.42 |
| }, |
| { |
| "entropy": 1.94765625, |
| "epoch": 0.3780718336483932, |
| "grad_norm": 6.125, |
| "learning_rate": 3.801162757943359e-05, |
| "loss": 0.3385, |
| "mean_token_accuracy": 0.9164456725120544, |
| "num_input_tokens_seen": 2305250, |
| "num_tokens": 2291230.0, |
| "step": 100, |
| "train_runtime": 84.7105, |
| "train_tokens_per_second": 27213.265 |
| }, |
| { |
| "entropy": 1.846875, |
| "epoch": 0.39697542533081287, |
| "grad_norm": 15.1875, |
| "learning_rate": 3.773139652399884e-05, |
| "loss": 0.1811, |
| "mean_token_accuracy": 0.944804173707962, |
| "num_input_tokens_seen": 2420666, |
| "num_tokens": 2405904.0, |
| "step": 105, |
| "train_runtime": 88.7231, |
| "train_tokens_per_second": 27283.383 |
| }, |
| { |
| "entropy": 1.84765625, |
| "epoch": 0.4158790170132325, |
| "grad_norm": 6.3125, |
| "learning_rate": 3.743387481413243e-05, |
| "loss": 0.1974, |
| "mean_token_accuracy": 0.9379207909107208, |
| "num_input_tokens_seen": 2535606, |
| "num_tokens": 2520235.0, |
| "step": 110, |
| "train_runtime": 93.0343, |
| "train_tokens_per_second": 27254.523 |
| }, |
| { |
| "entropy": 1.83984375, |
| "epoch": 0.43478260869565216, |
| "grad_norm": 2.875, |
| "learning_rate": 3.711935257616842e-05, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9594786465167999, |
| "num_input_tokens_seen": 2650514, |
| "num_tokens": 2634592.0, |
| "step": 115, |
| "train_runtime": 98.3865, |
| "train_tokens_per_second": 26939.815 |
| }, |
| { |
| "entropy": 1.78984375, |
| "epoch": 0.45368620037807184, |
| "grad_norm": 3.6875, |
| "learning_rate": 3.678813651439376e-05, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.9459972441196441, |
| "num_input_tokens_seen": 2766004, |
| "num_tokens": 2749299.0, |
| "step": 120, |
| "train_runtime": 102.0946, |
| "train_tokens_per_second": 27092.567 |
| }, |
| { |
| "entropy": 1.78828125, |
| "epoch": 0.4725897920604915, |
| "grad_norm": 9.1875, |
| "learning_rate": 3.6440549611967656e-05, |
| "loss": 0.2075, |
| "mean_token_accuracy": 0.940614128112793, |
| "num_input_tokens_seen": 2880990, |
| "num_tokens": 2863713.0, |
| "step": 125, |
| "train_runtime": 107.8735, |
| "train_tokens_per_second": 26707.121 |
| }, |
| { |
| "entropy": 1.85234375, |
| "epoch": 0.4914933837429111, |
| "grad_norm": 7.15625, |
| "learning_rate": 3.6076930815966654e-05, |
| "loss": 0.236, |
| "mean_token_accuracy": 0.9343804061412812, |
| "num_input_tokens_seen": 2995844, |
| "num_tokens": 2978032.0, |
| "step": 130, |
| "train_runtime": 111.3362, |
| "train_tokens_per_second": 26908.095 |
| }, |
| { |
| "entropy": 1.95859375, |
| "epoch": 0.5103969754253308, |
| "grad_norm": 7.375, |
| "learning_rate": 3.569763470686262e-05, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9484993875026703, |
| "num_input_tokens_seen": 3111092, |
| "num_tokens": 3092605.0, |
| "step": 135, |
| "train_runtime": 115.8418, |
| "train_tokens_per_second": 26856.393 |
| }, |
| { |
| "entropy": 1.99921875, |
| "epoch": 0.5293005671077504, |
| "grad_norm": 7.125, |
| "learning_rate": 3.530303115275597e-05, |
| "loss": 0.1892, |
| "mean_token_accuracy": 0.9394895970821381, |
| "num_input_tokens_seen": 3226396, |
| "num_tokens": 3207190.0, |
| "step": 140, |
| "train_runtime": 120.7172, |
| "train_tokens_per_second": 26726.892 |
| }, |
| { |
| "entropy": 1.96484375, |
| "epoch": 0.5482041587901701, |
| "grad_norm": 4.5625, |
| "learning_rate": 3.4893504948701185e-05, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9600624740123749, |
| "num_input_tokens_seen": 3341802, |
| "num_tokens": 3321840.0, |
| "step": 145, |
| "train_runtime": 124.4268, |
| "train_tokens_per_second": 26857.576 |
| }, |
| { |
| "entropy": 1.90859375, |
| "epoch": 0.5671077504725898, |
| "grad_norm": 6.96875, |
| "learning_rate": 3.4469455441476475e-05, |
| "loss": 0.1334, |
| "mean_token_accuracy": 0.9625543296337128, |
| "num_input_tokens_seen": 3456964, |
| "num_tokens": 3436339.0, |
| "step": 150, |
| "train_runtime": 130.3081, |
| "train_tokens_per_second": 26529.148 |
| }, |
| { |
| "entropy": 1.92421875, |
| "epoch": 0.5860113421550095, |
| "grad_norm": 12.4375, |
| "learning_rate": 3.403129614016339e-05, |
| "loss": 0.1427, |
| "mean_token_accuracy": 0.9588114261627197, |
| "num_input_tokens_seen": 3572084, |
| "num_tokens": 3550813.0, |
| "step": 155, |
| "train_runtime": 133.8989, |
| "train_tokens_per_second": 26677.47 |
| }, |
| { |
| "entropy": 1.98671875, |
| "epoch": 0.6049149338374291, |
| "grad_norm": 7.3125, |
| "learning_rate": 3.357945431291618e-05, |
| "loss": 0.2129, |
| "mean_token_accuracy": 0.9367718935012818, |
| "num_input_tokens_seen": 3687248, |
| "num_tokens": 3665300.0, |
| "step": 160, |
| "train_runtime": 138.2948, |
| "train_tokens_per_second": 26662.235 |
| }, |
| { |
| "entropy": 2.1359375, |
| "epoch": 0.6238185255198487, |
| "grad_norm": 3.09375, |
| "learning_rate": 3.311437057031406e-05, |
| "loss": 0.2219, |
| "mean_token_accuracy": 0.9387097895145416, |
| "num_input_tokens_seen": 3802458, |
| "num_tokens": 3779809.0, |
| "step": 165, |
| "train_runtime": 142.569, |
| "train_tokens_per_second": 26671.004 |
| }, |
| { |
| "entropy": 2.0859375, |
| "epoch": 0.6427221172022685, |
| "grad_norm": 4.53125, |
| "learning_rate": 3.263649843570271e-05, |
| "loss": 0.1355, |
| "mean_token_accuracy": 0.9585716307163239, |
| "num_input_tokens_seen": 3917580, |
| "num_tokens": 3894322.0, |
| "step": 170, |
| "train_runtime": 145.9767, |
| "train_tokens_per_second": 26837.021 |
| }, |
| { |
| "entropy": 1.946875, |
| "epoch": 0.6616257088846881, |
| "grad_norm": 6.53125, |
| "learning_rate": 3.214630390294396e-05, |
| "loss": 0.2962, |
| "mean_token_accuracy": 0.9372412860393524, |
| "num_input_tokens_seen": 4032748, |
| "num_tokens": 4008844.0, |
| "step": 175, |
| "train_runtime": 151.6027, |
| "train_tokens_per_second": 26600.765 |
| }, |
| { |
| "entropy": 1.98671875, |
| "epoch": 0.6805293005671077, |
| "grad_norm": 6.96875, |
| "learning_rate": 3.1644264982005e-05, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9490657150745392, |
| "num_input_tokens_seen": 4148142, |
| "num_tokens": 4123487.0, |
| "step": 180, |
| "train_runtime": 154.9764, |
| "train_tokens_per_second": 26766.274 |
| }, |
| { |
| "entropy": 2.021875, |
| "epoch": 0.6994328922495274, |
| "grad_norm": 2.953125, |
| "learning_rate": 3.113087123283002e-05, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.964401924610138, |
| "num_input_tokens_seen": 4263312, |
| "num_tokens": 4238014.0, |
| "step": 185, |
| "train_runtime": 159.4694, |
| "train_tokens_per_second": 26734.354 |
| }, |
| { |
| "entropy": 1.96171875, |
| "epoch": 0.718336483931947, |
| "grad_norm": 3.4375, |
| "learning_rate": 3.060662328794916e-05, |
| "loss": 0.1498, |
| "mean_token_accuracy": 0.9481843888759613, |
| "num_input_tokens_seen": 4378630, |
| "num_tokens": 4352627.0, |
| "step": 190, |
| "train_runtime": 163.6223, |
| "train_tokens_per_second": 26760.595 |
| }, |
| { |
| "entropy": 1.9640625, |
| "epoch": 0.7372400756143668, |
| "grad_norm": 4.1875, |
| "learning_rate": 3.0072032364289914e-05, |
| "loss": 0.1076, |
| "mean_token_accuracy": 0.9691859900951385, |
| "num_input_tokens_seen": 4493600, |
| "num_tokens": 4467053.0, |
| "step": 195, |
| "train_runtime": 166.9247, |
| "train_tokens_per_second": 26919.915 |
| }, |
| { |
| "entropy": 2.02734375, |
| "epoch": 0.7561436672967864, |
| "grad_norm": 3.875, |
| "learning_rate": 2.9527619764667376e-05, |
| "loss": 0.2501, |
| "mean_token_accuracy": 0.9455641567707062, |
| "num_input_tokens_seen": 4609216, |
| "num_tokens": 4581812.0, |
| "step": 200, |
| "train_runtime": 172.0695, |
| "train_tokens_per_second": 26786.938 |
| }, |
| { |
| "entropy": 2.14375, |
| "epoch": 0.775047258979206, |
| "grad_norm": 5.4375, |
| "learning_rate": 2.8973916369439194e-05, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9492439985275268, |
| "num_input_tokens_seen": 4724086, |
| "num_tokens": 4696178.0, |
| "step": 205, |
| "train_runtime": 175.6473, |
| "train_tokens_per_second": 26895.294 |
| }, |
| { |
| "entropy": 2.2625, |
| "epoch": 0.7939508506616257, |
| "grad_norm": 3.75, |
| "learning_rate": 2.84114621188211e-05, |
| "loss": 0.1762, |
| "mean_token_accuracy": 0.9574925601482391, |
| "num_input_tokens_seen": 4839702, |
| "num_tokens": 4810939.0, |
| "step": 210, |
| "train_runtime": 180.4712, |
| "train_tokens_per_second": 26817.036 |
| }, |
| { |
| "entropy": 2.2953125, |
| "epoch": 0.8128544423440454, |
| "grad_norm": 3.9375, |
| "learning_rate": 2.7840805486367792e-05, |
| "loss": 0.1703, |
| "mean_token_accuracy": 0.9540181159973145, |
| "num_input_tokens_seen": 4955098, |
| "num_tokens": 4925591.0, |
| "step": 215, |
| "train_runtime": 184.4177, |
| "train_tokens_per_second": 26868.891 |
| }, |
| { |
| "entropy": 2.2828125, |
| "epoch": 0.831758034026465, |
| "grad_norm": 4.625, |
| "learning_rate": 2.7262502944132526e-05, |
| "loss": 0.0938, |
| "mean_token_accuracy": 0.9725252389907837, |
| "num_input_tokens_seen": 5070258, |
| "num_tokens": 5040089.0, |
| "step": 220, |
| "train_runtime": 188.065, |
| "train_tokens_per_second": 26960.132 |
| }, |
| { |
| "entropy": 2.1265625, |
| "epoch": 0.8506616257088847, |
| "grad_norm": 5.71875, |
| "learning_rate": 2.667711842002707e-05, |
| "loss": 0.1704, |
| "mean_token_accuracy": 0.9579161703586578, |
| "num_input_tokens_seen": 5185478, |
| "num_tokens": 5154604.0, |
| "step": 225, |
| "train_runtime": 192.8301, |
| "train_tokens_per_second": 26891.43 |
| }, |
| { |
| "entropy": 2.0484375, |
| "epoch": 0.8695652173913043, |
| "grad_norm": 4.3125, |
| "learning_rate": 2.6085222747911155e-05, |
| "loss": 0.4284, |
| "mean_token_accuracy": 0.9190201222896576, |
| "num_input_tokens_seen": 5301020, |
| "num_tokens": 5269357.0, |
| "step": 230, |
| "train_runtime": 196.1744, |
| "train_tokens_per_second": 27021.971 |
| }, |
| { |
| "entropy": 1.98671875, |
| "epoch": 0.888468809073724, |
| "grad_norm": 13.3125, |
| "learning_rate": 2.5487393110947557e-05, |
| "loss": 0.1346, |
| "mean_token_accuracy": 0.9579481542110443, |
| "num_input_tokens_seen": 5416464, |
| "num_tokens": 5384069.0, |
| "step": 235, |
| "train_runtime": 201.21, |
| "train_tokens_per_second": 26919.463 |
| }, |
| { |
| "entropy": 1.9875, |
| "epoch": 0.9073724007561437, |
| "grad_norm": 3.84375, |
| "learning_rate": 2.4884212478765747e-05, |
| "loss": 0.097, |
| "mean_token_accuracy": 0.9672803819179535, |
| "num_input_tokens_seen": 5531644, |
| "num_tokens": 5498568.0, |
| "step": 240, |
| "train_runtime": 205.075, |
| "train_tokens_per_second": 26973.766 |
| }, |
| { |
| "entropy": 2.00234375, |
| "epoch": 0.9262759924385633, |
| "grad_norm": 4.9375, |
| "learning_rate": 2.427626903898292e-05, |
| "loss": 0.2298, |
| "mean_token_accuracy": 0.9443018674850464, |
| "num_input_tokens_seen": 5646952, |
| "num_tokens": 5613157.0, |
| "step": 245, |
| "train_runtime": 208.4891, |
| "train_tokens_per_second": 27085.115 |
| }, |
| { |
| "entropy": 2.0140625, |
| "epoch": 0.945179584120983, |
| "grad_norm": 6.03125, |
| "learning_rate": 2.3664155623636715e-05, |
| "loss": 0.1732, |
| "mean_token_accuracy": 0.9442705571651459, |
| "num_input_tokens_seen": 5762366, |
| "num_tokens": 5727795.0, |
| "step": 250, |
| "train_runtime": 214.059, |
| "train_tokens_per_second": 26919.525 |
| }, |
| { |
| "entropy": 2.0125, |
| "epoch": 0.9640831758034026, |
| "grad_norm": 3.546875, |
| "learning_rate": 2.304846913108891e-05, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9664817750453949, |
| "num_input_tokens_seen": 5877646, |
| "num_tokens": 5842437.0, |
| "step": 255, |
| "train_runtime": 275.7098, |
| "train_tokens_per_second": 21318.232 |
| }, |
| { |
| "entropy": 2.0, |
| "epoch": 0.9829867674858223, |
| "grad_norm": 2.671875, |
| "learning_rate": 2.242980994396401e-05, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9795427262783051, |
| "num_input_tokens_seen": 5992710, |
| "num_tokens": 5956870.0, |
| "step": 260, |
| "train_runtime": 280.9684, |
| "train_tokens_per_second": 21328.766 |
| }, |
| { |
| "entropy": 1.9513888888888888, |
| "epoch": 1.0, |
| "grad_norm": 7.4375, |
| "learning_rate": 2.1808781343690027e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9603289763132731, |
| "num_input_tokens_seen": 6096342, |
| "num_tokens": 6059927.0, |
| "step": 265, |
| "train_runtime": 284.3725, |
| "train_tokens_per_second": 21437.877 |
| }, |
| { |
| "entropy": 1.903125, |
| "epoch": 1.0189035916824196, |
| "grad_norm": 3.453125, |
| "learning_rate": 2.118598892221257e-05, |
| "loss": 0.0783, |
| "mean_token_accuracy": 0.9817151129245758, |
| "num_input_tokens_seen": 6211574, |
| "num_tokens": 6174483.0, |
| "step": 270, |
| "train_runtime": 288.2049, |
| "train_tokens_per_second": 21552.63 |
| }, |
| { |
| "entropy": 1.84375, |
| "epoch": 1.0378071833648392, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.0562039991455877e-05, |
| "loss": 0.1214, |
| "mean_token_accuracy": 0.9741188943386078, |
| "num_input_tokens_seen": 6327000, |
| "num_tokens": 6289163.0, |
| "step": 275, |
| "train_runtime": 293.7126, |
| "train_tokens_per_second": 21541.469 |
| }, |
| { |
| "entropy": 1.8421875, |
| "epoch": 1.056710775047259, |
| "grad_norm": 3.78125, |
| "learning_rate": 1.99375429911066e-05, |
| "loss": 0.1393, |
| "mean_token_accuracy": 0.9579156279563904, |
| "num_input_tokens_seen": 6442290, |
| "num_tokens": 6403766.0, |
| "step": 280, |
| "train_runtime": 297.1668, |
| "train_tokens_per_second": 21679.038 |
| }, |
| { |
| "entropy": 1.85078125, |
| "epoch": 1.0756143667296787, |
| "grad_norm": 3.953125, |
| "learning_rate": 1.931310689529781e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9788394093513488, |
| "num_input_tokens_seen": 6557852, |
| "num_tokens": 6518469.0, |
| "step": 285, |
| "train_runtime": 301.7702, |
| "train_tokens_per_second": 21731.276 |
| }, |
| { |
| "entropy": 1.8234375, |
| "epoch": 1.0945179584120983, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.8689340618771937e-05, |
| "loss": 0.0637, |
| "mean_token_accuracy": 0.972537738084793, |
| "num_input_tokens_seen": 6673032, |
| "num_tokens": 6632963.0, |
| "step": 290, |
| "train_runtime": 306.4769, |
| "train_tokens_per_second": 21773.362 |
| }, |
| { |
| "entropy": 1.78359375, |
| "epoch": 1.113421550094518, |
| "grad_norm": 5.78125, |
| "learning_rate": 1.806685242310156e-05, |
| "loss": 0.0565, |
| "mean_token_accuracy": 0.9854797184467315, |
| "num_input_tokens_seen": 6788174, |
| "num_tokens": 6747403.0, |
| "step": 295, |
| "train_runtime": 310.3851, |
| "train_tokens_per_second": 21870.17 |
| }, |
| { |
| "entropy": 1.76015625, |
| "epoch": 1.1323251417769375, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.7446249323547117e-05, |
| "loss": 0.0973, |
| "mean_token_accuracy": 0.9734237968921662, |
| "num_input_tokens_seen": 6903146, |
| "num_tokens": 6861788.0, |
| "step": 300, |
| "train_runtime": 315.4655, |
| "train_tokens_per_second": 21882.41 |
| }, |
| { |
| "entropy": 1.75078125, |
| "epoch": 1.1512287334593574, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.6828136497130014e-05, |
| "loss": 0.0681, |
| "mean_token_accuracy": 0.9820096373558045, |
| "num_input_tokens_seen": 7018350, |
| "num_tokens": 6976277.0, |
| "step": 305, |
| "train_runtime": 319.0527, |
| "train_tokens_per_second": 21997.465 |
| }, |
| { |
| "entropy": 1.740625, |
| "epoch": 1.170132325141777, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.6213116692498206e-05, |
| "loss": 0.0625, |
| "mean_token_accuracy": 0.9826828062534332, |
| "num_input_tokens_seen": 7133636, |
| "num_tokens": 7090874.0, |
| "step": 310, |
| "train_runtime": 323.8986, |
| "train_tokens_per_second": 22024.29 |
| }, |
| { |
| "entropy": 1.7328125, |
| "epoch": 1.1890359168241966, |
| "grad_norm": 0.66015625, |
| "learning_rate": 1.560178964215987e-05, |
| "loss": 0.077, |
| "mean_token_accuracy": 0.978941410779953, |
| "num_input_tokens_seen": 7248866, |
| "num_tokens": 7205391.0, |
| "step": 315, |
| "train_runtime": 327.5895, |
| "train_tokens_per_second": 22127.897 |
| }, |
| { |
| "entropy": 1.73203125, |
| "epoch": 1.2079395085066162, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.4994751477658139e-05, |
| "loss": 0.067, |
| "mean_token_accuracy": 0.9818780541419982, |
| "num_input_tokens_seen": 7363900, |
| "num_tokens": 7319827.0, |
| "step": 320, |
| "train_runtime": 331.4598, |
| "train_tokens_per_second": 22216.571 |
| }, |
| { |
| "entropy": 1.73515625, |
| "epoch": 1.2268431001890359, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.4392594148257426e-05, |
| "loss": 0.1153, |
| "mean_token_accuracy": 0.9638942897319793, |
| "num_input_tokens_seen": 7479394, |
| "num_tokens": 7434543.0, |
| "step": 325, |
| "train_runtime": 336.2629, |
| "train_tokens_per_second": 22242.696 |
| }, |
| { |
| "entropy": 1.74609375, |
| "epoch": 1.2457466918714555, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.3795904843707959e-05, |
| "loss": 0.0359, |
| "mean_token_accuracy": 0.9886789560317993, |
| "num_input_tokens_seen": 7594632, |
| "num_tokens": 7549134.0, |
| "step": 330, |
| "train_runtime": 339.6052, |
| "train_tokens_per_second": 22363.12 |
| }, |
| { |
| "entropy": 1.740625, |
| "epoch": 1.264650283553875, |
| "grad_norm": 2.25, |
| "learning_rate": 1.3205265421651588e-05, |
| "loss": 0.0808, |
| "mean_token_accuracy": 0.9852688193321228, |
| "num_input_tokens_seen": 7709704, |
| "num_tokens": 7663583.0, |
| "step": 335, |
| "train_runtime": 344.9458, |
| "train_tokens_per_second": 22350.48 |
| }, |
| { |
| "entropy": 1.75078125, |
| "epoch": 1.283553875236295, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.2621251840227112e-05, |
| "loss": 0.0663, |
| "mean_token_accuracy": 0.9817369997501373, |
| "num_input_tokens_seen": 7824834, |
| "num_tokens": 7778064.0, |
| "step": 340, |
| "train_runtime": 348.223, |
| "train_tokens_per_second": 22470.756 |
| }, |
| { |
| "entropy": 1.75234375, |
| "epoch": 1.3024574669187146, |
| "grad_norm": 4.28125, |
| "learning_rate": 1.2044433596428537e-05, |
| "loss": 0.0678, |
| "mean_token_accuracy": 0.9812626421451569, |
| "num_input_tokens_seen": 7939832, |
| "num_tokens": 7892415.0, |
| "step": 345, |
| "train_runtime": 352.0847, |
| "train_tokens_per_second": 22550.916 |
| }, |
| { |
| "entropy": 1.746875, |
| "epoch": 1.3213610586011342, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.1475373170763819e-05, |
| "loss": 0.0465, |
| "mean_token_accuracy": 0.9823280215263367, |
| "num_input_tokens_seen": 8054988, |
| "num_tokens": 8006926.0, |
| "step": 350, |
| "train_runtime": 357.1271, |
| "train_tokens_per_second": 22554.962 |
| }, |
| { |
| "entropy": 1.74765625, |
| "epoch": 1.3402646502835538, |
| "grad_norm": 1.4921875, |
| "learning_rate": 1.0914625478755672e-05, |
| "loss": 0.1174, |
| "mean_token_accuracy": 0.9695515096187591, |
| "num_input_tokens_seen": 8170098, |
| "num_tokens": 8121373.0, |
| "step": 355, |
| "train_runtime": 360.7524, |
| "train_tokens_per_second": 22647.381 |
| }, |
| { |
| "entropy": 1.74453125, |
| "epoch": 1.3591682419659734, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.0362737329819413e-05, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9885900497436524, |
| "num_input_tokens_seen": 8285346, |
| "num_tokens": 8235981.0, |
| "step": 360, |
| "train_runtime": 366.0216, |
| "train_tokens_per_second": 22636.221 |
| }, |
| { |
| "entropy": 1.74296875, |
| "epoch": 1.3780718336483933, |
| "grad_norm": 5.15625, |
| "learning_rate": 9.820246894045316e-06, |
| "loss": 0.0428, |
| "mean_token_accuracy": 0.9822307825088501, |
| "num_input_tokens_seen": 8400240, |
| "num_tokens": 8350356.0, |
| "step": 365, |
| "train_runtime": 369.6364, |
| "train_tokens_per_second": 22725.685 |
| }, |
| { |
| "entropy": 1.73515625, |
| "epoch": 1.3969754253308129, |
| "grad_norm": 4.09375, |
| "learning_rate": 9.28768317740564e-06, |
| "loss": 0.099, |
| "mean_token_accuracy": 0.9710565328598022, |
| "num_input_tokens_seen": 8515740, |
| "num_tokens": 8465025.0, |
| "step": 370, |
| "train_runtime": 373.5701, |
| "train_tokens_per_second": 22795.56 |
| }, |
| { |
| "entropy": 1.7328125, |
| "epoch": 1.4158790170132325, |
| "grad_norm": 4.96875, |
| "learning_rate": 8.765565505897902e-06, |
| "loss": 0.0736, |
| "mean_token_accuracy": 0.9741575241088867, |
| "num_input_tokens_seen": 8631054, |
| "num_tokens": 8579648.0, |
| "step": 375, |
| "train_runtime": 378.7394, |
| "train_tokens_per_second": 22788.901 |
| }, |
| { |
| "entropy": 1.73359375, |
| "epoch": 1.434782608695652, |
| "grad_norm": 3.265625, |
| "learning_rate": 8.254403019127566e-06, |
| "loss": 0.0806, |
| "mean_token_accuracy": 0.9791056990623475, |
| "num_input_tokens_seen": 8746364, |
| "num_tokens": 8694249.0, |
| "step": 380, |
| "train_runtime": 382.0615, |
| "train_tokens_per_second": 22892.552 |
| }, |
| { |
| "entropy": 1.73515625, |
| "epoch": 1.4536862003780717, |
| "grad_norm": 3.75, |
| "learning_rate": 7.754694173823947e-06, |
| "loss": 0.0404, |
| "mean_token_accuracy": 0.9839386224746705, |
| "num_input_tokens_seen": 8861574, |
| "num_tokens": 8808789.0, |
| "step": 385, |
| "train_runtime": 387.2205, |
| "train_tokens_per_second": 22885.084 |
| }, |
| { |
| "entropy": 1.73359375, |
| "epoch": 1.4725897920604916, |
| "grad_norm": 5.09375, |
| "learning_rate": 7.266926257773346e-06, |
| "loss": 0.0926, |
| "mean_token_accuracy": 0.9714232623577118, |
| "num_input_tokens_seen": 8976944, |
| "num_tokens": 8923407.0, |
| "step": 390, |
| "train_runtime": 390.891, |
| "train_tokens_per_second": 22965.336 |
| }, |
| { |
| "entropy": 1.72265625, |
| "epoch": 1.4914933837429112, |
| "grad_norm": 5.0, |
| "learning_rate": 6.7915749146436415e-06, |
| "loss": 0.0519, |
| "mean_token_accuracy": 0.9837916433811188, |
| "num_input_tokens_seen": 9092050, |
| "num_tokens": 9037924.0, |
| "step": 395, |
| "train_runtime": 395.3397, |
| "train_tokens_per_second": 22998.071 |
| }, |
| { |
| "entropy": 1.71796875, |
| "epoch": 1.5103969754253308, |
| "grad_norm": 3.875, |
| "learning_rate": 6.329103680163495e-06, |
| "loss": 0.2115, |
| "mean_token_accuracy": 0.9516554296016693, |
| "num_input_tokens_seen": 9207594, |
| "num_tokens": 9152659.0, |
| "step": 400, |
| "train_runtime": 399.5499, |
| "train_tokens_per_second": 23044.916 |
| }, |
| { |
| "entropy": 1.71640625, |
| "epoch": 1.5293005671077504, |
| "grad_norm": 0.51953125, |
| "learning_rate": 5.879963530108506e-06, |
| "loss": 0.0348, |
| "mean_token_accuracy": 0.9919346511363983, |
| "num_input_tokens_seen": 9322572, |
| "num_tokens": 9267059.0, |
| "step": 405, |
| "train_runtime": 403.4031, |
| "train_tokens_per_second": 23109.815 |
| }, |
| { |
| "entropy": 1.7125, |
| "epoch": 1.54820415879017, |
| "grad_norm": 2.234375, |
| "learning_rate": 5.444592440535177e-06, |
| "loss": 0.0374, |
| "mean_token_accuracy": 0.9837370038032531, |
| "num_input_tokens_seen": 9438004, |
| "num_tokens": 9381725.0, |
| "step": 410, |
| "train_runtime": 407.9692, |
| "train_tokens_per_second": 23134.111 |
| }, |
| { |
| "entropy": 1.7078125, |
| "epoch": 1.5671077504725899, |
| "grad_norm": 3.09375, |
| "learning_rate": 5.023414960691469e-06, |
| "loss": 0.0325, |
| "mean_token_accuracy": 0.9918534696102143, |
| "num_input_tokens_seen": 9553156, |
| "num_tokens": 9496255.0, |
| "step": 415, |
| "train_runtime": 412.1408, |
| "train_tokens_per_second": 23179.35 |
| }, |
| { |
| "entropy": 1.703125, |
| "epoch": 1.5860113421550095, |
| "grad_norm": 4.78125, |
| "learning_rate": 4.616841799020364e-06, |
| "loss": 0.0618, |
| "mean_token_accuracy": 0.9808044970035553, |
| "num_input_tokens_seen": 9668364, |
| "num_tokens": 9610808.0, |
| "step": 420, |
| "train_runtime": 416.3235, |
| "train_tokens_per_second": 23223.203 |
| }, |
| { |
| "entropy": 1.703125, |
| "epoch": 1.6049149338374291, |
| "grad_norm": 6.1875, |
| "learning_rate": 4.225269422660258e-06, |
| "loss": 0.0493, |
| "mean_token_accuracy": 0.9843941271305084, |
| "num_input_tokens_seen": 9783552, |
| "num_tokens": 9725283.0, |
| "step": 425, |
| "train_runtime": 421.1316, |
| "train_tokens_per_second": 23231.576 |
| }, |
| { |
| "entropy": 1.70390625, |
| "epoch": 1.6238185255198487, |
| "grad_norm": 1.5078125, |
| "learning_rate": 3.8490796708326404e-06, |
| "loss": 0.0595, |
| "mean_token_accuracy": 0.9822299420833588, |
| "num_input_tokens_seen": 9898934, |
| "num_tokens": 9839878.0, |
| "step": 430, |
| "train_runtime": 424.7606, |
| "train_tokens_per_second": 23304.735 |
| }, |
| { |
| "entropy": 1.7015625, |
| "epoch": 1.6427221172022684, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.4886393824940924e-06, |
| "loss": 0.059, |
| "mean_token_accuracy": 0.9807979345321656, |
| "num_input_tokens_seen": 10014142, |
| "num_tokens": 9954403.0, |
| "step": 435, |
| "train_runtime": 429.8927, |
| "train_tokens_per_second": 23294.514 |
| }, |
| { |
| "entropy": 1.70390625, |
| "epoch": 1.6616257088846882, |
| "grad_norm": 2.09375, |
| "learning_rate": 3.144300038615691e-06, |
| "loss": 0.0574, |
| "mean_token_accuracy": 0.9839386105537414, |
| "num_input_tokens_seen": 10129264, |
| "num_tokens": 10068933.0, |
| "step": 440, |
| "train_runtime": 433.4828, |
| "train_tokens_per_second": 23367.164 |
| }, |
| { |
| "entropy": 1.6984375, |
| "epoch": 1.6805293005671076, |
| "grad_norm": 4.25, |
| "learning_rate": 2.8163974194386766e-06, |
| "loss": 0.0669, |
| "mean_token_accuracy": 0.9792383193969727, |
| "num_input_tokens_seen": 10244732, |
| "num_tokens": 10183591.0, |
| "step": 445, |
| "train_runtime": 437.9792, |
| "train_tokens_per_second": 23390.909 |
| }, |
| { |
| "entropy": 1.7015625, |
| "epoch": 1.6994328922495274, |
| "grad_norm": 3.46875, |
| "learning_rate": 2.5052512770405434e-06, |
| "loss": 0.0801, |
| "mean_token_accuracy": 0.9761136710643769, |
| "num_input_tokens_seen": 10360212, |
| "num_tokens": 10298251.0, |
| "step": 450, |
| "train_runtime": 442.481, |
| "train_tokens_per_second": 23413.915 |
| }, |
| { |
| "entropy": 1.70234375, |
| "epoch": 1.718336483931947, |
| "grad_norm": 0.59765625, |
| "learning_rate": 2.2111650235309147e-06, |
| "loss": 0.0297, |
| "mean_token_accuracy": 0.9904489517211914, |
| "num_input_tokens_seen": 10475400, |
| "num_tokens": 10412810.0, |
| "step": 455, |
| "train_runtime": 446.3738, |
| "train_tokens_per_second": 23467.773 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 1.7372400756143667, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.9344254351812287e-06, |
| "loss": 0.0989, |
| "mean_token_accuracy": 0.9743396818637848, |
| "num_input_tokens_seen": 10590710, |
| "num_tokens": 10527389.0, |
| "step": 460, |
| "train_runtime": 451.1755, |
| "train_tokens_per_second": 23473.591 |
| }, |
| { |
| "entropy": 1.703125, |
| "epoch": 1.7561436672967865, |
| "grad_norm": 0.890625, |
| "learning_rate": 1.6753023727767436e-06, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9838890075683594, |
| "num_input_tokens_seen": 10705900, |
| "num_tokens": 10641918.0, |
| "step": 465, |
| "train_runtime": 454.754, |
| "train_tokens_per_second": 23542.179 |
| }, |
| { |
| "entropy": 1.7, |
| "epoch": 1.775047258979206, |
| "grad_norm": 1.359375, |
| "learning_rate": 1.4340485184635712e-06, |
| "loss": 0.0556, |
| "mean_token_accuracy": 0.9777659058570862, |
| "num_input_tokens_seen": 10821144, |
| "num_tokens": 10756496.0, |
| "step": 470, |
| "train_runtime": 459.2027, |
| "train_tokens_per_second": 23565.072 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 1.7939508506616257, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.2108991293473627e-06, |
| "loss": 0.0595, |
| "mean_token_accuracy": 0.9741835057735443, |
| "num_input_tokens_seen": 10936460, |
| "num_tokens": 10871124.0, |
| "step": 475, |
| "train_runtime": 463.6099, |
| "train_tokens_per_second": 23589.79 |
| }, |
| { |
| "entropy": 1.69765625, |
| "epoch": 1.8128544423440454, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.0060718080838683e-06, |
| "loss": 0.0541, |
| "mean_token_accuracy": 0.9831156551837921, |
| "num_input_tokens_seen": 11051508, |
| "num_tokens": 10985594.0, |
| "step": 480, |
| "train_runtime": 467.1593, |
| "train_tokens_per_second": 23656.828 |
| }, |
| { |
| "entropy": 1.70078125, |
| "epoch": 1.831758034026465, |
| "grad_norm": 2.4375, |
| "learning_rate": 8.197662906851534e-07, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9726030707359314, |
| "num_input_tokens_seen": 11166904, |
| "num_tokens": 11100230.0, |
| "step": 485, |
| "train_runtime": 472.195, |
| "train_tokens_per_second": 23648.922 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 1.8506616257088848, |
| "grad_norm": 2.765625, |
| "learning_rate": 6.521642517483573e-07, |
| "loss": 0.0532, |
| "mean_token_accuracy": 0.9853454470634461, |
| "num_input_tokens_seen": 11281802, |
| "num_tokens": 11214624.0, |
| "step": 490, |
| "train_runtime": 475.7718, |
| "train_tokens_per_second": 23712.635 |
| }, |
| { |
| "entropy": 1.70078125, |
| "epoch": 1.8695652173913042, |
| "grad_norm": 2.171875, |
| "learning_rate": 5.034291272968772e-07, |
| "loss": 0.027, |
| "mean_token_accuracy": 0.9934648215770722, |
| "num_input_tokens_seen": 11396946, |
| "num_tokens": 11329098.0, |
| "step": 495, |
| "train_runtime": 480.2436, |
| "train_tokens_per_second": 23731.596 |
| }, |
| { |
| "entropy": 1.6984375, |
| "epoch": 1.888468809073724, |
| "grad_norm": 4.0625, |
| "learning_rate": 3.737059554068334e-07, |
| "loss": 0.0742, |
| "mean_token_accuracy": 0.9744843065738678, |
| "num_input_tokens_seen": 11512282, |
| "num_tokens": 11443715.0, |
| "step": 500, |
| "train_runtime": 484.6792, |
| "train_tokens_per_second": 23752.376 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 1.9073724007561437, |
| "grad_norm": 6.84375, |
| "learning_rate": 2.631212347741352e-07, |
| "loss": 0.1322, |
| "mean_token_accuracy": 0.9680740118026734, |
| "num_input_tokens_seen": 11627828, |
| "num_tokens": 11558513.0, |
| "step": 505, |
| "train_runtime": 544.5283, |
| "train_tokens_per_second": 21353.945 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 1.9262759924385633, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.7178280136011417e-07, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9749818980693817, |
| "num_input_tokens_seen": 11743010, |
| "num_tokens": 11673010.0, |
| "step": 510, |
| "train_runtime": 549.7569, |
| "train_tokens_per_second": 21360.369 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 1.9451795841209831, |
| "grad_norm": 2.5625, |
| "learning_rate": 9.977972323599095e-08, |
| "loss": 0.1175, |
| "mean_token_accuracy": 0.9680160820484162, |
| "num_input_tokens_seen": 11858430, |
| "num_tokens": 11787637.0, |
| "step": 515, |
| "train_runtime": 553.6509, |
| "train_tokens_per_second": 21418.605 |
| }, |
| { |
| "entropy": 1.69765625, |
| "epoch": 1.9640831758034025, |
| "grad_norm": 2.921875, |
| "learning_rate": 4.718221372874254e-08, |
| "loss": 0.0695, |
| "mean_token_accuracy": 0.9804269134998321, |
| "num_input_tokens_seen": 11973576, |
| "num_tokens": 11902111.0, |
| "step": 520, |
| "train_runtime": 557.8609, |
| "train_tokens_per_second": 21463.371 |
| }, |
| { |
| "entropy": 1.69609375, |
| "epoch": 1.9829867674858224, |
| "grad_norm": 5.8125, |
| "learning_rate": 1.4041562953031051e-08, |
| "loss": 0.1152, |
| "mean_token_accuracy": 0.9696780204772949, |
| "num_input_tokens_seen": 12088990, |
| "num_tokens": 12016759.0, |
| "step": 525, |
| "train_runtime": 561.9991, |
| "train_tokens_per_second": 21510.694 |
| }, |
| { |
| "entropy": 1.6961805555555556, |
| "epoch": 2.0, |
| "grad_norm": 3.75, |
| "learning_rate": 3.900877959917004e-10, |
| "loss": 0.0989, |
| "mean_token_accuracy": 0.9715293182267083, |
| "num_input_tokens_seen": 12192662, |
| "num_tokens": 12119827.0, |
| "step": 530, |
| "train_runtime": 565.5622, |
| "train_tokens_per_second": 21558.482 |
| }, |
| { |
| "epoch": 2.0, |
| "num_input_tokens_seen": 12192662, |
| "step": 530, |
| "total_flos": 3.3226637176733696e+16, |
| "train_loss": 0.1822078584218925, |
| "train_runtime": 612.9949, |
| "train_samples_per_second": 27.592, |
| "train_steps_per_second": 0.865, |
| "train_tokens_per_second": 2486.879 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 530, |
| "num_input_tokens_seen": 12192662, |
| "num_train_epochs": 2, |
| "save_steps": 250, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.3226637176733696e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|