{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.903225806451613, "eval_steps": 30, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008064516129032258, "grad_norm": NaN, "learning_rate": 0.0, "loss": 2.8821, "num_input_tokens_seen": 2460, "step": 1, "train_runtime": 20.2008, "train_tokens_per_second": 121.777 }, { "epoch": 0.016129032258064516, "grad_norm": 2.2230546474456787, "learning_rate": 0.0, "loss": 2.8396, "num_input_tokens_seen": 4964, "step": 2, "train_runtime": 29.3232, "train_tokens_per_second": 169.286 }, { "epoch": 0.024193548387096774, "grad_norm": 2.2324392795562744, "learning_rate": 4e-05, "loss": 2.8582, "num_input_tokens_seen": 7402, "step": 3, "train_runtime": 37.5642, "train_tokens_per_second": 197.05 }, { "epoch": 0.03225806451612903, "grad_norm": 2.004035234451294, "learning_rate": 8e-05, "loss": 2.7738, "num_input_tokens_seen": 9844, "step": 4, "train_runtime": 45.3627, "train_tokens_per_second": 217.006 }, { "epoch": 0.04032258064516129, "grad_norm": 1.749263048171997, "learning_rate": 0.00012, "loss": 2.6939, "num_input_tokens_seen": 12411, "step": 5, "train_runtime": 53.1839, "train_tokens_per_second": 233.36 }, { "epoch": 0.04838709677419355, "grad_norm": 1.2987128496170044, "learning_rate": 0.00016, "loss": 2.4797, "num_input_tokens_seen": 14846, "step": 6, "train_runtime": 60.5056, "train_tokens_per_second": 245.366 }, { "epoch": 0.056451612903225805, "grad_norm": 1.2643777132034302, "learning_rate": 0.0002, "loss": 2.2646, "num_input_tokens_seen": 17165, "step": 7, "train_runtime": 67.4129, "train_tokens_per_second": 254.625 }, { "epoch": 0.06451612903225806, "grad_norm": 1.3214221000671387, "learning_rate": 0.00019999967645432384, "loss": 2.1451, "num_input_tokens_seen": 19599, "step": 8, "train_runtime": 74.7551, "train_tokens_per_second": 262.176 }, { "epoch": 0.07258064516129033, "grad_norm": 1.5586612224578857, "learning_rate": 0.00019999870581938894, "loss": 1.9267, "num_input_tokens_seen": 21933, "step": 9, "train_runtime": 81.9279, "train_tokens_per_second": 267.711 }, { "epoch": 0.08064516129032258, "grad_norm": 1.82594895362854, "learning_rate": 0.0001999970881014762, "loss": 1.7155, "num_input_tokens_seen": 24314, "step": 10, "train_runtime": 89.3509, "train_tokens_per_second": 272.118 }, { "epoch": 0.08870967741935484, "grad_norm": 1.8575149774551392, "learning_rate": 0.00019999482331105377, "loss": 1.4564, "num_input_tokens_seen": 26599, "step": 11, "train_runtime": 96.7349, "train_tokens_per_second": 274.968 }, { "epoch": 0.0967741935483871, "grad_norm": 1.5813839435577393, "learning_rate": 0.0001999919114627769, "loss": 1.4559, "num_input_tokens_seen": 29110, "step": 12, "train_runtime": 104.7926, "train_tokens_per_second": 277.787 }, { "epoch": 0.10483870967741936, "grad_norm": 1.6002471446990967, "learning_rate": 0.00019998835257548786, "loss": 1.1945, "num_input_tokens_seen": 31455, "step": 13, "train_runtime": 112.5233, "train_tokens_per_second": 279.542 }, { "epoch": 0.11290322580645161, "grad_norm": 1.4724524021148682, "learning_rate": 0.00019998414667221596, "loss": 1.1141, "num_input_tokens_seen": 33938, "step": 14, "train_runtime": 120.5539, "train_tokens_per_second": 281.517 }, { "epoch": 0.12096774193548387, "grad_norm": 1.295906662940979, "learning_rate": 0.00019997929378017725, "loss": 0.9111, "num_input_tokens_seen": 36328, "step": 15, "train_runtime": 128.1664, "train_tokens_per_second": 283.444 }, { "epoch": 0.12903225806451613, "grad_norm": 1.1280094385147095, "learning_rate": 0.00019997379393077428, "loss": 0.8585, "num_input_tokens_seen": 38720, "step": 16, "train_runtime": 135.6984, "train_tokens_per_second": 285.339 }, { "epoch": 0.13709677419354838, "grad_norm": 1.0555989742279053, "learning_rate": 0.00019996764715959618, "loss": 0.8459, "num_input_tokens_seen": 41264, "step": 17, "train_runtime": 143.6519, "train_tokens_per_second": 287.25 }, { "epoch": 0.14516129032258066, "grad_norm": 0.980623722076416, "learning_rate": 0.0001999608535064182, "loss": 0.7354, "num_input_tokens_seen": 43719, "step": 18, "train_runtime": 151.2792, "train_tokens_per_second": 288.995 }, { "epoch": 0.1532258064516129, "grad_norm": 0.9247991442680359, "learning_rate": 0.0001999534130152014, "loss": 0.6762, "num_input_tokens_seen": 46087, "step": 19, "train_runtime": 158.66, "train_tokens_per_second": 290.476 }, { "epoch": 0.16129032258064516, "grad_norm": 29.37914276123047, "learning_rate": 0.00019994532573409262, "loss": 0.7271, "num_input_tokens_seen": 48529, "step": 20, "train_runtime": 166.2671, "train_tokens_per_second": 291.874 }, { "epoch": 0.1693548387096774, "grad_norm": 0.9571717977523804, "learning_rate": 0.0001999365917154239, "loss": 0.5222, "num_input_tokens_seen": 50785, "step": 21, "train_runtime": 173.3922, "train_tokens_per_second": 292.891 }, { "epoch": 0.1774193548387097, "grad_norm": 0.5547913908958435, "learning_rate": 0.00019992721101571236, "loss": 0.5921, "num_input_tokens_seen": 53131, "step": 22, "train_runtime": 180.821, "train_tokens_per_second": 293.832 }, { "epoch": 0.18548387096774194, "grad_norm": 0.5877758264541626, "learning_rate": 0.0001999171836956597, "loss": 0.6327, "num_input_tokens_seen": 55531, "step": 23, "train_runtime": 188.3614, "train_tokens_per_second": 294.811 }, { "epoch": 0.1935483870967742, "grad_norm": 0.5965892672538757, "learning_rate": 0.0001999065098201518, "loss": 0.7097, "num_input_tokens_seen": 58012, "step": 24, "train_runtime": 196.1428, "train_tokens_per_second": 295.764 }, { "epoch": 0.20161290322580644, "grad_norm": 0.47157740592956543, "learning_rate": 0.00019989518945825844, "loss": 0.5306, "num_input_tokens_seen": 60347, "step": 25, "train_runtime": 203.5991, "train_tokens_per_second": 296.401 }, { "epoch": 0.20967741935483872, "grad_norm": 0.5811465382575989, "learning_rate": 0.00019988322268323268, "loss": 0.637, "num_input_tokens_seen": 62789, "step": 26, "train_runtime": 211.3235, "train_tokens_per_second": 297.123 }, { "epoch": 0.21774193548387097, "grad_norm": 0.5013178586959839, "learning_rate": 0.00019987060957251047, "loss": 0.6434, "num_input_tokens_seen": 65259, "step": 27, "train_runtime": 219.162, "train_tokens_per_second": 297.766 }, { "epoch": 0.22580645161290322, "grad_norm": 0.4896971583366394, "learning_rate": 0.00019985735020771017, "loss": 0.6165, "num_input_tokens_seen": 67627, "step": 28, "train_runtime": 226.7103, "train_tokens_per_second": 298.297 }, { "epoch": 0.23387096774193547, "grad_norm": 0.5190575122833252, "learning_rate": 0.00019984344467463197, "loss": 0.683, "num_input_tokens_seen": 70097, "step": 29, "train_runtime": 234.4542, "train_tokens_per_second": 298.979 }, { "epoch": 0.24193548387096775, "grad_norm": 0.485100656747818, "learning_rate": 0.0001998288930632574, "loss": 0.7283, "num_input_tokens_seen": 72599, "step": 30, "train_runtime": 242.2724, "train_tokens_per_second": 299.659 }, { "epoch": 0.24193548387096775, "eval_loss": 0.6429173946380615, "eval_runtime": 20.6444, "eval_samples_per_second": 2.519, "eval_steps_per_second": 1.259, "num_input_tokens_seen": 72599, "step": 30 }, { "epoch": 0.25, "grad_norm": 0.42926788330078125, "learning_rate": 0.00019981369546774865, "loss": 0.5616, "num_input_tokens_seen": 74963, "step": 31, "train_runtime": 270.939, "train_tokens_per_second": 276.678 }, { "epoch": 0.25806451612903225, "grad_norm": 0.4984983503818512, "learning_rate": 0.00019979785198644806, "loss": 0.6284, "num_input_tokens_seen": 77373, "step": 32, "train_runtime": 278.5531, "train_tokens_per_second": 277.768 }, { "epoch": 0.2661290322580645, "grad_norm": 0.47428232431411743, "learning_rate": 0.00019978136272187747, "loss": 0.6354, "num_input_tokens_seen": 79803, "step": 33, "train_runtime": 286.3198, "train_tokens_per_second": 278.72 }, { "epoch": 0.27419354838709675, "grad_norm": 0.5008767247200012, "learning_rate": 0.0001997642277807374, "loss": 0.6537, "num_input_tokens_seen": 82278, "step": 34, "train_runtime": 294.1164, "train_tokens_per_second": 279.746 }, { "epoch": 0.28225806451612906, "grad_norm": 0.42406603693962097, "learning_rate": 0.00019974644727390665, "loss": 0.5861, "num_input_tokens_seen": 84716, "step": 35, "train_runtime": 301.7918, "train_tokens_per_second": 280.71 }, { "epoch": 0.2903225806451613, "grad_norm": 0.4029514491558075, "learning_rate": 0.00019972802131644127, "loss": 0.5435, "num_input_tokens_seen": 87042, "step": 36, "train_runtime": 309.1953, "train_tokens_per_second": 281.511 }, { "epoch": 0.29838709677419356, "grad_norm": 0.3857174217700958, "learning_rate": 0.00019970895002757413, "loss": 0.4691, "num_input_tokens_seen": 89337, "step": 37, "train_runtime": 316.4937, "train_tokens_per_second": 282.271 }, { "epoch": 0.3064516129032258, "grad_norm": 0.5377873182296753, "learning_rate": 0.00019968923353071377, "loss": 0.6391, "num_input_tokens_seen": 91831, "step": 38, "train_runtime": 324.2416, "train_tokens_per_second": 283.218 }, { "epoch": 0.31451612903225806, "grad_norm": 0.4515826404094696, "learning_rate": 0.00019966887195344403, "loss": 0.5323, "num_input_tokens_seen": 94226, "step": 39, "train_runtime": 331.748, "train_tokens_per_second": 284.029 }, { "epoch": 0.3225806451612903, "grad_norm": 0.3910655677318573, "learning_rate": 0.0001996478654275229, "loss": 0.5125, "num_input_tokens_seen": 96510, "step": 40, "train_runtime": 338.8924, "train_tokens_per_second": 284.781 }, { "epoch": 0.33064516129032256, "grad_norm": 0.5480923056602478, "learning_rate": 0.00019962621408888177, "loss": 0.6256, "num_input_tokens_seen": 98963, "step": 41, "train_runtime": 346.5461, "train_tokens_per_second": 285.57 }, { "epoch": 0.3387096774193548, "grad_norm": 0.4644400477409363, "learning_rate": 0.00019960391807762463, "loss": 0.6775, "num_input_tokens_seen": 101522, "step": 42, "train_runtime": 354.466, "train_tokens_per_second": 286.408 }, { "epoch": 0.3467741935483871, "grad_norm": 0.45509329438209534, "learning_rate": 0.00019958097753802693, "loss": 0.5968, "num_input_tokens_seen": 103938, "step": 43, "train_runtime": 361.9713, "train_tokens_per_second": 287.144 }, { "epoch": 0.3548387096774194, "grad_norm": 0.5333311557769775, "learning_rate": 0.00019955739261853504, "loss": 0.6604, "num_input_tokens_seen": 106416, "step": 44, "train_runtime": 369.6866, "train_tokens_per_second": 287.855 }, { "epoch": 0.3629032258064516, "grad_norm": 0.421058714389801, "learning_rate": 0.00019953316347176488, "loss": 0.4962, "num_input_tokens_seen": 108748, "step": 45, "train_runtime": 377.0273, "train_tokens_per_second": 288.435 }, { "epoch": 0.3709677419354839, "grad_norm": 0.4439956843852997, "learning_rate": 0.00019950829025450114, "loss": 0.5743, "num_input_tokens_seen": 111155, "step": 46, "train_runtime": 384.5601, "train_tokens_per_second": 289.045 }, { "epoch": 0.3790322580645161, "grad_norm": 0.5222911834716797, "learning_rate": 0.0001994827731276963, "loss": 0.5686, "num_input_tokens_seen": 113593, "step": 47, "train_runtime": 392.1126, "train_tokens_per_second": 289.695 }, { "epoch": 0.3870967741935484, "grad_norm": 0.4172716736793518, "learning_rate": 0.00019945661225646946, "loss": 0.5262, "num_input_tokens_seen": 115968, "step": 48, "train_runtime": 399.5667, "train_tokens_per_second": 290.234 }, { "epoch": 0.3951612903225806, "grad_norm": 0.43297383189201355, "learning_rate": 0.0001994298078101054, "loss": 0.6139, "num_input_tokens_seen": 118476, "step": 49, "train_runtime": 407.3814, "train_tokens_per_second": 290.823 }, { "epoch": 0.4032258064516129, "grad_norm": 0.49559882283210754, "learning_rate": 0.00019940235996205333, "loss": 0.6726, "num_input_tokens_seen": 120946, "step": 50, "train_runtime": 415.0753, "train_tokens_per_second": 291.383 }, { "epoch": 0.4112903225806452, "grad_norm": 0.4947301149368286, "learning_rate": 0.0001993742688899259, "loss": 0.5937, "num_input_tokens_seen": 123371, "step": 51, "train_runtime": 422.6813, "train_tokens_per_second": 291.877 }, { "epoch": 0.41935483870967744, "grad_norm": 0.38399311900138855, "learning_rate": 0.00019934553477549794, "loss": 0.5638, "num_input_tokens_seen": 125783, "step": 52, "train_runtime": 430.2413, "train_tokens_per_second": 292.355 }, { "epoch": 0.4274193548387097, "grad_norm": 0.5241729617118835, "learning_rate": 0.00019931615780470558, "loss": 0.591, "num_input_tokens_seen": 128227, "step": 53, "train_runtime": 437.9384, "train_tokens_per_second": 292.797 }, { "epoch": 0.43548387096774194, "grad_norm": 0.5647182464599609, "learning_rate": 0.00019928613816764458, "loss": 0.5098, "num_input_tokens_seen": 130614, "step": 54, "train_runtime": 445.5243, "train_tokens_per_second": 293.169 }, { "epoch": 0.4435483870967742, "grad_norm": 0.5013054013252258, "learning_rate": 0.00019925547605856934, "loss": 0.6388, "num_input_tokens_seen": 133047, "step": 55, "train_runtime": 453.1951, "train_tokens_per_second": 293.576 }, { "epoch": 0.45161290322580644, "grad_norm": 0.5751994848251343, "learning_rate": 0.00019922417167589183, "loss": 0.5489, "num_input_tokens_seen": 135442, "step": 56, "train_runtime": 460.7707, "train_tokens_per_second": 293.947 }, { "epoch": 0.4596774193548387, "grad_norm": 0.5764442086219788, "learning_rate": 0.00019919222522217996, "loss": 0.6618, "num_input_tokens_seen": 137936, "step": 57, "train_runtime": 468.5965, "train_tokens_per_second": 294.36 }, { "epoch": 0.46774193548387094, "grad_norm": 0.48759251832962036, "learning_rate": 0.00019915963690415647, "loss": 0.5633, "num_input_tokens_seen": 140352, "step": 58, "train_runtime": 476.2388, "train_tokens_per_second": 294.709 }, { "epoch": 0.47580645161290325, "grad_norm": 1.1273839473724365, "learning_rate": 0.00019912640693269752, "loss": 0.557, "num_input_tokens_seen": 142729, "step": 59, "train_runtime": 483.8414, "train_tokens_per_second": 294.991 }, { "epoch": 0.4838709677419355, "grad_norm": 0.43796879053115845, "learning_rate": 0.00019909253552283143, "loss": 0.5718, "num_input_tokens_seen": 145195, "step": 60, "train_runtime": 491.5972, "train_tokens_per_second": 295.354 }, { "epoch": 0.4838709677419355, "eval_loss": 0.588404655456543, "eval_runtime": 17.1502, "eval_samples_per_second": 3.032, "eval_steps_per_second": 1.516, "num_input_tokens_seen": 145195, "step": 60 }, { "epoch": 0.49193548387096775, "grad_norm": 0.41540732979774475, "learning_rate": 0.00019905802289373715, "loss": 0.5533, "num_input_tokens_seen": 147570, "step": 61, "train_runtime": 516.8658, "train_tokens_per_second": 285.509 }, { "epoch": 0.5, "grad_norm": 0.4697861075401306, "learning_rate": 0.0001990228692687429, "loss": 0.659, "num_input_tokens_seen": 150082, "step": 62, "train_runtime": 524.7607, "train_tokens_per_second": 286.001 }, { "epoch": 0.5080645161290323, "grad_norm": 0.4594587981700897, "learning_rate": 0.00019898707487532474, "loss": 0.5754, "num_input_tokens_seen": 152502, "step": 63, "train_runtime": 532.3943, "train_tokens_per_second": 286.446 }, { "epoch": 0.5161290322580645, "grad_norm": 0.45399630069732666, "learning_rate": 0.0001989506399451051, "loss": 0.5575, "num_input_tokens_seen": 154888, "step": 64, "train_runtime": 539.9509, "train_tokens_per_second": 286.856 }, { "epoch": 0.5241935483870968, "grad_norm": 0.4408513307571411, "learning_rate": 0.0001989135647138513, "loss": 0.578, "num_input_tokens_seen": 157296, "step": 65, "train_runtime": 547.6373, "train_tokens_per_second": 287.227 }, { "epoch": 0.532258064516129, "grad_norm": 0.5508686304092407, "learning_rate": 0.00019887584942147394, "loss": 0.5551, "num_input_tokens_seen": 159619, "step": 66, "train_runtime": 555.0465, "train_tokens_per_second": 287.578 }, { "epoch": 0.5403225806451613, "grad_norm": 0.4901481568813324, "learning_rate": 0.0001988374943120254, "loss": 0.6144, "num_input_tokens_seen": 162016, "step": 67, "train_runtime": 562.5856, "train_tokens_per_second": 287.985 }, { "epoch": 0.5483870967741935, "grad_norm": 0.6244992017745972, "learning_rate": 0.00019879849963369827, "loss": 0.657, "num_input_tokens_seen": 164573, "step": 68, "train_runtime": 570.5194, "train_tokens_per_second": 288.462 }, { "epoch": 0.5564516129032258, "grad_norm": 0.4478534162044525, "learning_rate": 0.00019875886563882375, "loss": 0.5517, "num_input_tokens_seen": 167048, "step": 69, "train_runtime": 578.2943, "train_tokens_per_second": 288.863 }, { "epoch": 0.5645161290322581, "grad_norm": 0.6100202202796936, "learning_rate": 0.00019871859258387, "loss": 0.619, "num_input_tokens_seen": 169510, "step": 70, "train_runtime": 586.0262, "train_tokens_per_second": 289.253 }, { "epoch": 0.5725806451612904, "grad_norm": 0.4547867178916931, "learning_rate": 0.00019867768072944045, "loss": 0.5046, "num_input_tokens_seen": 171885, "step": 71, "train_runtime": 593.5521, "train_tokens_per_second": 289.587 }, { "epoch": 0.5806451612903226, "grad_norm": 0.43076831102371216, "learning_rate": 0.00019863613034027224, "loss": 0.6065, "num_input_tokens_seen": 174375, "step": 72, "train_runtime": 601.3642, "train_tokens_per_second": 289.966 }, { "epoch": 0.5887096774193549, "grad_norm": 0.45889994502067566, "learning_rate": 0.0001985939416852343, "loss": 0.5827, "num_input_tokens_seen": 176790, "step": 73, "train_runtime": 608.9783, "train_tokens_per_second": 290.306 }, { "epoch": 0.5967741935483871, "grad_norm": 0.5129091143608093, "learning_rate": 0.00019855111503732574, "loss": 0.4935, "num_input_tokens_seen": 179140, "step": 74, "train_runtime": 616.4507, "train_tokens_per_second": 290.599 }, { "epoch": 0.6048387096774194, "grad_norm": 0.4756077527999878, "learning_rate": 0.00019850765067367412, "loss": 0.4864, "num_input_tokens_seen": 181433, "step": 75, "train_runtime": 623.788, "train_tokens_per_second": 290.857 }, { "epoch": 0.6129032258064516, "grad_norm": 0.5090617537498474, "learning_rate": 0.00019846354887553358, "loss": 0.5477, "num_input_tokens_seen": 183817, "step": 76, "train_runtime": 631.336, "train_tokens_per_second": 291.156 }, { "epoch": 0.6209677419354839, "grad_norm": 0.4573401212692261, "learning_rate": 0.00019841880992828306, "loss": 0.5305, "num_input_tokens_seen": 186158, "step": 77, "train_runtime": 638.7889, "train_tokens_per_second": 291.423 }, { "epoch": 0.6290322580645161, "grad_norm": 0.43317022919654846, "learning_rate": 0.0001983734341214244, "loss": 0.4796, "num_input_tokens_seen": 188490, "step": 78, "train_runtime": 646.2095, "train_tokens_per_second": 291.686 }, { "epoch": 0.6370967741935484, "grad_norm": 0.5083560943603516, "learning_rate": 0.00019832742174858052, "loss": 0.5368, "num_input_tokens_seen": 190868, "step": 79, "train_runtime": 653.7184, "train_tokens_per_second": 291.973 }, { "epoch": 0.6451612903225806, "grad_norm": 0.5274792313575745, "learning_rate": 0.0001982807731074935, "loss": 0.6258, "num_input_tokens_seen": 193343, "step": 80, "train_runtime": 661.5295, "train_tokens_per_second": 292.267 }, { "epoch": 0.6532258064516129, "grad_norm": 0.46456336975097656, "learning_rate": 0.00019823348850002268, "loss": 0.5115, "num_input_tokens_seen": 195723, "step": 81, "train_runtime": 669.1413, "train_tokens_per_second": 292.499 }, { "epoch": 0.6612903225806451, "grad_norm": 0.4614492356777191, "learning_rate": 0.00019818556823214268, "loss": 0.5547, "num_input_tokens_seen": 198142, "step": 82, "train_runtime": 676.7932, "train_tokens_per_second": 292.766 }, { "epoch": 0.6693548387096774, "grad_norm": 0.431631863117218, "learning_rate": 0.00019813701261394136, "loss": 0.5297, "num_input_tokens_seen": 200459, "step": 83, "train_runtime": 684.2475, "train_tokens_per_second": 292.963 }, { "epoch": 0.6774193548387096, "grad_norm": 0.5262055397033691, "learning_rate": 0.00019808782195961797, "loss": 0.6186, "num_input_tokens_seen": 202979, "step": 84, "train_runtime": 692.1971, "train_tokens_per_second": 293.239 }, { "epoch": 0.6854838709677419, "grad_norm": 0.4315405786037445, "learning_rate": 0.00019803799658748094, "loss": 0.5863, "num_input_tokens_seen": 205408, "step": 85, "train_runtime": 699.9238, "train_tokens_per_second": 293.472 }, { "epoch": 0.6935483870967742, "grad_norm": 0.5156344771385193, "learning_rate": 0.000197987536819946, "loss": 0.6342, "num_input_tokens_seen": 207851, "step": 86, "train_runtime": 707.6972, "train_tokens_per_second": 293.7 }, { "epoch": 0.7016129032258065, "grad_norm": 0.610248327255249, "learning_rate": 0.0001979364429835339, "loss": 0.5762, "num_input_tokens_seen": 210293, "step": 87, "train_runtime": 715.4358, "train_tokens_per_second": 293.937 }, { "epoch": 0.7096774193548387, "grad_norm": 0.4106084704399109, "learning_rate": 0.00019788471540886844, "loss": 0.5651, "num_input_tokens_seen": 212741, "step": 88, "train_runtime": 723.1383, "train_tokens_per_second": 294.191 }, { "epoch": 0.717741935483871, "grad_norm": 0.4682006239891052, "learning_rate": 0.0001978323544306743, "loss": 0.5914, "num_input_tokens_seen": 215252, "step": 89, "train_runtime": 731.0074, "train_tokens_per_second": 294.459 }, { "epoch": 0.7258064516129032, "grad_norm": 0.39659127593040466, "learning_rate": 0.00019777936038777483, "loss": 0.4184, "num_input_tokens_seen": 217500, "step": 90, "train_runtime": 738.2319, "train_tokens_per_second": 294.623 }, { "epoch": 0.7258064516129032, "eval_loss": 0.5797551870346069, "eval_runtime": 17.1625, "eval_samples_per_second": 3.03, "eval_steps_per_second": 1.515, "num_input_tokens_seen": 217500, "step": 90 }, { "epoch": 0.7338709677419355, "grad_norm": 0.4853302538394928, "learning_rate": 0.0001977257336230899, "loss": 0.5532, "num_input_tokens_seen": 219939, "step": 91, "train_runtime": 763.6262, "train_tokens_per_second": 288.019 }, { "epoch": 0.7419354838709677, "grad_norm": 0.42730966210365295, "learning_rate": 0.00019767147448363366, "loss": 0.41, "num_input_tokens_seen": 222184, "step": 92, "train_runtime": 770.8107, "train_tokens_per_second": 288.247 }, { "epoch": 0.75, "grad_norm": 0.5217856764793396, "learning_rate": 0.00019761658332051235, "loss": 0.5564, "num_input_tokens_seen": 224608, "step": 93, "train_runtime": 778.427, "train_tokens_per_second": 288.541 }, { "epoch": 0.7580645161290323, "grad_norm": 0.6025580167770386, "learning_rate": 0.00019756106048892186, "loss": 0.5735, "num_input_tokens_seen": 227024, "step": 94, "train_runtime": 786.0237, "train_tokens_per_second": 288.826 }, { "epoch": 0.7661290322580645, "grad_norm": 0.47863927483558655, "learning_rate": 0.00019750490634814572, "loss": 0.6207, "num_input_tokens_seen": 229482, "step": 95, "train_runtime": 793.6569, "train_tokens_per_second": 289.145 }, { "epoch": 0.7741935483870968, "grad_norm": 0.6842419505119324, "learning_rate": 0.00019744812126155245, "loss": 0.5606, "num_input_tokens_seen": 231915, "step": 96, "train_runtime": 801.2741, "train_tokens_per_second": 289.433 }, { "epoch": 0.782258064516129, "grad_norm": 0.4753873348236084, "learning_rate": 0.00019739070559659347, "loss": 0.5656, "num_input_tokens_seen": 234337, "step": 97, "train_runtime": 808.959, "train_tokens_per_second": 289.677 }, { "epoch": 0.7903225806451613, "grad_norm": 0.4512401521205902, "learning_rate": 0.0001973326597248006, "loss": 0.54, "num_input_tokens_seen": 236729, "step": 98, "train_runtime": 816.542, "train_tokens_per_second": 289.916 }, { "epoch": 0.7983870967741935, "grad_norm": 0.41799741983413696, "learning_rate": 0.0001972739840217836, "loss": 0.504, "num_input_tokens_seen": 239095, "step": 99, "train_runtime": 824.0737, "train_tokens_per_second": 290.138 }, { "epoch": 0.8064516129032258, "grad_norm": 0.5296741724014282, "learning_rate": 0.00019721467886722792, "loss": 0.5715, "num_input_tokens_seen": 241541, "step": 100, "train_runtime": 831.8131, "train_tokens_per_second": 290.379 }, { "epoch": 0.8145161290322581, "grad_norm": 0.5183069705963135, "learning_rate": 0.00019715474464489208, "loss": 0.583, "num_input_tokens_seen": 244053, "step": 101, "train_runtime": 839.7441, "train_tokens_per_second": 290.628 }, { "epoch": 0.8225806451612904, "grad_norm": 0.5080683827400208, "learning_rate": 0.0001970941817426052, "loss": 0.6712, "num_input_tokens_seen": 246631, "step": 102, "train_runtime": 847.7725, "train_tokens_per_second": 290.916 }, { "epoch": 0.8306451612903226, "grad_norm": 0.47352856397628784, "learning_rate": 0.00019703299055226468, "loss": 0.5361, "num_input_tokens_seen": 249081, "step": 103, "train_runtime": 855.5397, "train_tokens_per_second": 291.139 }, { "epoch": 0.8387096774193549, "grad_norm": 0.4344908893108368, "learning_rate": 0.00019697117146983334, "loss": 0.3872, "num_input_tokens_seen": 251298, "step": 104, "train_runtime": 862.6163, "train_tokens_per_second": 291.321 }, { "epoch": 0.8467741935483871, "grad_norm": 0.6570983529090881, "learning_rate": 0.0001969087248953371, "loss": 0.5698, "num_input_tokens_seen": 253742, "step": 105, "train_runtime": 870.3871, "train_tokens_per_second": 291.528 }, { "epoch": 0.8548387096774194, "grad_norm": 0.46201083064079285, "learning_rate": 0.00019684565123286244, "loss": 0.567, "num_input_tokens_seen": 256212, "step": 106, "train_runtime": 878.1695, "train_tokens_per_second": 291.757 }, { "epoch": 0.8629032258064516, "grad_norm": 0.48119398951530457, "learning_rate": 0.00019678195089055346, "loss": 0.5579, "num_input_tokens_seen": 258577, "step": 107, "train_runtime": 885.714, "train_tokens_per_second": 291.942 }, { "epoch": 0.8709677419354839, "grad_norm": 0.5357433557510376, "learning_rate": 0.00019671762428060966, "loss": 0.6103, "num_input_tokens_seen": 261064, "step": 108, "train_runtime": 893.5704, "train_tokens_per_second": 292.158 }, { "epoch": 0.8790322580645161, "grad_norm": 0.44740840792655945, "learning_rate": 0.00019665267181928292, "loss": 0.5168, "num_input_tokens_seen": 263473, "step": 109, "train_runtime": 901.2705, "train_tokens_per_second": 292.335 }, { "epoch": 0.8870967741935484, "grad_norm": 0.470734566450119, "learning_rate": 0.00019658709392687506, "loss": 0.6253, "num_input_tokens_seen": 265968, "step": 110, "train_runtime": 909.1802, "train_tokens_per_second": 292.536 }, { "epoch": 0.8951612903225806, "grad_norm": 0.561008870601654, "learning_rate": 0.00019652089102773488, "loss": 0.5867, "num_input_tokens_seen": 268347, "step": 111, "train_runtime": 916.7609, "train_tokens_per_second": 292.712 }, { "epoch": 0.9032258064516129, "grad_norm": 0.42769816517829895, "learning_rate": 0.00019645406355025565, "loss": 0.4869, "num_input_tokens_seen": 270668, "step": 112, "train_runtime": 924.116, "train_tokens_per_second": 292.894 }, { "epoch": 0.9112903225806451, "grad_norm": 0.4861026108264923, "learning_rate": 0.00019638661192687216, "loss": 0.5352, "num_input_tokens_seen": 273106, "step": 113, "train_runtime": 931.8203, "train_tokens_per_second": 293.089 }, { "epoch": 0.9193548387096774, "grad_norm": 0.47908815741539, "learning_rate": 0.00019631853659405807, "loss": 0.5608, "num_input_tokens_seen": 275567, "step": 114, "train_runtime": 939.5584, "train_tokens_per_second": 293.294 }, { "epoch": 0.9274193548387096, "grad_norm": 0.5197163820266724, "learning_rate": 0.000196249837992323, "loss": 0.6687, "num_input_tokens_seen": 278145, "step": 115, "train_runtime": 947.5681, "train_tokens_per_second": 293.536 }, { "epoch": 0.9354838709677419, "grad_norm": 0.4960262179374695, "learning_rate": 0.0001961805165662096, "loss": 0.5896, "num_input_tokens_seen": 280661, "step": 116, "train_runtime": 955.4052, "train_tokens_per_second": 293.761 }, { "epoch": 0.9435483870967742, "grad_norm": 0.4456697702407837, "learning_rate": 0.00019611057276429085, "loss": 0.4862, "num_input_tokens_seen": 283075, "step": 117, "train_runtime": 962.9938, "train_tokens_per_second": 293.953 }, { "epoch": 0.9516129032258065, "grad_norm": 0.4729059338569641, "learning_rate": 0.00019604000703916705, "loss": 0.5744, "num_input_tokens_seen": 285541, "step": 118, "train_runtime": 970.7902, "train_tokens_per_second": 294.133 }, { "epoch": 0.9596774193548387, "grad_norm": 0.47211745381355286, "learning_rate": 0.00019596881984746287, "loss": 0.512, "num_input_tokens_seen": 287946, "step": 119, "train_runtime": 978.3472, "train_tokens_per_second": 294.319 }, { "epoch": 0.967741935483871, "grad_norm": 0.44912973046302795, "learning_rate": 0.00019589701164982452, "loss": 0.4114, "num_input_tokens_seen": 290184, "step": 120, "train_runtime": 985.5069, "train_tokens_per_second": 294.452 }, { "epoch": 0.967741935483871, "eval_loss": 0.5635989904403687, "eval_runtime": 17.033, "eval_samples_per_second": 3.053, "eval_steps_per_second": 1.526, "num_input_tokens_seen": 290184, "step": 120 }, { "epoch": 0.9758064516129032, "grad_norm": 0.43237707018852234, "learning_rate": 0.00019582458291091663, "loss": 0.4564, "num_input_tokens_seen": 292471, "step": 121, "train_runtime": 1010.3192, "train_tokens_per_second": 289.484 }, { "epoch": 0.9838709677419355, "grad_norm": 0.5699084997177124, "learning_rate": 0.0001957515340994193, "loss": 0.5604, "num_input_tokens_seen": 294924, "step": 122, "train_runtime": 1018.0245, "train_tokens_per_second": 289.702 }, { "epoch": 0.9919354838709677, "grad_norm": 0.5040406584739685, "learning_rate": 0.000195677865688025, "loss": 0.6539, "num_input_tokens_seen": 297473, "step": 123, "train_runtime": 1025.9426, "train_tokens_per_second": 289.951 }, { "epoch": 1.0, "grad_norm": 0.3699295222759247, "learning_rate": 0.00019560357815343577, "loss": 0.4415, "num_input_tokens_seen": 299768, "step": 124, "train_runtime": 1033.2914, "train_tokens_per_second": 290.11 }, { "epoch": 1.0080645161290323, "grad_norm": 0.5218486189842224, "learning_rate": 0.00019552867197635974, "loss": 0.4879, "num_input_tokens_seen": 302171, "step": 125, "train_runtime": 1040.8815, "train_tokens_per_second": 290.303 }, { "epoch": 1.0161290322580645, "grad_norm": 0.4307507872581482, "learning_rate": 0.00019545314764150837, "loss": 0.4855, "num_input_tokens_seen": 304584, "step": 126, "train_runtime": 1048.4955, "train_tokens_per_second": 290.496 }, { "epoch": 1.0241935483870968, "grad_norm": 0.46481820940971375, "learning_rate": 0.00019537700563759304, "loss": 0.5719, "num_input_tokens_seen": 307077, "step": 127, "train_runtime": 1056.3966, "train_tokens_per_second": 290.683 }, { "epoch": 1.032258064516129, "grad_norm": 0.4697766602039337, "learning_rate": 0.00019530024645732206, "loss": 0.5109, "num_input_tokens_seen": 309504, "step": 128, "train_runtime": 1064.077, "train_tokens_per_second": 290.866 }, { "epoch": 1.0403225806451613, "grad_norm": 0.43352261185646057, "learning_rate": 0.00019522287059739753, "loss": 0.4387, "num_input_tokens_seen": 311825, "step": 129, "train_runtime": 1071.4691, "train_tokens_per_second": 291.026 }, { "epoch": 1.0483870967741935, "grad_norm": 0.5191429257392883, "learning_rate": 0.00019514487855851184, "loss": 0.5712, "num_input_tokens_seen": 314337, "step": 130, "train_runtime": 1079.3989, "train_tokens_per_second": 291.215 }, { "epoch": 1.0564516129032258, "grad_norm": 0.4604818820953369, "learning_rate": 0.00019506627084534483, "loss": 0.4592, "num_input_tokens_seen": 316689, "step": 131, "train_runtime": 1086.9019, "train_tokens_per_second": 291.369 }, { "epoch": 1.064516129032258, "grad_norm": 0.4899859130382538, "learning_rate": 0.00019498704796656018, "loss": 0.5826, "num_input_tokens_seen": 319182, "step": 132, "train_runtime": 1094.7114, "train_tokens_per_second": 291.567 }, { "epoch": 1.0725806451612903, "grad_norm": 0.4882298409938812, "learning_rate": 0.00019490721043480226, "loss": 0.4948, "num_input_tokens_seen": 321575, "step": 133, "train_runtime": 1102.329, "train_tokens_per_second": 291.723 }, { "epoch": 1.0806451612903225, "grad_norm": 0.5484714508056641, "learning_rate": 0.00019482675876669286, "loss": 0.509, "num_input_tokens_seen": 323985, "step": 134, "train_runtime": 1109.9819, "train_tokens_per_second": 291.883 }, { "epoch": 1.0887096774193548, "grad_norm": 0.49963870644569397, "learning_rate": 0.00019474569348282774, "loss": 0.5362, "num_input_tokens_seen": 326450, "step": 135, "train_runtime": 1117.7341, "train_tokens_per_second": 292.064 }, { "epoch": 1.096774193548387, "grad_norm": 0.47701138257980347, "learning_rate": 0.0001946640151077734, "loss": 0.4584, "num_input_tokens_seen": 328784, "step": 136, "train_runtime": 1125.1692, "train_tokens_per_second": 292.208 }, { "epoch": 1.1048387096774193, "grad_norm": 0.518075168132782, "learning_rate": 0.00019458172417006347, "loss": 0.5822, "num_input_tokens_seen": 331217, "step": 137, "train_runtime": 1132.8395, "train_tokens_per_second": 292.378 }, { "epoch": 1.1129032258064515, "grad_norm": 0.540624737739563, "learning_rate": 0.00019449882120219555, "loss": 0.5769, "num_input_tokens_seen": 333726, "step": 138, "train_runtime": 1140.6942, "train_tokens_per_second": 292.564 }, { "epoch": 1.120967741935484, "grad_norm": 0.4455350637435913, "learning_rate": 0.00019441530674062753, "loss": 0.4574, "num_input_tokens_seen": 336111, "step": 139, "train_runtime": 1148.2429, "train_tokens_per_second": 292.718 }, { "epoch": 1.129032258064516, "grad_norm": 0.47862115502357483, "learning_rate": 0.0001943311813257743, "loss": 0.5124, "num_input_tokens_seen": 338553, "step": 140, "train_runtime": 1155.9576, "train_tokens_per_second": 292.877 }, { "epoch": 1.1370967741935485, "grad_norm": 0.5344946980476379, "learning_rate": 0.00019424644550200415, "loss": 0.5678, "num_input_tokens_seen": 341110, "step": 141, "train_runtime": 1163.8909, "train_tokens_per_second": 293.077 }, { "epoch": 1.1451612903225807, "grad_norm": 0.5066844820976257, "learning_rate": 0.00019416109981763526, "loss": 0.5531, "num_input_tokens_seen": 343583, "step": 142, "train_runtime": 1171.6216, "train_tokens_per_second": 293.254 }, { "epoch": 1.153225806451613, "grad_norm": 0.6196443438529968, "learning_rate": 0.00019407514482493214, "loss": 0.5097, "num_input_tokens_seen": 346053, "step": 143, "train_runtime": 1179.384, "train_tokens_per_second": 293.418 }, { "epoch": 1.1612903225806452, "grad_norm": 0.5004581809043884, "learning_rate": 0.00019398858108010217, "loss": 0.5073, "num_input_tokens_seen": 348502, "step": 144, "train_runtime": 1187.1001, "train_tokens_per_second": 293.574 }, { "epoch": 1.1693548387096775, "grad_norm": 0.4805152714252472, "learning_rate": 0.0001939014091432918, "loss": 0.436, "num_input_tokens_seen": 350803, "step": 145, "train_runtime": 1194.4056, "train_tokens_per_second": 293.705 }, { "epoch": 1.1774193548387097, "grad_norm": 0.5441202521324158, "learning_rate": 0.00019381362957858312, "loss": 0.582, "num_input_tokens_seen": 353320, "step": 146, "train_runtime": 1202.2979, "train_tokens_per_second": 293.871 }, { "epoch": 1.185483870967742, "grad_norm": 0.4514954388141632, "learning_rate": 0.00019372524295399013, "loss": 0.3974, "num_input_tokens_seen": 355630, "step": 147, "train_runtime": 1209.6036, "train_tokens_per_second": 294.005 }, { "epoch": 1.1935483870967742, "grad_norm": 0.49513113498687744, "learning_rate": 0.00019363624984145502, "loss": 0.5247, "num_input_tokens_seen": 358051, "step": 148, "train_runtime": 1217.2149, "train_tokens_per_second": 294.156 }, { "epoch": 1.2016129032258065, "grad_norm": 0.5256295204162598, "learning_rate": 0.00019354665081684446, "loss": 0.5084, "num_input_tokens_seen": 360569, "step": 149, "train_runtime": 1225.0281, "train_tokens_per_second": 294.335 }, { "epoch": 1.2096774193548387, "grad_norm": 0.4719817638397217, "learning_rate": 0.0001934564464599461, "loss": 0.5317, "num_input_tokens_seen": 363020, "step": 150, "train_runtime": 1232.7421, "train_tokens_per_second": 294.482 }, { "epoch": 1.2096774193548387, "eval_loss": 0.5541179180145264, "eval_runtime": 17.0756, "eval_samples_per_second": 3.045, "eval_steps_per_second": 1.523, "num_input_tokens_seen": 363020, "step": 150 }, { "epoch": 1.217741935483871, "grad_norm": 0.5494084358215332, "learning_rate": 0.00019336563735446446, "loss": 0.5502, "num_input_tokens_seen": 365462, "step": 151, "train_runtime": 1258.0076, "train_tokens_per_second": 290.509 }, { "epoch": 1.2258064516129032, "grad_norm": 0.5102791786193848, "learning_rate": 0.00019327422408801744, "loss": 0.5907, "num_input_tokens_seen": 367982, "step": 152, "train_runtime": 1265.9004, "train_tokens_per_second": 290.688 }, { "epoch": 1.2338709677419355, "grad_norm": 0.5099480152130127, "learning_rate": 0.0001931822072521323, "loss": 0.5405, "num_input_tokens_seen": 370446, "step": 153, "train_runtime": 1273.5949, "train_tokens_per_second": 290.866 }, { "epoch": 1.2419354838709677, "grad_norm": 0.49685347080230713, "learning_rate": 0.00019308958744224217, "loss": 0.434, "num_input_tokens_seen": 372753, "step": 154, "train_runtime": 1281.0, "train_tokens_per_second": 290.986 }, { "epoch": 1.25, "grad_norm": 0.5306271910667419, "learning_rate": 0.00019299636525768173, "loss": 0.5416, "num_input_tokens_seen": 375291, "step": 155, "train_runtime": 1288.923, "train_tokens_per_second": 291.166 }, { "epoch": 1.2580645161290323, "grad_norm": 0.4725121259689331, "learning_rate": 0.00019290254130168374, "loss": 0.4697, "num_input_tokens_seen": 377667, "step": 156, "train_runtime": 1296.4986, "train_tokens_per_second": 291.298 }, { "epoch": 1.2661290322580645, "grad_norm": 0.5130587220191956, "learning_rate": 0.00019280811618137484, "loss": 0.6108, "num_input_tokens_seen": 380238, "step": 157, "train_runtime": 1304.4722, "train_tokens_per_second": 291.488 }, { "epoch": 1.2741935483870968, "grad_norm": 0.5222141146659851, "learning_rate": 0.00019271309050777183, "loss": 0.5384, "num_input_tokens_seen": 382696, "step": 158, "train_runtime": 1312.2639, "train_tokens_per_second": 291.63 }, { "epoch": 1.282258064516129, "grad_norm": 0.47143712639808655, "learning_rate": 0.00019261746489577765, "loss": 0.4202, "num_input_tokens_seen": 385011, "step": 159, "train_runtime": 1319.6181, "train_tokens_per_second": 291.759 }, { "epoch": 1.2903225806451613, "grad_norm": 0.6677728295326233, "learning_rate": 0.00019252123996417738, "loss": 0.5565, "num_input_tokens_seen": 387512, "step": 160, "train_runtime": 1327.4482, "train_tokens_per_second": 291.923 }, { "epoch": 1.2983870967741935, "grad_norm": 0.48728442192077637, "learning_rate": 0.00019242441633563417, "loss": 0.5279, "num_input_tokens_seen": 389928, "step": 161, "train_runtime": 1335.0869, "train_tokens_per_second": 292.062 }, { "epoch": 1.3064516129032258, "grad_norm": 0.5674954056739807, "learning_rate": 0.00019232699463668542, "loss": 0.5222, "num_input_tokens_seen": 392411, "step": 162, "train_runtime": 1342.9323, "train_tokens_per_second": 292.205 }, { "epoch": 1.314516129032258, "grad_norm": 0.5219017267227173, "learning_rate": 0.00019222897549773848, "loss": 0.4657, "num_input_tokens_seen": 394800, "step": 163, "train_runtime": 1350.5582, "train_tokens_per_second": 292.324 }, { "epoch": 1.3225806451612903, "grad_norm": 0.571649968624115, "learning_rate": 0.0001921303595530667, "loss": 0.5388, "num_input_tokens_seen": 397315, "step": 164, "train_runtime": 1358.4396, "train_tokens_per_second": 292.479 }, { "epoch": 1.3306451612903225, "grad_norm": 0.5281589031219482, "learning_rate": 0.00019203114744080542, "loss": 0.5208, "num_input_tokens_seen": 399749, "step": 165, "train_runtime": 1366.1378, "train_tokens_per_second": 292.613 }, { "epoch": 1.3387096774193548, "grad_norm": 0.5312926769256592, "learning_rate": 0.0001919313398029475, "loss": 0.4603, "num_input_tokens_seen": 402110, "step": 166, "train_runtime": 1373.5564, "train_tokens_per_second": 292.751 }, { "epoch": 1.346774193548387, "grad_norm": 0.4809509813785553, "learning_rate": 0.00019183093728533966, "loss": 0.4728, "num_input_tokens_seen": 404491, "step": 167, "train_runtime": 1381.0939, "train_tokens_per_second": 292.877 }, { "epoch": 1.3548387096774195, "grad_norm": 0.5114589929580688, "learning_rate": 0.00019172994053767784, "loss": 0.5131, "num_input_tokens_seen": 406930, "step": 168, "train_runtime": 1388.7721, "train_tokens_per_second": 293.014 }, { "epoch": 1.3629032258064515, "grad_norm": 0.508705198764801, "learning_rate": 0.0001916283502135033, "loss": 0.4703, "num_input_tokens_seen": 409324, "step": 169, "train_runtime": 1396.2705, "train_tokens_per_second": 293.155 }, { "epoch": 1.370967741935484, "grad_norm": 0.5240468382835388, "learning_rate": 0.00019152616697019822, "loss": 0.3741, "num_input_tokens_seen": 411559, "step": 170, "train_runtime": 1403.353, "train_tokens_per_second": 293.268 }, { "epoch": 1.379032258064516, "grad_norm": 0.5561989545822144, "learning_rate": 0.0001914233914689815, "loss": 0.5322, "num_input_tokens_seen": 413970, "step": 171, "train_runtime": 1410.8552, "train_tokens_per_second": 293.418 }, { "epoch": 1.3870967741935485, "grad_norm": 0.4767642021179199, "learning_rate": 0.00019132002437490458, "loss": 0.4484, "num_input_tokens_seen": 416332, "step": 172, "train_runtime": 1418.2579, "train_tokens_per_second": 293.552 }, { "epoch": 1.3951612903225805, "grad_norm": 0.5309658646583557, "learning_rate": 0.00019121606635684696, "loss": 0.5277, "num_input_tokens_seen": 418754, "step": 173, "train_runtime": 1425.805, "train_tokens_per_second": 293.697 }, { "epoch": 1.403225806451613, "grad_norm": 0.5297260880470276, "learning_rate": 0.00019111151808751196, "loss": 0.5296, "num_input_tokens_seen": 421237, "step": 174, "train_runtime": 1433.5035, "train_tokens_per_second": 293.851 }, { "epoch": 1.4112903225806452, "grad_norm": 0.44537508487701416, "learning_rate": 0.00019100638024342244, "loss": 0.414, "num_input_tokens_seen": 423547, "step": 175, "train_runtime": 1440.7719, "train_tokens_per_second": 293.972 }, { "epoch": 1.4193548387096775, "grad_norm": 0.6102742552757263, "learning_rate": 0.00019090065350491626, "loss": 0.5886, "num_input_tokens_seen": 426041, "step": 176, "train_runtime": 1448.4454, "train_tokens_per_second": 294.137 }, { "epoch": 1.4274193548387097, "grad_norm": 0.4876335561275482, "learning_rate": 0.00019079433855614201, "loss": 0.5759, "num_input_tokens_seen": 428526, "step": 177, "train_runtime": 1456.2038, "train_tokens_per_second": 294.276 }, { "epoch": 1.435483870967742, "grad_norm": 0.5169773101806641, "learning_rate": 0.00019068743608505455, "loss": 0.4094, "num_input_tokens_seen": 430831, "step": 178, "train_runtime": 1463.4832, "train_tokens_per_second": 294.387 }, { "epoch": 1.4435483870967742, "grad_norm": 0.48358380794525146, "learning_rate": 0.0001905799467834105, "loss": 0.4663, "num_input_tokens_seen": 433206, "step": 179, "train_runtime": 1470.9104, "train_tokens_per_second": 294.516 }, { "epoch": 1.4516129032258065, "grad_norm": 0.558736264705658, "learning_rate": 0.00019047187134676387, "loss": 0.497, "num_input_tokens_seen": 435677, "step": 180, "train_runtime": 1478.6376, "train_tokens_per_second": 294.648 }, { "epoch": 1.4516129032258065, "eval_loss": 0.5531938076019287, "eval_runtime": 17.116, "eval_samples_per_second": 3.038, "eval_steps_per_second": 1.519, "num_input_tokens_seen": 435677, "step": 180 }, { "epoch": 1.4596774193548387, "grad_norm": 0.58433598279953, "learning_rate": 0.0001903632104744614, "loss": 0.5675, "num_input_tokens_seen": 438193, "step": 181, "train_runtime": 1504.1217, "train_tokens_per_second": 291.328 }, { "epoch": 1.467741935483871, "grad_norm": 0.5542019009590149, "learning_rate": 0.00019025396486963827, "loss": 0.4435, "num_input_tokens_seen": 440535, "step": 182, "train_runtime": 1511.568, "train_tokens_per_second": 291.442 }, { "epoch": 1.4758064516129032, "grad_norm": 0.6918825507164001, "learning_rate": 0.0001901441352392133, "loss": 0.5289, "num_input_tokens_seen": 442946, "step": 183, "train_runtime": 1519.2246, "train_tokens_per_second": 291.561 }, { "epoch": 1.4838709677419355, "grad_norm": 0.5807278156280518, "learning_rate": 0.00019003372229388452, "loss": 0.5884, "num_input_tokens_seen": 445434, "step": 184, "train_runtime": 1527.0904, "train_tokens_per_second": 291.688 }, { "epoch": 1.4919354838709677, "grad_norm": 0.4809093475341797, "learning_rate": 0.0001899227267481246, "loss": 0.5106, "num_input_tokens_seen": 447865, "step": 185, "train_runtime": 1534.7557, "train_tokens_per_second": 291.815 }, { "epoch": 1.5, "grad_norm": 0.5059263706207275, "learning_rate": 0.00018981114932017609, "loss": 0.5066, "num_input_tokens_seen": 450249, "step": 186, "train_runtime": 1542.3188, "train_tokens_per_second": 291.93 }, { "epoch": 1.5080645161290323, "grad_norm": 0.5323978662490845, "learning_rate": 0.00018969899073204686, "loss": 0.5413, "num_input_tokens_seen": 452720, "step": 187, "train_runtime": 1550.0997, "train_tokens_per_second": 292.059 }, { "epoch": 1.5161290322580645, "grad_norm": 0.5239408016204834, "learning_rate": 0.00018958625170950545, "loss": 0.4529, "num_input_tokens_seen": 455060, "step": 188, "train_runtime": 1557.5355, "train_tokens_per_second": 292.167 }, { "epoch": 1.5241935483870968, "grad_norm": 0.5529743432998657, "learning_rate": 0.00018947293298207635, "loss": 0.5093, "num_input_tokens_seen": 457475, "step": 189, "train_runtime": 1565.1928, "train_tokens_per_second": 292.28 }, { "epoch": 1.532258064516129, "grad_norm": 0.4671318233013153, "learning_rate": 0.00018935903528303523, "loss": 0.4162, "num_input_tokens_seen": 459738, "step": 190, "train_runtime": 1572.4177, "train_tokens_per_second": 292.377 }, { "epoch": 1.5403225806451613, "grad_norm": 0.4464218318462372, "learning_rate": 0.0001892445593494042, "loss": 0.465, "num_input_tokens_seen": 462172, "step": 191, "train_runtime": 1580.1066, "train_tokens_per_second": 292.494 }, { "epoch": 1.5483870967741935, "grad_norm": 0.4810047745704651, "learning_rate": 0.0001891295059219472, "loss": 0.4945, "num_input_tokens_seen": 464558, "step": 192, "train_runtime": 1587.6273, "train_tokens_per_second": 292.611 }, { "epoch": 1.5564516129032258, "grad_norm": 0.5889862179756165, "learning_rate": 0.00018901387574516497, "loss": 0.4944, "num_input_tokens_seen": 466950, "step": 193, "train_runtime": 1595.149, "train_tokens_per_second": 292.731 }, { "epoch": 1.564516129032258, "grad_norm": 0.6099379658699036, "learning_rate": 0.00018889766956729044, "loss": 0.483, "num_input_tokens_seen": 469382, "step": 194, "train_runtime": 1602.8182, "train_tokens_per_second": 292.848 }, { "epoch": 1.5725806451612905, "grad_norm": 0.5440390110015869, "learning_rate": 0.00018878088814028364, "loss": 0.4593, "num_input_tokens_seen": 471803, "step": 195, "train_runtime": 1610.4242, "train_tokens_per_second": 292.968 }, { "epoch": 1.5806451612903225, "grad_norm": 0.5005755424499512, "learning_rate": 0.00018866353221982718, "loss": 0.4294, "num_input_tokens_seen": 474211, "step": 196, "train_runtime": 1618.001, "train_tokens_per_second": 293.084 }, { "epoch": 1.588709677419355, "grad_norm": 0.6077679991722107, "learning_rate": 0.000188545602565321, "loss": 0.5771, "num_input_tokens_seen": 476689, "step": 197, "train_runtime": 1625.6796, "train_tokens_per_second": 293.224 }, { "epoch": 1.596774193548387, "grad_norm": 0.4779480993747711, "learning_rate": 0.00018842709993987776, "loss": 0.4385, "num_input_tokens_seen": 479013, "step": 198, "train_runtime": 1633.0479, "train_tokens_per_second": 293.325 }, { "epoch": 1.6048387096774195, "grad_norm": 0.4863346815109253, "learning_rate": 0.00018830802511031762, "loss": 0.4334, "num_input_tokens_seen": 481374, "step": 199, "train_runtime": 1640.4972, "train_tokens_per_second": 293.432 }, { "epoch": 1.6129032258064515, "grad_norm": 0.518204391002655, "learning_rate": 0.0001881883788471636, "loss": 0.4768, "num_input_tokens_seen": 483799, "step": 200, "train_runtime": 1648.0818, "train_tokens_per_second": 293.553 }, { "epoch": 1.620967741935484, "grad_norm": 0.5243327021598816, "learning_rate": 0.00018806816192463625, "loss": 0.5814, "num_input_tokens_seen": 486348, "step": 201, "train_runtime": 1655.9994, "train_tokens_per_second": 293.689 }, { "epoch": 1.629032258064516, "grad_norm": 0.5013379454612732, "learning_rate": 0.0001879473751206489, "loss": 0.4501, "num_input_tokens_seen": 488725, "step": 202, "train_runtime": 1663.4446, "train_tokens_per_second": 293.803 }, { "epoch": 1.6370967741935485, "grad_norm": 0.524978756904602, "learning_rate": 0.00018782601921680256, "loss": 0.5842, "num_input_tokens_seen": 491239, "step": 203, "train_runtime": 1671.245, "train_tokens_per_second": 293.936 }, { "epoch": 1.6451612903225805, "grad_norm": 0.5189937353134155, "learning_rate": 0.00018770409499838073, "loss": 0.5237, "num_input_tokens_seen": 493653, "step": 204, "train_runtime": 1678.7904, "train_tokens_per_second": 294.053 }, { "epoch": 1.653225806451613, "grad_norm": 0.4908079504966736, "learning_rate": 0.0001875816032543445, "loss": 0.4633, "num_input_tokens_seen": 496049, "step": 205, "train_runtime": 1686.2657, "train_tokens_per_second": 294.17 }, { "epoch": 1.661290322580645, "grad_norm": 0.45395177602767944, "learning_rate": 0.00018745854477732733, "loss": 0.4658, "num_input_tokens_seen": 498435, "step": 206, "train_runtime": 1693.8087, "train_tokens_per_second": 294.269 }, { "epoch": 1.6693548387096775, "grad_norm": 0.47721126675605774, "learning_rate": 0.00018733492036363005, "loss": 0.5109, "num_input_tokens_seen": 500909, "step": 207, "train_runtime": 1701.5132, "train_tokens_per_second": 294.39 }, { "epoch": 1.6774193548387095, "grad_norm": 0.48570311069488525, "learning_rate": 0.0001872107308132155, "loss": 0.4805, "num_input_tokens_seen": 503339, "step": 208, "train_runtime": 1709.082, "train_tokens_per_second": 294.508 }, { "epoch": 1.685483870967742, "grad_norm": 0.46823814511299133, "learning_rate": 0.00018708597692970353, "loss": 0.4247, "num_input_tokens_seen": 505708, "step": 209, "train_runtime": 1716.5731, "train_tokens_per_second": 294.603 }, { "epoch": 1.6935483870967742, "grad_norm": 0.5073066353797913, "learning_rate": 0.00018696065952036571, "loss": 0.4316, "num_input_tokens_seen": 507981, "step": 210, "train_runtime": 1723.7836, "train_tokens_per_second": 294.69 }, { "epoch": 1.6935483870967742, "eval_loss": 0.5447993874549866, "eval_runtime": 17.1405, "eval_samples_per_second": 3.034, "eval_steps_per_second": 1.517, "num_input_tokens_seen": 507981, "step": 210 }, { "epoch": 1.7016129032258065, "grad_norm": 0.5070942044258118, "learning_rate": 0.00018683477939612021, "loss": 0.4498, "num_input_tokens_seen": 510398, "step": 211, "train_runtime": 1748.9718, "train_tokens_per_second": 291.827 }, { "epoch": 1.7096774193548387, "grad_norm": 0.5081866383552551, "learning_rate": 0.0001867083373715264, "loss": 0.5194, "num_input_tokens_seen": 512927, "step": 212, "train_runtime": 1756.9223, "train_tokens_per_second": 291.946 }, { "epoch": 1.717741935483871, "grad_norm": 0.5030050277709961, "learning_rate": 0.00018658133426477965, "loss": 0.576, "num_input_tokens_seen": 515446, "step": 213, "train_runtime": 1764.7796, "train_tokens_per_second": 292.074 }, { "epoch": 1.7258064516129032, "grad_norm": 0.46002885699272156, "learning_rate": 0.00018645377089770616, "loss": 0.3902, "num_input_tokens_seen": 517761, "step": 214, "train_runtime": 1772.0597, "train_tokens_per_second": 292.18 }, { "epoch": 1.7338709677419355, "grad_norm": 0.5473560690879822, "learning_rate": 0.00018632564809575742, "loss": 0.5174, "num_input_tokens_seen": 520212, "step": 215, "train_runtime": 1779.8104, "train_tokens_per_second": 292.285 }, { "epoch": 1.7419354838709677, "grad_norm": 0.5015368461608887, "learning_rate": 0.00018619696668800492, "loss": 0.4974, "num_input_tokens_seen": 522595, "step": 216, "train_runtime": 1787.3722, "train_tokens_per_second": 292.382 }, { "epoch": 1.75, "grad_norm": 0.5964553952217102, "learning_rate": 0.00018606772750713504, "loss": 0.531, "num_input_tokens_seen": 525075, "step": 217, "train_runtime": 1795.0892, "train_tokens_per_second": 292.506 }, { "epoch": 1.7580645161290323, "grad_norm": 0.47636234760284424, "learning_rate": 0.00018593793138944328, "loss": 0.4391, "num_input_tokens_seen": 527478, "step": 218, "train_runtime": 1802.6529, "train_tokens_per_second": 292.612 }, { "epoch": 1.7661290322580645, "grad_norm": 0.5664883255958557, "learning_rate": 0.0001858075791748291, "loss": 0.5283, "num_input_tokens_seen": 529954, "step": 219, "train_runtime": 1810.4252, "train_tokens_per_second": 292.724 }, { "epoch": 1.7741935483870968, "grad_norm": 0.5361610054969788, "learning_rate": 0.0001856766717067904, "loss": 0.5343, "num_input_tokens_seen": 532412, "step": 220, "train_runtime": 1818.1176, "train_tokens_per_second": 292.837 }, { "epoch": 1.782258064516129, "grad_norm": 0.48824912309646606, "learning_rate": 0.00018554520983241814, "loss": 0.5223, "num_input_tokens_seen": 534838, "step": 221, "train_runtime": 1825.728, "train_tokens_per_second": 292.945 }, { "epoch": 1.7903225806451613, "grad_norm": 0.5257652401924133, "learning_rate": 0.00018541319440239066, "loss": 0.508, "num_input_tokens_seen": 537339, "step": 222, "train_runtime": 1833.5362, "train_tokens_per_second": 293.062 }, { "epoch": 1.7983870967741935, "grad_norm": 0.4608156681060791, "learning_rate": 0.00018528062627096845, "loss": 0.4316, "num_input_tokens_seen": 539716, "step": 223, "train_runtime": 1841.1084, "train_tokens_per_second": 293.147 }, { "epoch": 1.8064516129032258, "grad_norm": 0.6113476157188416, "learning_rate": 0.0001851475062959884, "loss": 0.5363, "num_input_tokens_seen": 542226, "step": 224, "train_runtime": 1848.9709, "train_tokens_per_second": 293.258 }, { "epoch": 1.814516129032258, "grad_norm": 0.5347176194190979, "learning_rate": 0.00018501383533885837, "loss": 0.4869, "num_input_tokens_seen": 544617, "step": 225, "train_runtime": 1856.5371, "train_tokens_per_second": 293.351 }, { "epoch": 1.8225806451612905, "grad_norm": 0.5930202603340149, "learning_rate": 0.00018487961426455157, "loss": 0.5905, "num_input_tokens_seen": 547121, "step": 226, "train_runtime": 1864.3896, "train_tokens_per_second": 293.459 }, { "epoch": 1.8306451612903225, "grad_norm": 0.5011629462242126, "learning_rate": 0.0001847448439416009, "loss": 0.5145, "num_input_tokens_seen": 549555, "step": 227, "train_runtime": 1872.0413, "train_tokens_per_second": 293.559 }, { "epoch": 1.838709677419355, "grad_norm": 0.6015912890434265, "learning_rate": 0.00018460952524209355, "loss": 0.4399, "num_input_tokens_seen": 551862, "step": 228, "train_runtime": 1879.419, "train_tokens_per_second": 293.634 }, { "epoch": 1.846774193548387, "grad_norm": 0.5135874152183533, "learning_rate": 0.0001844736590416651, "loss": 0.4815, "num_input_tokens_seen": 554312, "step": 229, "train_runtime": 1887.1751, "train_tokens_per_second": 293.726 }, { "epoch": 1.8548387096774195, "grad_norm": 0.5236052870750427, "learning_rate": 0.00018433724621949392, "loss": 0.5149, "num_input_tokens_seen": 556789, "step": 230, "train_runtime": 1894.9542, "train_tokens_per_second": 293.827 }, { "epoch": 1.8629032258064515, "grad_norm": 0.46906578540802, "learning_rate": 0.00018420028765829568, "loss": 0.4666, "num_input_tokens_seen": 559163, "step": 231, "train_runtime": 1902.5378, "train_tokens_per_second": 293.904 }, { "epoch": 1.870967741935484, "grad_norm": 0.5261631011962891, "learning_rate": 0.00018406278424431736, "loss": 0.4782, "num_input_tokens_seen": 561579, "step": 232, "train_runtime": 1910.1668, "train_tokens_per_second": 293.995 }, { "epoch": 1.879032258064516, "grad_norm": 0.5413320064544678, "learning_rate": 0.00018392473686733163, "loss": 0.5822, "num_input_tokens_seen": 564116, "step": 233, "train_runtime": 1918.1419, "train_tokens_per_second": 294.095 }, { "epoch": 1.8870967741935485, "grad_norm": 0.4339919686317444, "learning_rate": 0.00018378614642063115, "loss": 0.3655, "num_input_tokens_seen": 566348, "step": 234, "train_runtime": 1925.3367, "train_tokens_per_second": 294.155 }, { "epoch": 1.8951612903225805, "grad_norm": 0.4839528501033783, "learning_rate": 0.00018364701380102266, "loss": 0.3749, "num_input_tokens_seen": 568609, "step": 235, "train_runtime": 1932.5907, "train_tokens_per_second": 294.221 }, { "epoch": 1.903225806451613, "grad_norm": 0.5061867833137512, "learning_rate": 0.0001835073399088214, "loss": 0.564, "num_input_tokens_seen": 571056, "step": 236, "train_runtime": 1940.3213, "train_tokens_per_second": 294.31 }, { "epoch": 1.911290322580645, "grad_norm": 0.492093026638031, "learning_rate": 0.00018336712564784503, "loss": 0.5252, "num_input_tokens_seen": 573516, "step": 237, "train_runtime": 1948.0561, "train_tokens_per_second": 294.404 }, { "epoch": 1.9193548387096775, "grad_norm": 0.48326292634010315, "learning_rate": 0.00018322637192540785, "loss": 0.4631, "num_input_tokens_seen": 575864, "step": 238, "train_runtime": 1955.6449, "train_tokens_per_second": 294.462 }, { "epoch": 1.9274193548387095, "grad_norm": 0.5116320848464966, "learning_rate": 0.00018308507965231508, "loss": 0.4509, "num_input_tokens_seen": 578253, "step": 239, "train_runtime": 1963.2546, "train_tokens_per_second": 294.538 }, { "epoch": 1.935483870967742, "grad_norm": 0.4807403087615967, "learning_rate": 0.00018294324974285677, "loss": 0.4697, "num_input_tokens_seen": 580659, "step": 240, "train_runtime": 1970.8614, "train_tokens_per_second": 294.622 }, { "epoch": 1.935483870967742, "eval_loss": 0.5381087064743042, "eval_runtime": 17.118, "eval_samples_per_second": 3.038, "eval_steps_per_second": 1.519, "num_input_tokens_seen": 580659, "step": 240 }, { "epoch": 1.9435483870967742, "grad_norm": 0.4913121163845062, "learning_rate": 0.00018280088311480201, "loss": 0.4832, "num_input_tokens_seen": 583091, "step": 241, "train_runtime": 1996.1377, "train_tokens_per_second": 292.11 }, { "epoch": 1.9516129032258065, "grad_norm": 0.4860823154449463, "learning_rate": 0.00018265798068939294, "loss": 0.4735, "num_input_tokens_seen": 585429, "step": 242, "train_runtime": 2003.6251, "train_tokens_per_second": 292.185 }, { "epoch": 1.9596774193548387, "grad_norm": 0.501605212688446, "learning_rate": 0.0001825145433913388, "loss": 0.4724, "num_input_tokens_seen": 587740, "step": 243, "train_runtime": 2010.9061, "train_tokens_per_second": 292.276 }, { "epoch": 1.967741935483871, "grad_norm": 0.6114291548728943, "learning_rate": 0.00018237057214880994, "loss": 0.5296, "num_input_tokens_seen": 590200, "step": 244, "train_runtime": 2018.6341, "train_tokens_per_second": 292.376 }, { "epoch": 1.9758064516129032, "grad_norm": 0.5196385383605957, "learning_rate": 0.00018222606789343183, "loss": 0.4503, "num_input_tokens_seen": 592568, "step": 245, "train_runtime": 2026.0979, "train_tokens_per_second": 292.468 }, { "epoch": 1.9838709677419355, "grad_norm": 0.5557199120521545, "learning_rate": 0.00018208103156027897, "loss": 0.4142, "num_input_tokens_seen": 594859, "step": 246, "train_runtime": 2033.4129, "train_tokens_per_second": 292.542 }, { "epoch": 1.9919354838709677, "grad_norm": 0.46165671944618225, "learning_rate": 0.00018193546408786898, "loss": 0.4176, "num_input_tokens_seen": 597173, "step": 247, "train_runtime": 2040.7772, "train_tokens_per_second": 292.62 }, { "epoch": 2.0, "grad_norm": 0.47444823384284973, "learning_rate": 0.00018178936641815636, "loss": 0.5027, "num_input_tokens_seen": 599536, "step": 248, "train_runtime": 2048.2447, "train_tokens_per_second": 292.707 }, { "epoch": 2.0080645161290325, "grad_norm": 0.4732966721057892, "learning_rate": 0.0001816427394965265, "loss": 0.4184, "num_input_tokens_seen": 601912, "step": 249, "train_runtime": 2055.7988, "train_tokens_per_second": 292.787 }, { "epoch": 2.0161290322580645, "grad_norm": 0.4494001567363739, "learning_rate": 0.00018149558427178956, "loss": 0.3894, "num_input_tokens_seen": 604356, "step": 250, "train_runtime": 2063.5321, "train_tokens_per_second": 292.875 }, { "epoch": 2.024193548387097, "grad_norm": 0.44498300552368164, "learning_rate": 0.00018134790169617419, "loss": 0.3898, "num_input_tokens_seen": 606702, "step": 251, "train_runtime": 2070.9435, "train_tokens_per_second": 292.959 }, { "epoch": 2.032258064516129, "grad_norm": 0.4384775161743164, "learning_rate": 0.00018119969272532166, "loss": 0.3653, "num_input_tokens_seen": 609049, "step": 252, "train_runtime": 2078.4436, "train_tokens_per_second": 293.031 }, { "epoch": 2.0403225806451615, "grad_norm": 0.5194664597511292, "learning_rate": 0.00018105095831827934, "loss": 0.4256, "num_input_tokens_seen": 611538, "step": 253, "train_runtime": 2086.2988, "train_tokens_per_second": 293.121 }, { "epoch": 2.0483870967741935, "grad_norm": 0.591899037361145, "learning_rate": 0.00018090169943749476, "loss": 0.4612, "num_input_tokens_seen": 613951, "step": 254, "train_runtime": 2093.9481, "train_tokens_per_second": 293.203 }, { "epoch": 2.056451612903226, "grad_norm": 0.45229119062423706, "learning_rate": 0.0001807519170488092, "loss": 0.3442, "num_input_tokens_seen": 616318, "step": 255, "train_runtime": 2101.4584, "train_tokens_per_second": 293.281 }, { "epoch": 2.064516129032258, "grad_norm": 0.5321218371391296, "learning_rate": 0.00018060161212145155, "loss": 0.3251, "num_input_tokens_seen": 618622, "step": 256, "train_runtime": 2108.7139, "train_tokens_per_second": 293.365 }, { "epoch": 2.0725806451612905, "grad_norm": 0.534270167350769, "learning_rate": 0.00018045078562803203, "loss": 0.5195, "num_input_tokens_seen": 621111, "step": 257, "train_runtime": 2116.5523, "train_tokens_per_second": 293.454 }, { "epoch": 2.0806451612903225, "grad_norm": 0.5846385359764099, "learning_rate": 0.00018029943854453576, "loss": 0.5031, "num_input_tokens_seen": 623620, "step": 258, "train_runtime": 2124.4123, "train_tokens_per_second": 293.549 }, { "epoch": 2.088709677419355, "grad_norm": 0.47890356183052063, "learning_rate": 0.00018014757185031671, "loss": 0.3725, "num_input_tokens_seen": 625929, "step": 259, "train_runtime": 2131.761, "train_tokens_per_second": 293.621 }, { "epoch": 2.096774193548387, "grad_norm": 0.5304666757583618, "learning_rate": 0.0001799951865280911, "loss": 0.385, "num_input_tokens_seen": 628263, "step": 260, "train_runtime": 2139.2295, "train_tokens_per_second": 293.687 }, { "epoch": 2.1048387096774195, "grad_norm": 0.5279107689857483, "learning_rate": 0.00017984228356393117, "loss": 0.437, "num_input_tokens_seen": 630646, "step": 261, "train_runtime": 2146.7966, "train_tokens_per_second": 293.761 }, { "epoch": 2.1129032258064515, "grad_norm": 0.5816720128059387, "learning_rate": 0.00017968886394725874, "loss": 0.451, "num_input_tokens_seen": 633104, "step": 262, "train_runtime": 2154.5982, "train_tokens_per_second": 293.839 }, { "epoch": 2.120967741935484, "grad_norm": 0.5712978839874268, "learning_rate": 0.00017953492867083895, "loss": 0.5002, "num_input_tokens_seen": 635619, "step": 263, "train_runtime": 2162.5316, "train_tokens_per_second": 293.924 }, { "epoch": 2.129032258064516, "grad_norm": 0.49511051177978516, "learning_rate": 0.00017938047873077362, "loss": 0.4396, "num_input_tokens_seen": 638049, "step": 264, "train_runtime": 2170.2108, "train_tokens_per_second": 294.003 }, { "epoch": 2.1370967741935485, "grad_norm": 0.5378257632255554, "learning_rate": 0.00017922551512649496, "loss": 0.4671, "num_input_tokens_seen": 640517, "step": 265, "train_runtime": 2178.0065, "train_tokens_per_second": 294.084 }, { "epoch": 2.1451612903225805, "grad_norm": 0.5713797807693481, "learning_rate": 0.00017907003886075904, "loss": 0.4272, "num_input_tokens_seen": 642961, "step": 266, "train_runtime": 2185.7486, "train_tokens_per_second": 294.161 }, { "epoch": 2.153225806451613, "grad_norm": 0.5059149861335754, "learning_rate": 0.00017891405093963938, "loss": 0.4128, "num_input_tokens_seen": 645335, "step": 267, "train_runtime": 2193.2848, "train_tokens_per_second": 294.232 }, { "epoch": 2.161290322580645, "grad_norm": 0.5088847875595093, "learning_rate": 0.00017875755237252027, "loss": 0.3906, "num_input_tokens_seen": 647684, "step": 268, "train_runtime": 2200.7799, "train_tokens_per_second": 294.297 }, { "epoch": 2.1693548387096775, "grad_norm": 0.5963711142539978, "learning_rate": 0.00017860054417209042, "loss": 0.4691, "num_input_tokens_seen": 650205, "step": 269, "train_runtime": 2208.6495, "train_tokens_per_second": 294.39 }, { "epoch": 2.1774193548387095, "grad_norm": 0.6186060905456543, "learning_rate": 0.00017844302735433635, "loss": 0.4597, "num_input_tokens_seen": 652668, "step": 270, "train_runtime": 2216.3826, "train_tokens_per_second": 294.474 }, { "epoch": 2.1774193548387095, "eval_loss": 0.5312316417694092, "eval_runtime": 17.1243, "eval_samples_per_second": 3.037, "eval_steps_per_second": 1.518, "num_input_tokens_seen": 652668, "step": 270 }, { "epoch": 2.185483870967742, "grad_norm": 0.5696012377738953, "learning_rate": 0.00017828500293853576, "loss": 0.4434, "num_input_tokens_seen": 655108, "step": 271, "train_runtime": 2241.6839, "train_tokens_per_second": 292.239 }, { "epoch": 2.193548387096774, "grad_norm": 0.4818873107433319, "learning_rate": 0.00017812647194725094, "loss": 0.3463, "num_input_tokens_seen": 657419, "step": 272, "train_runtime": 2249.0146, "train_tokens_per_second": 292.314 }, { "epoch": 2.2016129032258065, "grad_norm": 0.5452220439910889, "learning_rate": 0.00017796743540632223, "loss": 0.3752, "num_input_tokens_seen": 659850, "step": 273, "train_runtime": 2256.6279, "train_tokens_per_second": 292.405 }, { "epoch": 2.2096774193548385, "grad_norm": 0.6274825930595398, "learning_rate": 0.0001778078943448614, "loss": 0.4796, "num_input_tokens_seen": 662308, "step": 274, "train_runtime": 2264.3474, "train_tokens_per_second": 292.494 }, { "epoch": 2.217741935483871, "grad_norm": 0.5836248397827148, "learning_rate": 0.00017764784979524477, "loss": 0.3798, "num_input_tokens_seen": 664650, "step": 275, "train_runtime": 2271.7662, "train_tokens_per_second": 292.57 }, { "epoch": 2.225806451612903, "grad_norm": 0.6024234294891357, "learning_rate": 0.00017748730279310685, "loss": 0.4063, "num_input_tokens_seen": 667075, "step": 276, "train_runtime": 2279.3952, "train_tokens_per_second": 292.654 }, { "epoch": 2.2338709677419355, "grad_norm": 0.5628243088722229, "learning_rate": 0.00017732625437733335, "loss": 0.4282, "num_input_tokens_seen": 669468, "step": 277, "train_runtime": 2286.8838, "train_tokens_per_second": 292.742 }, { "epoch": 2.241935483870968, "grad_norm": 0.5293336510658264, "learning_rate": 0.00017716470559005473, "loss": 0.4309, "num_input_tokens_seen": 671950, "step": 278, "train_runtime": 2294.6117, "train_tokens_per_second": 292.838 }, { "epoch": 2.25, "grad_norm": 0.5426146984100342, "learning_rate": 0.0001770026574766391, "loss": 0.4052, "num_input_tokens_seen": 674433, "step": 279, "train_runtime": 2302.3887, "train_tokens_per_second": 292.928 }, { "epoch": 2.258064516129032, "grad_norm": 0.6207026839256287, "learning_rate": 0.00017684011108568592, "loss": 0.5071, "num_input_tokens_seen": 676912, "step": 280, "train_runtime": 2310.1413, "train_tokens_per_second": 293.018 }, { "epoch": 2.2661290322580645, "grad_norm": 0.5530769228935242, "learning_rate": 0.0001766770674690187, "loss": 0.4343, "num_input_tokens_seen": 679269, "step": 281, "train_runtime": 2317.5627, "train_tokens_per_second": 293.096 }, { "epoch": 2.274193548387097, "grad_norm": 0.564753532409668, "learning_rate": 0.0001765135276816787, "loss": 0.4786, "num_input_tokens_seen": 681764, "step": 282, "train_runtime": 2325.405, "train_tokens_per_second": 293.181 }, { "epoch": 2.282258064516129, "grad_norm": 0.5628737807273865, "learning_rate": 0.0001763494927819177, "loss": 0.446, "num_input_tokens_seen": 684242, "step": 283, "train_runtime": 2333.1186, "train_tokens_per_second": 293.274 }, { "epoch": 2.2903225806451615, "grad_norm": 0.5662450194358826, "learning_rate": 0.00017618496383119128, "loss": 0.4688, "num_input_tokens_seen": 686667, "step": 284, "train_runtime": 2340.7592, "train_tokens_per_second": 293.352 }, { "epoch": 2.2983870967741935, "grad_norm": 0.5666800737380981, "learning_rate": 0.0001760199418941521, "loss": 0.453, "num_input_tokens_seen": 689157, "step": 285, "train_runtime": 2348.5315, "train_tokens_per_second": 293.442 }, { "epoch": 2.306451612903226, "grad_norm": 0.557955265045166, "learning_rate": 0.00017585442803864294, "loss": 0.4425, "num_input_tokens_seen": 691634, "step": 286, "train_runtime": 2356.2504, "train_tokens_per_second": 293.532 }, { "epoch": 2.314516129032258, "grad_norm": 0.48944398760795593, "learning_rate": 0.00017568842333568952, "loss": 0.3574, "num_input_tokens_seen": 693987, "step": 287, "train_runtime": 2363.644, "train_tokens_per_second": 293.609 }, { "epoch": 2.3225806451612905, "grad_norm": 0.5263356566429138, "learning_rate": 0.00017552192885949395, "loss": 0.4162, "num_input_tokens_seen": 696441, "step": 288, "train_runtime": 2371.3258, "train_tokens_per_second": 293.693 }, { "epoch": 2.3306451612903225, "grad_norm": 0.5621548891067505, "learning_rate": 0.0001753549456874276, "loss": 0.4279, "num_input_tokens_seen": 698839, "step": 289, "train_runtime": 2378.8114, "train_tokens_per_second": 293.777 }, { "epoch": 2.338709677419355, "grad_norm": 0.611740231513977, "learning_rate": 0.00017518747490002413, "loss": 0.4416, "num_input_tokens_seen": 701344, "step": 290, "train_runtime": 2386.6276, "train_tokens_per_second": 293.864 }, { "epoch": 2.346774193548387, "grad_norm": 0.6447325944900513, "learning_rate": 0.00017501951758097257, "loss": 0.4726, "num_input_tokens_seen": 703751, "step": 291, "train_runtime": 2394.2032, "train_tokens_per_second": 293.94 }, { "epoch": 2.3548387096774195, "grad_norm": 0.5147153735160828, "learning_rate": 0.00017485107481711012, "loss": 0.385, "num_input_tokens_seen": 706194, "step": 292, "train_runtime": 2401.889, "train_tokens_per_second": 294.016 }, { "epoch": 2.3629032258064515, "grad_norm": 0.5166308879852295, "learning_rate": 0.0001746821476984154, "loss": 0.3878, "num_input_tokens_seen": 708576, "step": 293, "train_runtime": 2409.389, "train_tokens_per_second": 294.089 }, { "epoch": 2.370967741935484, "grad_norm": 0.5455900430679321, "learning_rate": 0.00017451273731800115, "loss": 0.4115, "num_input_tokens_seen": 710954, "step": 294, "train_runtime": 2416.8337, "train_tokens_per_second": 294.168 }, { "epoch": 2.379032258064516, "grad_norm": 0.6575648188591003, "learning_rate": 0.00017434284477210735, "loss": 0.4544, "num_input_tokens_seen": 713370, "step": 295, "train_runtime": 2424.3973, "train_tokens_per_second": 294.246 }, { "epoch": 2.3870967741935485, "grad_norm": 0.5806699991226196, "learning_rate": 0.00017417247116009388, "loss": 0.4084, "num_input_tokens_seen": 715800, "step": 296, "train_runtime": 2431.961, "train_tokens_per_second": 294.33 }, { "epoch": 2.3951612903225805, "grad_norm": 0.6843745112419128, "learning_rate": 0.00017400161758443375, "loss": 0.5063, "num_input_tokens_seen": 718378, "step": 297, "train_runtime": 2439.9431, "train_tokens_per_second": 294.424 }, { "epoch": 2.403225806451613, "grad_norm": 0.6058449149131775, "learning_rate": 0.0001738302851507056, "loss": 0.4703, "num_input_tokens_seen": 720905, "step": 298, "train_runtime": 2447.8089, "train_tokens_per_second": 294.51 }, { "epoch": 2.411290322580645, "grad_norm": 0.5916767716407776, "learning_rate": 0.00017365847496758684, "loss": 0.456, "num_input_tokens_seen": 723346, "step": 299, "train_runtime": 2455.4811, "train_tokens_per_second": 294.584 }, { "epoch": 2.4193548387096775, "grad_norm": 0.6260201930999756, "learning_rate": 0.0001734861881468463, "loss": 0.3953, "num_input_tokens_seen": 725766, "step": 300, "train_runtime": 2463.1406, "train_tokens_per_second": 294.651 }, { "epoch": 2.4193548387096775, "eval_loss": 0.527380108833313, "eval_runtime": 17.1648, "eval_samples_per_second": 3.029, "eval_steps_per_second": 1.515, "num_input_tokens_seen": 725766, "step": 300 }, { "epoch": 2.4274193548387095, "grad_norm": 0.5748977661132812, "learning_rate": 0.00017331342580333706, "loss": 0.355, "num_input_tokens_seen": 728220, "step": 301, "train_runtime": 2488.5272, "train_tokens_per_second": 292.631 }, { "epoch": 2.435483870967742, "grad_norm": 0.5469366908073425, "learning_rate": 0.00017314018905498931, "loss": 0.4119, "num_input_tokens_seen": 730550, "step": 302, "train_runtime": 2495.9929, "train_tokens_per_second": 292.689 }, { "epoch": 2.443548387096774, "grad_norm": 0.5869015455245972, "learning_rate": 0.00017296647902280312, "loss": 0.4636, "num_input_tokens_seen": 733009, "step": 303, "train_runtime": 2503.7702, "train_tokens_per_second": 292.762 }, { "epoch": 2.4516129032258065, "grad_norm": 0.5205616354942322, "learning_rate": 0.00017279229683084103, "loss": 0.4071, "num_input_tokens_seen": 735380, "step": 304, "train_runtime": 2511.3332, "train_tokens_per_second": 292.825 }, { "epoch": 2.4596774193548385, "grad_norm": 0.5007433891296387, "learning_rate": 0.00017261764360622102, "loss": 0.315, "num_input_tokens_seen": 737686, "step": 305, "train_runtime": 2518.7119, "train_tokens_per_second": 292.882 }, { "epoch": 2.467741935483871, "grad_norm": 0.6229686141014099, "learning_rate": 0.00017244252047910892, "loss": 0.4113, "num_input_tokens_seen": 740115, "step": 306, "train_runtime": 2526.4292, "train_tokens_per_second": 292.949 }, { "epoch": 2.475806451612903, "grad_norm": 0.616976261138916, "learning_rate": 0.00017226692858271134, "loss": 0.3898, "num_input_tokens_seen": 742496, "step": 307, "train_runtime": 2533.9623, "train_tokens_per_second": 293.018 }, { "epoch": 2.4838709677419355, "grad_norm": 0.5927110314369202, "learning_rate": 0.00017209086905326833, "loss": 0.4703, "num_input_tokens_seen": 745018, "step": 308, "train_runtime": 2541.882, "train_tokens_per_second": 293.097 }, { "epoch": 2.491935483870968, "grad_norm": 0.5572441816329956, "learning_rate": 0.0001719143430300458, "loss": 0.4004, "num_input_tokens_seen": 747345, "step": 309, "train_runtime": 2549.3485, "train_tokens_per_second": 293.151 }, { "epoch": 2.5, "grad_norm": 0.5831232666969299, "learning_rate": 0.00017173735165532846, "loss": 0.4868, "num_input_tokens_seen": 749829, "step": 310, "train_runtime": 2557.1615, "train_tokens_per_second": 293.227 }, { "epoch": 2.508064516129032, "grad_norm": 0.6402316689491272, "learning_rate": 0.00017155989607441213, "loss": 0.4885, "num_input_tokens_seen": 752333, "step": 311, "train_runtime": 2564.9857, "train_tokens_per_second": 293.309 }, { "epoch": 2.5161290322580645, "grad_norm": 0.570831298828125, "learning_rate": 0.00017138197743559654, "loss": 0.4276, "num_input_tokens_seen": 754723, "step": 312, "train_runtime": 2572.5142, "train_tokens_per_second": 293.38 }, { "epoch": 2.524193548387097, "grad_norm": 0.579409122467041, "learning_rate": 0.0001712035968901778, "loss": 0.4517, "num_input_tokens_seen": 757147, "step": 313, "train_runtime": 2580.1465, "train_tokens_per_second": 293.451 }, { "epoch": 2.532258064516129, "grad_norm": 0.5915924906730652, "learning_rate": 0.00017102475559244105, "loss": 0.4085, "num_input_tokens_seen": 759595, "step": 314, "train_runtime": 2587.8168, "train_tokens_per_second": 293.527 }, { "epoch": 2.540322580645161, "grad_norm": 0.5570502281188965, "learning_rate": 0.00017084545469965283, "loss": 0.4807, "num_input_tokens_seen": 762102, "step": 315, "train_runtime": 2595.6436, "train_tokens_per_second": 293.608 }, { "epoch": 2.5483870967741935, "grad_norm": 0.5616797208786011, "learning_rate": 0.00017066569537205371, "loss": 0.4255, "num_input_tokens_seen": 764592, "step": 316, "train_runtime": 2603.4661, "train_tokens_per_second": 293.682 }, { "epoch": 2.556451612903226, "grad_norm": 0.5703938007354736, "learning_rate": 0.00017048547877285077, "loss": 0.4487, "num_input_tokens_seen": 767057, "step": 317, "train_runtime": 2611.2278, "train_tokens_per_second": 293.753 }, { "epoch": 2.564516129032258, "grad_norm": 0.520163893699646, "learning_rate": 0.00017030480606821, "loss": 0.3109, "num_input_tokens_seen": 769376, "step": 318, "train_runtime": 2618.6114, "train_tokens_per_second": 293.811 }, { "epoch": 2.5725806451612905, "grad_norm": 0.5977119207382202, "learning_rate": 0.00017012367842724887, "loss": 0.4699, "num_input_tokens_seen": 771849, "step": 319, "train_runtime": 2626.3552, "train_tokens_per_second": 293.886 }, { "epoch": 2.5806451612903225, "grad_norm": 0.5811707377433777, "learning_rate": 0.00016994209702202867, "loss": 0.4879, "num_input_tokens_seen": 774330, "step": 320, "train_runtime": 2634.1356, "train_tokens_per_second": 293.96 }, { "epoch": 2.588709677419355, "grad_norm": 0.4585582911968231, "learning_rate": 0.00016976006302754702, "loss": 0.3106, "num_input_tokens_seen": 776532, "step": 321, "train_runtime": 2641.1848, "train_tokens_per_second": 294.009 }, { "epoch": 2.596774193548387, "grad_norm": 0.5964781641960144, "learning_rate": 0.0001695775776217301, "loss": 0.4633, "num_input_tokens_seen": 779041, "step": 322, "train_runtime": 2649.044, "train_tokens_per_second": 294.084 }, { "epoch": 2.6048387096774195, "grad_norm": 0.5717709064483643, "learning_rate": 0.00016939464198542523, "loss": 0.4271, "num_input_tokens_seen": 781447, "step": 323, "train_runtime": 2656.6235, "train_tokens_per_second": 294.15 }, { "epoch": 2.6129032258064515, "grad_norm": 0.6638820171356201, "learning_rate": 0.00016921125730239307, "loss": 0.4948, "num_input_tokens_seen": 783883, "step": 324, "train_runtime": 2664.2512, "train_tokens_per_second": 294.223 }, { "epoch": 2.620967741935484, "grad_norm": 0.5894833207130432, "learning_rate": 0.00016902742475930006, "loss": 0.4224, "num_input_tokens_seen": 786317, "step": 325, "train_runtime": 2671.9046, "train_tokens_per_second": 294.291 }, { "epoch": 2.629032258064516, "grad_norm": 0.5976919531822205, "learning_rate": 0.00016884314554571064, "loss": 0.4773, "num_input_tokens_seen": 788765, "step": 326, "train_runtime": 2679.5878, "train_tokens_per_second": 294.361 }, { "epoch": 2.6370967741935485, "grad_norm": 0.5534376502037048, "learning_rate": 0.0001686584208540797, "loss": 0.3417, "num_input_tokens_seen": 791059, "step": 327, "train_runtime": 2686.9383, "train_tokens_per_second": 294.409 }, { "epoch": 2.6451612903225805, "grad_norm": 0.685295820236206, "learning_rate": 0.00016847325187974477, "loss": 0.3821, "num_input_tokens_seen": 793400, "step": 328, "train_runtime": 2694.3655, "train_tokens_per_second": 294.466 }, { "epoch": 2.653225806451613, "grad_norm": 0.569644033908844, "learning_rate": 0.00016828763982091826, "loss": 0.4001, "num_input_tokens_seen": 795767, "step": 329, "train_runtime": 2701.9131, "train_tokens_per_second": 294.52 }, { "epoch": 2.661290322580645, "grad_norm": 0.5384069681167603, "learning_rate": 0.00016810158587867973, "loss": 0.363, "num_input_tokens_seen": 798156, "step": 330, "train_runtime": 2709.485, "train_tokens_per_second": 294.578 }, { "epoch": 2.661290322580645, "eval_loss": 0.5271878838539124, "eval_runtime": 17.1438, "eval_samples_per_second": 3.033, "eval_steps_per_second": 1.517, "num_input_tokens_seen": 798156, "step": 330 }, { "epoch": 2.6693548387096775, "grad_norm": 0.5838504433631897, "learning_rate": 0.00016791509125696816, "loss": 0.3768, "num_input_tokens_seen": 800529, "step": 331, "train_runtime": 2734.6267, "train_tokens_per_second": 292.738 }, { "epoch": 2.6774193548387095, "grad_norm": 0.6681272983551025, "learning_rate": 0.00016772815716257412, "loss": 0.4815, "num_input_tokens_seen": 803033, "step": 332, "train_runtime": 2742.5363, "train_tokens_per_second": 292.807 }, { "epoch": 2.685483870967742, "grad_norm": 0.6045708060264587, "learning_rate": 0.00016754078480513197, "loss": 0.4308, "num_input_tokens_seen": 805467, "step": 333, "train_runtime": 2750.2472, "train_tokens_per_second": 292.871 }, { "epoch": 2.693548387096774, "grad_norm": 0.49178603291511536, "learning_rate": 0.00016735297539711204, "loss": 0.3579, "num_input_tokens_seen": 807830, "step": 334, "train_runtime": 2757.6998, "train_tokens_per_second": 292.936 }, { "epoch": 2.7016129032258065, "grad_norm": 0.6021568179130554, "learning_rate": 0.00016716473015381276, "loss": 0.5022, "num_input_tokens_seen": 810267, "step": 335, "train_runtime": 2765.4392, "train_tokens_per_second": 292.998 }, { "epoch": 2.709677419354839, "grad_norm": 0.642227292060852, "learning_rate": 0.0001669760502933528, "loss": 0.4935, "num_input_tokens_seen": 812742, "step": 336, "train_runtime": 2773.2451, "train_tokens_per_second": 293.065 }, { "epoch": 2.717741935483871, "grad_norm": 0.689476490020752, "learning_rate": 0.00016678693703666325, "loss": 0.4519, "num_input_tokens_seen": 815204, "step": 337, "train_runtime": 2780.9789, "train_tokens_per_second": 293.136 }, { "epoch": 2.725806451612903, "grad_norm": 0.639265775680542, "learning_rate": 0.00016659739160747967, "loss": 0.4412, "num_input_tokens_seen": 817573, "step": 338, "train_runtime": 2788.5484, "train_tokens_per_second": 293.189 }, { "epoch": 2.7338709677419355, "grad_norm": 0.5716695785522461, "learning_rate": 0.00016640741523233407, "loss": 0.3793, "num_input_tokens_seen": 819936, "step": 339, "train_runtime": 2796.0411, "train_tokens_per_second": 293.249 }, { "epoch": 2.741935483870968, "grad_norm": 0.604978084564209, "learning_rate": 0.00016621700914054718, "loss": 0.4706, "num_input_tokens_seen": 822393, "step": 340, "train_runtime": 2803.8011, "train_tokens_per_second": 293.314 }, { "epoch": 2.75, "grad_norm": 0.5651271343231201, "learning_rate": 0.00016602617456422034, "loss": 0.4308, "num_input_tokens_seen": 824807, "step": 341, "train_runtime": 2811.4387, "train_tokens_per_second": 293.375 }, { "epoch": 2.758064516129032, "grad_norm": 0.6630417108535767, "learning_rate": 0.00016583491273822765, "loss": 0.457, "num_input_tokens_seen": 827252, "step": 342, "train_runtime": 2819.1309, "train_tokens_per_second": 293.442 }, { "epoch": 2.7661290322580645, "grad_norm": 0.6293765306472778, "learning_rate": 0.00016564322490020776, "loss": 0.4465, "num_input_tokens_seen": 829666, "step": 343, "train_runtime": 2826.7387, "train_tokens_per_second": 293.506 }, { "epoch": 2.774193548387097, "grad_norm": 0.5662552714347839, "learning_rate": 0.00016545111229055614, "loss": 0.4357, "num_input_tokens_seen": 832061, "step": 344, "train_runtime": 2834.3219, "train_tokens_per_second": 293.566 }, { "epoch": 2.782258064516129, "grad_norm": 0.6861712336540222, "learning_rate": 0.00016525857615241687, "loss": 0.4776, "num_input_tokens_seen": 834532, "step": 345, "train_runtime": 2842.1306, "train_tokens_per_second": 293.629 }, { "epoch": 2.790322580645161, "grad_norm": 0.6480405330657959, "learning_rate": 0.00016506561773167464, "loss": 0.486, "num_input_tokens_seen": 837118, "step": 346, "train_runtime": 2850.1336, "train_tokens_per_second": 293.712 }, { "epoch": 2.7983870967741935, "grad_norm": 0.6150621771812439, "learning_rate": 0.00016487223827694672, "loss": 0.5001, "num_input_tokens_seen": 839620, "step": 347, "train_runtime": 2857.9459, "train_tokens_per_second": 293.784 }, { "epoch": 2.806451612903226, "grad_norm": 0.572364091873169, "learning_rate": 0.00016467843903957485, "loss": 0.4115, "num_input_tokens_seen": 842039, "step": 348, "train_runtime": 2865.4869, "train_tokens_per_second": 293.855 }, { "epoch": 2.814516129032258, "grad_norm": 0.5200697183609009, "learning_rate": 0.00016448422127361706, "loss": 0.3682, "num_input_tokens_seen": 844368, "step": 349, "train_runtime": 2872.8812, "train_tokens_per_second": 293.91 }, { "epoch": 2.8225806451612905, "grad_norm": 0.5125967264175415, "learning_rate": 0.00016428958623583982, "loss": 0.3246, "num_input_tokens_seen": 846670, "step": 350, "train_runtime": 2880.3239, "train_tokens_per_second": 293.95 }, { "epoch": 2.8306451612903225, "grad_norm": 0.5699242949485779, "learning_rate": 0.0001640945351857096, "loss": 0.4146, "num_input_tokens_seen": 849140, "step": 351, "train_runtime": 2888.0519, "train_tokens_per_second": 294.018 }, { "epoch": 2.838709677419355, "grad_norm": 0.5335444808006287, "learning_rate": 0.0001638990693853848, "loss": 0.3411, "num_input_tokens_seen": 851497, "step": 352, "train_runtime": 2895.5376, "train_tokens_per_second": 294.072 }, { "epoch": 2.846774193548387, "grad_norm": 0.5370481014251709, "learning_rate": 0.00016370319009970777, "loss": 0.3241, "num_input_tokens_seen": 853782, "step": 353, "train_runtime": 2902.8145, "train_tokens_per_second": 294.122 }, { "epoch": 2.8548387096774195, "grad_norm": 0.6068407893180847, "learning_rate": 0.0001635068985961965, "loss": 0.4412, "num_input_tokens_seen": 856228, "step": 354, "train_runtime": 2910.5006, "train_tokens_per_second": 294.186 }, { "epoch": 2.8629032258064515, "grad_norm": 0.5907058119773865, "learning_rate": 0.00016331019614503623, "loss": 0.4894, "num_input_tokens_seen": 858707, "step": 355, "train_runtime": 2918.2897, "train_tokens_per_second": 294.25 }, { "epoch": 2.870967741935484, "grad_norm": 0.6342933773994446, "learning_rate": 0.00016311308401907153, "loss": 0.4909, "num_input_tokens_seen": 861182, "step": 356, "train_runtime": 2926.0784, "train_tokens_per_second": 294.313 }, { "epoch": 2.879032258064516, "grad_norm": 0.5832836031913757, "learning_rate": 0.00016291556349379795, "loss": 0.4421, "num_input_tokens_seen": 863637, "step": 357, "train_runtime": 2933.8375, "train_tokens_per_second": 294.371 }, { "epoch": 2.8870967741935485, "grad_norm": 0.6093757152557373, "learning_rate": 0.0001627176358473537, "loss": 0.4871, "num_input_tokens_seen": 866076, "step": 358, "train_runtime": 2941.5666, "train_tokens_per_second": 294.427 }, { "epoch": 2.8951612903225805, "grad_norm": 0.5512157082557678, "learning_rate": 0.0001625193023605115, "loss": 0.379, "num_input_tokens_seen": 868405, "step": 359, "train_runtime": 2948.977, "train_tokens_per_second": 294.477 }, { "epoch": 2.903225806451613, "grad_norm": 0.540212869644165, "learning_rate": 0.00016232056431667017, "loss": 0.4007, "num_input_tokens_seen": 870779, "step": 360, "train_runtime": 2956.4862, "train_tokens_per_second": 294.532 }, { "epoch": 2.903225806451613, "eval_loss": 0.5227369070053101, "eval_runtime": 17.1651, "eval_samples_per_second": 3.029, "eval_steps_per_second": 1.515, "num_input_tokens_seen": 870779, "step": 360 } ], "logging_steps": 1, "max_steps": 1240, "num_input_tokens_seen": 870779, "num_train_epochs": 10, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.92211345087744e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }