| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.5590297095805052, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.4441180236637594, |
| "epoch": 0.017803493935684877, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.8e-07, |
| "loss": 3.382, |
| "mean_token_accuracy": 0.4423037525266409, |
| "num_tokens": 257189.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.45082361549139, |
| "epoch": 0.035606987871369754, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 3.8e-07, |
| "loss": 3.375, |
| "mean_token_accuracy": 0.4396079422906041, |
| "num_tokens": 528285.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 2.461306857317686, |
| "epoch": 0.053410481807054634, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 3.3839, |
| "mean_token_accuracy": 0.43654753603041174, |
| "num_tokens": 801635.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 2.486989914625883, |
| "epoch": 0.07121397574273951, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 7.8e-07, |
| "loss": 3.4182, |
| "mean_token_accuracy": 0.435793649405241, |
| "num_tokens": 1060812.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 2.4777335457503797, |
| "epoch": 0.08901746967842439, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 3.407, |
| "mean_token_accuracy": 0.4357369150966406, |
| "num_tokens": 1327380.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 2.478399809449911, |
| "epoch": 0.10682096361410927, |
| "grad_norm": 0.00726318359375, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 3.4405, |
| "mean_token_accuracy": 0.43458456825464964, |
| "num_tokens": 1597405.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 2.5134010180830955, |
| "epoch": 0.12462445754979415, |
| "grad_norm": 0.006683349609375, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 3.4613, |
| "mean_token_accuracy": 0.4291722310706973, |
| "num_tokens": 1859684.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 2.5495446130633352, |
| "epoch": 0.14242795148547902, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 3.5072, |
| "mean_token_accuracy": 0.42735045179724696, |
| "num_tokens": 2118068.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 2.4757557071745397, |
| "epoch": 0.1602314454211639, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 3.4228, |
| "mean_token_accuracy": 0.43469055872410534, |
| "num_tokens": 2388824.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 2.5207040145993234, |
| "epoch": 0.17803493935684878, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.98e-06, |
| "loss": 3.506, |
| "mean_token_accuracy": 0.4275117186829448, |
| "num_tokens": 2644330.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 42.5715, |
| "eval_chemistry_samples_per_second": 11.745, |
| "eval_chemistry_steps_per_second": 2.936, |
| "step": 100 |
| }, |
| { |
| "entropy": 2.460470324009657, |
| "epoch": 0.19583843329253367, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 3.3945, |
| "mean_token_accuracy": 0.4391466261819005, |
| "num_tokens": 2913700.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 2.478592600673437, |
| "epoch": 0.21364192722821854, |
| "grad_norm": 0.006866455078125, |
| "learning_rate": 2.38e-06, |
| "loss": 3.3703, |
| "mean_token_accuracy": 0.4405368799343705, |
| "num_tokens": 3185255.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 2.4654680602252483, |
| "epoch": 0.2314454211639034, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 3.3981, |
| "mean_token_accuracy": 0.43972534593194723, |
| "num_tokens": 3454750.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 2.5018775165081024, |
| "epoch": 0.2492489150995883, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 3.4682, |
| "mean_token_accuracy": 0.4316142167896032, |
| "num_tokens": 3719113.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 2.568158584088087, |
| "epoch": 0.26705240903527316, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 3.5044, |
| "mean_token_accuracy": 0.4264186339452863, |
| "num_tokens": 3990505.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 2.420287236571312, |
| "epoch": 0.28485590297095803, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 3.3513, |
| "mean_token_accuracy": 0.44414950646460055, |
| "num_tokens": 4267403.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 2.4532286070287226, |
| "epoch": 0.30265939690664295, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 3.3478, |
| "mean_token_accuracy": 0.4404185781255364, |
| "num_tokens": 4535458.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 2.4985749416053293, |
| "epoch": 0.3204628908423278, |
| "grad_norm": 0.007720947265625, |
| "learning_rate": 3.58e-06, |
| "loss": 3.4363, |
| "mean_token_accuracy": 0.4365795683115721, |
| "num_tokens": 4796815.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 2.4519996128976347, |
| "epoch": 0.3382663847780127, |
| "grad_norm": 0.00701904296875, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 3.3977, |
| "mean_token_accuracy": 0.440047624707222, |
| "num_tokens": 5066948.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 2.521295293420553, |
| "epoch": 0.35606987871369755, |
| "grad_norm": 0.010009765625, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 3.4496, |
| "mean_token_accuracy": 0.4322214875370264, |
| "num_tokens": 5324751.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 5324751.0, |
| "eval_chemistry_runtime": 42.5086, |
| "eval_chemistry_samples_per_second": 11.762, |
| "eval_chemistry_steps_per_second": 2.941, |
| "step": 200 |
| }, |
| { |
| "entropy": 2.44460117071867, |
| "epoch": 0.3738733726493824, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 4.18e-06, |
| "loss": 3.3862, |
| "mean_token_accuracy": 0.4415476618334651, |
| "num_tokens": 5585508.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 2.480318387597799, |
| "epoch": 0.39167686658506734, |
| "grad_norm": 0.006591796875, |
| "learning_rate": 4.38e-06, |
| "loss": 3.4199, |
| "mean_token_accuracy": 0.43676935136318207, |
| "num_tokens": 5848889.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 2.4665837228298186, |
| "epoch": 0.4094803605207522, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 4.58e-06, |
| "loss": 3.399, |
| "mean_token_accuracy": 0.4371540261432528, |
| "num_tokens": 6114855.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 2.4934676766395567, |
| "epoch": 0.4272838544564371, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 4.78e-06, |
| "loss": 3.392, |
| "mean_token_accuracy": 0.4352115549147129, |
| "num_tokens": 6378152.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 2.510968402773142, |
| "epoch": 0.44508734839212194, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 3.4512, |
| "mean_token_accuracy": 0.43123594429343937, |
| "num_tokens": 6637273.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 2.459284470230341, |
| "epoch": 0.4628908423278068, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 5.18e-06, |
| "loss": 3.4105, |
| "mean_token_accuracy": 0.43572237603366376, |
| "num_tokens": 6896684.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 2.4737772688269617, |
| "epoch": 0.48069433626349173, |
| "grad_norm": 0.01324462890625, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 3.4275, |
| "mean_token_accuracy": 0.4344434389844537, |
| "num_tokens": 7166608.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 2.393792650848627, |
| "epoch": 0.4984978301991766, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 3.3174, |
| "mean_token_accuracy": 0.445861529558897, |
| "num_tokens": 7444923.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 2.4808071456849574, |
| "epoch": 0.5163013241348615, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 5.78e-06, |
| "loss": 3.4216, |
| "mean_token_accuracy": 0.4335135340690613, |
| "num_tokens": 7706502.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 2.4993100173771383, |
| "epoch": 0.5341048180705463, |
| "grad_norm": 0.01153564453125, |
| "learning_rate": 5.98e-06, |
| "loss": 3.4677, |
| "mean_token_accuracy": 0.43226035628467796, |
| "num_tokens": 7969704.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 7969704.0, |
| "eval_chemistry_runtime": 42.3808, |
| "eval_chemistry_samples_per_second": 11.798, |
| "eval_chemistry_steps_per_second": 2.949, |
| "step": 300 |
| }, |
| { |
| "entropy": 2.49663657695055, |
| "epoch": 0.5519083120062312, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 6.18e-06, |
| "loss": 3.452, |
| "mean_token_accuracy": 0.4326026676222682, |
| "num_tokens": 8242162.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 2.491827418655157, |
| "epoch": 0.5697118059419161, |
| "grad_norm": 0.007476806640625, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 3.4701, |
| "mean_token_accuracy": 0.4347437607124448, |
| "num_tokens": 8497852.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 2.526278664171696, |
| "epoch": 0.587515299877601, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 3.504, |
| "mean_token_accuracy": 0.4276865454390645, |
| "num_tokens": 8757753.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 2.4302435807883738, |
| "epoch": 0.6053187938132859, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 3.3655, |
| "mean_token_accuracy": 0.4372914243489504, |
| "num_tokens": 9024677.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 2.4870374642312525, |
| "epoch": 0.6231222877489707, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 6.98e-06, |
| "loss": 3.4166, |
| "mean_token_accuracy": 0.4357576759532094, |
| "num_tokens": 9291760.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 2.440358117222786, |
| "epoch": 0.6409257816846556, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 3.3708, |
| "mean_token_accuracy": 0.43819774594157934, |
| "num_tokens": 9561091.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 2.4518581442534924, |
| "epoch": 0.6587292756203404, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 3.39, |
| "mean_token_accuracy": 0.4373545182868838, |
| "num_tokens": 9827272.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 2.4458557978272437, |
| "epoch": 0.6765327695560254, |
| "grad_norm": 0.01019287109375, |
| "learning_rate": 7.58e-06, |
| "loss": 3.3803, |
| "mean_token_accuracy": 0.44243213571608064, |
| "num_tokens": 10096065.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 2.457463264465332, |
| "epoch": 0.6943362634917103, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 7.78e-06, |
| "loss": 3.3973, |
| "mean_token_accuracy": 0.43960350602865217, |
| "num_tokens": 10364601.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 2.4410374239087105, |
| "epoch": 0.7121397574273951, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 3.3916, |
| "mean_token_accuracy": 0.4379643935710192, |
| "num_tokens": 10633325.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 10633325.0, |
| "eval_chemistry_runtime": 42.4526, |
| "eval_chemistry_samples_per_second": 11.778, |
| "eval_chemistry_steps_per_second": 2.944, |
| "step": 400 |
| }, |
| { |
| "entropy": 2.409457255154848, |
| "epoch": 0.72994325136308, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 8.18e-06, |
| "loss": 3.3536, |
| "mean_token_accuracy": 0.4413986885920167, |
| "num_tokens": 10897916.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 2.4690888836979865, |
| "epoch": 0.7477467452987648, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 3.4202, |
| "mean_token_accuracy": 0.43675678241997956, |
| "num_tokens": 11165356.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 2.456707588583231, |
| "epoch": 0.7655502392344498, |
| "grad_norm": 0.0074462890625, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 3.3849, |
| "mean_token_accuracy": 0.43797059506177904, |
| "num_tokens": 11436799.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 2.481979151070118, |
| "epoch": 0.7833537331701347, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 8.78e-06, |
| "loss": 3.3948, |
| "mean_token_accuracy": 0.4386452713981271, |
| "num_tokens": 11703496.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 2.511328932642937, |
| "epoch": 0.8011572271058195, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 8.98e-06, |
| "loss": 3.4704, |
| "mean_token_accuracy": 0.4308528220281005, |
| "num_tokens": 11965530.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 2.4488867104053496, |
| "epoch": 0.8189607210415044, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 3.4121, |
| "mean_token_accuracy": 0.43401869516819713, |
| "num_tokens": 12224427.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 2.41854225769639, |
| "epoch": 0.8367642149771892, |
| "grad_norm": 0.01202392578125, |
| "learning_rate": 9.38e-06, |
| "loss": 3.312, |
| "mean_token_accuracy": 0.4470580581575632, |
| "num_tokens": 12509124.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 2.4362686768174173, |
| "epoch": 0.8545677089128741, |
| "grad_norm": 0.007232666015625, |
| "learning_rate": 9.58e-06, |
| "loss": 3.3794, |
| "mean_token_accuracy": 0.4400447830557823, |
| "num_tokens": 12778408.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 2.4321289747953414, |
| "epoch": 0.8723712028485591, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 3.3687, |
| "mean_token_accuracy": 0.44097492955625056, |
| "num_tokens": 13046473.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 2.4888896882534026, |
| "epoch": 0.8901746967842439, |
| "grad_norm": 0.007720947265625, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 3.4261, |
| "mean_token_accuracy": 0.43556336853653194, |
| "num_tokens": 13301659.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 13301659.0, |
| "eval_chemistry_runtime": 42.4183, |
| "eval_chemistry_samples_per_second": 11.787, |
| "eval_chemistry_steps_per_second": 2.947, |
| "step": 500 |
| }, |
| { |
| "entropy": 2.4065550975501537, |
| "epoch": 0.9079781907199288, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 1.018e-05, |
| "loss": 3.3268, |
| "mean_token_accuracy": 0.44642978757619856, |
| "num_tokens": 13570986.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 2.4483018897473814, |
| "epoch": 0.9257816846556136, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.038e-05, |
| "loss": 3.4018, |
| "mean_token_accuracy": 0.4386210318654776, |
| "num_tokens": 13833901.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 2.454428631067276, |
| "epoch": 0.9435851785912985, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 3.4197, |
| "mean_token_accuracy": 0.43513929657638073, |
| "num_tokens": 14093888.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 2.4978294670581818, |
| "epoch": 0.9613886725269835, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 3.43, |
| "mean_token_accuracy": 0.43464562948793173, |
| "num_tokens": 14354918.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 2.46180320456624, |
| "epoch": 0.9791921664626683, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 3.4206, |
| "mean_token_accuracy": 0.43483266811817883, |
| "num_tokens": 14620244.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 2.4592869602143765, |
| "epoch": 0.9969956603983532, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 3.4471, |
| "mean_token_accuracy": 0.43371057212352754, |
| "num_tokens": 14886593.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 2.3787012269420007, |
| "epoch": 1.014242795148548, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.138e-05, |
| "loss": 3.2918, |
| "mean_token_accuracy": 0.44634086931905437, |
| "num_tokens": 15146469.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 2.524292258173227, |
| "epoch": 1.0320462890842328, |
| "grad_norm": 0.0128173828125, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 3.4816, |
| "mean_token_accuracy": 0.43193594105541705, |
| "num_tokens": 15404569.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 2.4150695733726026, |
| "epoch": 1.0498497830199176, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.178e-05, |
| "loss": 3.3678, |
| "mean_token_accuracy": 0.4405944043770432, |
| "num_tokens": 15681709.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 2.441706320643425, |
| "epoch": 1.0676532769556026, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 1.198e-05, |
| "loss": 3.3742, |
| "mean_token_accuracy": 0.4395477233454585, |
| "num_tokens": 15949421.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 15949421.0, |
| "eval_chemistry_runtime": 42.5959, |
| "eval_chemistry_samples_per_second": 11.738, |
| "eval_chemistry_steps_per_second": 2.935, |
| "step": 600 |
| }, |
| { |
| "entropy": 2.4119591929018496, |
| "epoch": 1.0854567708912874, |
| "grad_norm": 0.0076904296875, |
| "learning_rate": 1.218e-05, |
| "loss": 3.367, |
| "mean_token_accuracy": 0.4415699910372496, |
| "num_tokens": 16221916.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 2.51780494004488, |
| "epoch": 1.1032602648269723, |
| "grad_norm": 0.007537841796875, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 3.4485, |
| "mean_token_accuracy": 0.4301307639107108, |
| "num_tokens": 16488154.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 2.4647247321903705, |
| "epoch": 1.121063758762657, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 3.4027, |
| "mean_token_accuracy": 0.4368906727060676, |
| "num_tokens": 16756828.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 2.4489255413413047, |
| "epoch": 1.138867252698342, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 3.3816, |
| "mean_token_accuracy": 0.43627936094999314, |
| "num_tokens": 17023681.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 2.4757089667022227, |
| "epoch": 1.156670746634027, |
| "grad_norm": 0.01190185546875, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 3.4233, |
| "mean_token_accuracy": 0.4360200822353363, |
| "num_tokens": 17286672.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 2.542103400081396, |
| "epoch": 1.1744742405697117, |
| "grad_norm": 0.00726318359375, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 3.4708, |
| "mean_token_accuracy": 0.42764721568673847, |
| "num_tokens": 17551581.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 2.4632993809878827, |
| "epoch": 1.1922777345053968, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 3.3879, |
| "mean_token_accuracy": 0.43964554853737353, |
| "num_tokens": 17821205.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 2.4974757246673107, |
| "epoch": 1.2100812284410816, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 3.4339, |
| "mean_token_accuracy": 0.4323098760098219, |
| "num_tokens": 18087759.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 2.485871135443449, |
| "epoch": 1.2278847223767664, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.378e-05, |
| "loss": 3.4259, |
| "mean_token_accuracy": 0.4338374109938741, |
| "num_tokens": 18355732.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 2.4948798827826977, |
| "epoch": 1.2456882163124514, |
| "grad_norm": 0.0069580078125, |
| "learning_rate": 1.398e-05, |
| "loss": 3.4297, |
| "mean_token_accuracy": 0.435987046174705, |
| "num_tokens": 18626245.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 18626245.0, |
| "eval_chemistry_runtime": 42.5841, |
| "eval_chemistry_samples_per_second": 11.741, |
| "eval_chemistry_steps_per_second": 2.935, |
| "step": 700 |
| }, |
| { |
| "entropy": 2.408124291151762, |
| "epoch": 1.2634917102481362, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.418e-05, |
| "loss": 3.3266, |
| "mean_token_accuracy": 0.4450059160590172, |
| "num_tokens": 18899632.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 2.4984724432229997, |
| "epoch": 1.281295204183821, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 3.4488, |
| "mean_token_accuracy": 0.4306912997737527, |
| "num_tokens": 19173635.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 2.407220109552145, |
| "epoch": 1.299098698119506, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 3.3575, |
| "mean_token_accuracy": 0.44124190472066405, |
| "num_tokens": 19453495.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 2.426318255066872, |
| "epoch": 1.3169021920551909, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 1.478e-05, |
| "loss": 3.366, |
| "mean_token_accuracy": 0.43959415052086115, |
| "num_tokens": 19719086.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 2.4674886889755725, |
| "epoch": 1.3347056859908757, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.498e-05, |
| "loss": 3.4065, |
| "mean_token_accuracy": 0.4346128342673182, |
| "num_tokens": 19981272.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 2.4601897999644278, |
| "epoch": 1.3525091799265607, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 3.4277, |
| "mean_token_accuracy": 0.43639134224504234, |
| "num_tokens": 20243034.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 2.4810195587575437, |
| "epoch": 1.3703126738622455, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 3.4245, |
| "mean_token_accuracy": 0.43384860958904026, |
| "num_tokens": 20509028.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 2.5357875250279904, |
| "epoch": 1.3881161677979303, |
| "grad_norm": 0.006988525390625, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 3.5276, |
| "mean_token_accuracy": 0.42721649557352065, |
| "num_tokens": 20762668.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 2.573459583520889, |
| "epoch": 1.4059196617336152, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.578e-05, |
| "loss": 3.5041, |
| "mean_token_accuracy": 0.42504764050245286, |
| "num_tokens": 21025277.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 2.5099629506468775, |
| "epoch": 1.4237231556693, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 3.4735, |
| "mean_token_accuracy": 0.42990608550608156, |
| "num_tokens": 21292320.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 21292320.0, |
| "eval_chemistry_runtime": 42.58, |
| "eval_chemistry_samples_per_second": 11.743, |
| "eval_chemistry_steps_per_second": 2.936, |
| "step": 800 |
| }, |
| { |
| "entropy": 2.5165034629404546, |
| "epoch": 1.441526649604985, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 1.618e-05, |
| "loss": 3.4497, |
| "mean_token_accuracy": 0.4366149786859751, |
| "num_tokens": 21563668.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 2.4973253421485424, |
| "epoch": 1.4593301435406698, |
| "grad_norm": 0.007232666015625, |
| "learning_rate": 1.638e-05, |
| "loss": 3.4814, |
| "mean_token_accuracy": 0.4312220415100455, |
| "num_tokens": 21817298.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 2.52367008626461, |
| "epoch": 1.4771336374763546, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.658e-05, |
| "loss": 3.468, |
| "mean_token_accuracy": 0.4320263473317027, |
| "num_tokens": 22083010.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 2.50519383251667, |
| "epoch": 1.4949371314120397, |
| "grad_norm": 0.0074462890625, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 3.4575, |
| "mean_token_accuracy": 0.4309014746919274, |
| "num_tokens": 22350046.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 2.458015527576208, |
| "epoch": 1.5127406253477245, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.698e-05, |
| "loss": 3.4087, |
| "mean_token_accuracy": 0.4401903055608273, |
| "num_tokens": 22615933.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 2.4192430078983307, |
| "epoch": 1.5305441192834093, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.718e-05, |
| "loss": 3.3493, |
| "mean_token_accuracy": 0.4408345725387335, |
| "num_tokens": 22886843.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 2.4866544254124165, |
| "epoch": 1.5483476132190943, |
| "grad_norm": 0.006927490234375, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 3.4323, |
| "mean_token_accuracy": 0.4359192430973053, |
| "num_tokens": 23140244.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 2.472956708818674, |
| "epoch": 1.5661511071547791, |
| "grad_norm": 0.0076904296875, |
| "learning_rate": 1.758e-05, |
| "loss": 3.4127, |
| "mean_token_accuracy": 0.4368757115676999, |
| "num_tokens": 23405121.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 2.4871600806713103, |
| "epoch": 1.583954601090464, |
| "grad_norm": 0.006378173828125, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 3.425, |
| "mean_token_accuracy": 0.43275546226650474, |
| "num_tokens": 23671826.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 2.5024729229509832, |
| "epoch": 1.601758095026149, |
| "grad_norm": 0.01068115234375, |
| "learning_rate": 1.798e-05, |
| "loss": 3.4415, |
| "mean_token_accuracy": 0.43367660194635393, |
| "num_tokens": 23933374.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 23933374.0, |
| "eval_chemistry_runtime": 42.5199, |
| "eval_chemistry_samples_per_second": 11.759, |
| "eval_chemistry_steps_per_second": 2.94, |
| "step": 900 |
| }, |
| { |
| "entropy": 2.4241936951875687, |
| "epoch": 1.6195615889618338, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 3.3926, |
| "mean_token_accuracy": 0.4393082341179252, |
| "num_tokens": 24210156.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 2.501750738918781, |
| "epoch": 1.6373650828975186, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 3.428, |
| "mean_token_accuracy": 0.4345035368576646, |
| "num_tokens": 24462726.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 2.4093470610678196, |
| "epoch": 1.6551685768332036, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 1.858e-05, |
| "loss": 3.3407, |
| "mean_token_accuracy": 0.44576158542186023, |
| "num_tokens": 24732815.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 2.469248706102371, |
| "epoch": 1.6729720707688884, |
| "grad_norm": 0.0128173828125, |
| "learning_rate": 1.878e-05, |
| "loss": 3.3903, |
| "mean_token_accuracy": 0.4362227080389857, |
| "num_tokens": 25002091.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 2.4393011182546616, |
| "epoch": 1.6907755647045732, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 1.898e-05, |
| "loss": 3.4097, |
| "mean_token_accuracy": 0.4377862988039851, |
| "num_tokens": 25267949.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 2.499985621124506, |
| "epoch": 1.7085790586402583, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 1.918e-05, |
| "loss": 3.4206, |
| "mean_token_accuracy": 0.4386218637228012, |
| "num_tokens": 25525772.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 2.518709710240364, |
| "epoch": 1.7263825525759429, |
| "grad_norm": 0.015625, |
| "learning_rate": 1.938e-05, |
| "loss": 3.4744, |
| "mean_token_accuracy": 0.4290819203481078, |
| "num_tokens": 25787727.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 2.434730440378189, |
| "epoch": 1.744186046511628, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 3.3452, |
| "mean_token_accuracy": 0.4429737152531743, |
| "num_tokens": 26058026.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 2.405204129964113, |
| "epoch": 1.761989540447313, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 1.978e-05, |
| "loss": 3.3339, |
| "mean_token_accuracy": 0.44453586284071206, |
| "num_tokens": 26326631.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 2.473785638809204, |
| "epoch": 1.7797930343829975, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 3.3982, |
| "mean_token_accuracy": 0.4379414787515998, |
| "num_tokens": 26591735.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 26591735.0, |
| "eval_chemistry_runtime": 42.6898, |
| "eval_chemistry_samples_per_second": 11.712, |
| "eval_chemistry_steps_per_second": 2.928, |
| "step": 1000 |
| }, |
| { |
| "entropy": 2.4923629231750963, |
| "epoch": 1.7975965283186826, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 3.4449, |
| "mean_token_accuracy": 0.4350215895101428, |
| "num_tokens": 26857071.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 2.518780706077814, |
| "epoch": 1.8154000222543676, |
| "grad_norm": 0.01068115234375, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 3.4732, |
| "mean_token_accuracy": 0.42953309677541257, |
| "num_tokens": 27120617.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 2.4439337849617004, |
| "epoch": 1.8332035161900522, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 3.3826, |
| "mean_token_accuracy": 0.4376870134845376, |
| "num_tokens": 27391174.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 2.4611451610922814, |
| "epoch": 1.8510070101257372, |
| "grad_norm": 0.0072021484375, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 3.3908, |
| "mean_token_accuracy": 0.4394783997908235, |
| "num_tokens": 27656941.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 2.456043781340122, |
| "epoch": 1.868810504061422, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 3.4033, |
| "mean_token_accuracy": 0.4401937620714307, |
| "num_tokens": 27927118.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 2.4034786596894264, |
| "epoch": 1.8866139979971068, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 3.315, |
| "mean_token_accuracy": 0.4454406937584281, |
| "num_tokens": 28192620.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 2.451674747467041, |
| "epoch": 1.9044174919327919, |
| "grad_norm": 0.007232666015625, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 3.3975, |
| "mean_token_accuracy": 0.4367500660941005, |
| "num_tokens": 28458168.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 2.477339595556259, |
| "epoch": 1.9222209858684767, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 3.4136, |
| "mean_token_accuracy": 0.4369886856526136, |
| "num_tokens": 28722756.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 2.5017261296510696, |
| "epoch": 1.9400244798041615, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 3.4096, |
| "mean_token_accuracy": 0.4368307461962104, |
| "num_tokens": 28978375.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 2.4673263989388943, |
| "epoch": 1.9578279737398465, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.978e-05, |
| "loss": 3.4071, |
| "mean_token_accuracy": 0.43733639605343344, |
| "num_tokens": 29236865.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 29236865.0, |
| "eval_chemistry_runtime": 42.6475, |
| "eval_chemistry_samples_per_second": 11.724, |
| "eval_chemistry_steps_per_second": 2.931, |
| "step": 1100 |
| }, |
| { |
| "entropy": 2.4563313499093056, |
| "epoch": 1.9756314676755313, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 3.3669, |
| "mean_token_accuracy": 0.4400833772495389, |
| "num_tokens": 29500428.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 2.4409970842301845, |
| "epoch": 1.9934349616112161, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 3.3781, |
| "mean_token_accuracy": 0.44074930921196936, |
| "num_tokens": 29766739.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 2.46847146403405, |
| "epoch": 2.010682096361411, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 3.4114, |
| "mean_token_accuracy": 0.4375636675665455, |
| "num_tokens": 30025427.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 2.4604240149259566, |
| "epoch": 2.028485590297096, |
| "grad_norm": 0.0126953125, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 3.3852, |
| "mean_token_accuracy": 0.43901363387703896, |
| "num_tokens": 30286651.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 2.468199959397316, |
| "epoch": 2.0462890842327806, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 3.4268, |
| "mean_token_accuracy": 0.4350541580468416, |
| "num_tokens": 30552624.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 2.4525125212967396, |
| "epoch": 2.0640925781684656, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 3.3892, |
| "mean_token_accuracy": 0.4397247971966863, |
| "num_tokens": 30823746.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 2.575217917561531, |
| "epoch": 2.0818960721041506, |
| "grad_norm": 0.007476806640625, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 3.5312, |
| "mean_token_accuracy": 0.42372212260961534, |
| "num_tokens": 31076307.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 2.5399364054203035, |
| "epoch": 2.099699566039835, |
| "grad_norm": 0.01165771484375, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 3.4832, |
| "mean_token_accuracy": 0.4261633366346359, |
| "num_tokens": 31340763.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 2.440959357470274, |
| "epoch": 2.1175030599755202, |
| "grad_norm": 0.007232666015625, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 3.3932, |
| "mean_token_accuracy": 0.4403660248965025, |
| "num_tokens": 31607200.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 2.5287065640091897, |
| "epoch": 2.1353065539112053, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 3.4754, |
| "mean_token_accuracy": 0.4273608535528183, |
| "num_tokens": 31874392.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 31874392.0, |
| "eval_chemistry_runtime": 42.5525, |
| "eval_chemistry_samples_per_second": 11.75, |
| "eval_chemistry_steps_per_second": 2.938, |
| "step": 1200 |
| }, |
| { |
| "entropy": 2.4604250721633436, |
| "epoch": 2.15311004784689, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 3.3662, |
| "mean_token_accuracy": 0.4413184406235814, |
| "num_tokens": 32144016.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 2.460251194238663, |
| "epoch": 2.170913541782575, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 3.4265, |
| "mean_token_accuracy": 0.43892472572624686, |
| "num_tokens": 32410902.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 2.4349136739969253, |
| "epoch": 2.18871703571826, |
| "grad_norm": 0.007537841796875, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 3.386, |
| "mean_token_accuracy": 0.44226096775382756, |
| "num_tokens": 32674338.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 2.4614202618598937, |
| "epoch": 2.2065205296539445, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 3.3965, |
| "mean_token_accuracy": 0.43613822385668755, |
| "num_tokens": 32930219.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 2.494431575387716, |
| "epoch": 2.2243240235896296, |
| "grad_norm": 0.013916015625, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 3.4509, |
| "mean_token_accuracy": 0.433765122294426, |
| "num_tokens": 33194318.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 2.443953486531973, |
| "epoch": 2.242127517525314, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 3.3948, |
| "mean_token_accuracy": 0.43882110957056286, |
| "num_tokens": 33453292.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 2.3844580121338366, |
| "epoch": 2.259931011460999, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 3.3094, |
| "mean_token_accuracy": 0.4473427068442106, |
| "num_tokens": 33728875.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 2.5078157439827917, |
| "epoch": 2.277734505396684, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.938e-05, |
| "loss": 3.4404, |
| "mean_token_accuracy": 0.43358458634465935, |
| "num_tokens": 33999305.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 2.489903759211302, |
| "epoch": 2.295537999332369, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 3.4548, |
| "mean_token_accuracy": 0.4296891471371055, |
| "num_tokens": 34269727.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 2.384353207051754, |
| "epoch": 2.313341493268054, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 3.3205, |
| "mean_token_accuracy": 0.44720757827162744, |
| "num_tokens": 34537714.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 34537714.0, |
| "eval_chemistry_runtime": 42.6053, |
| "eval_chemistry_samples_per_second": 11.736, |
| "eval_chemistry_steps_per_second": 2.934, |
| "step": 1300 |
| }, |
| { |
| "entropy": 2.471423901617527, |
| "epoch": 2.331144987203739, |
| "grad_norm": 0.007781982421875, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 3.4072, |
| "mean_token_accuracy": 0.43562198970466853, |
| "num_tokens": 34806647.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 2.460309857875109, |
| "epoch": 2.3489484811394234, |
| "grad_norm": 0.01104736328125, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 3.4042, |
| "mean_token_accuracy": 0.4393015902489424, |
| "num_tokens": 35076423.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 2.50424614623189, |
| "epoch": 2.3667519750751085, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 3.4226, |
| "mean_token_accuracy": 0.4346967810764909, |
| "num_tokens": 35343494.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 2.48561422675848, |
| "epoch": 2.3845554690107935, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 3.4177, |
| "mean_token_accuracy": 0.4333121033385396, |
| "num_tokens": 35614731.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 2.498768079280853, |
| "epoch": 2.402358962946478, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 3.451, |
| "mean_token_accuracy": 0.4327333400025964, |
| "num_tokens": 35877264.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 2.420652062445879, |
| "epoch": 2.420162456882163, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 3.343, |
| "mean_token_accuracy": 0.4431915180757642, |
| "num_tokens": 36147587.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 2.4841696746647357, |
| "epoch": 2.437965950817848, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 1.918e-05, |
| "loss": 3.4194, |
| "mean_token_accuracy": 0.43716037590056656, |
| "num_tokens": 36416125.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 2.4914293572306634, |
| "epoch": 2.4557694447535328, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 3.4345, |
| "mean_token_accuracy": 0.4334576532244682, |
| "num_tokens": 36680383.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 2.470908258855343, |
| "epoch": 2.473572938689218, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 3.4071, |
| "mean_token_accuracy": 0.4377790277823806, |
| "num_tokens": 36946349.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 2.407320798188448, |
| "epoch": 2.491376432624903, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 3.358, |
| "mean_token_accuracy": 0.44281436298042537, |
| "num_tokens": 37219532.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 37219532.0, |
| "eval_chemistry_runtime": 42.6672, |
| "eval_chemistry_samples_per_second": 11.719, |
| "eval_chemistry_steps_per_second": 2.93, |
| "step": 1400 |
| }, |
| { |
| "entropy": 2.4535592779517175, |
| "epoch": 2.5091799265605874, |
| "grad_norm": 0.01446533203125, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 3.3835, |
| "mean_token_accuracy": 0.4377025572583079, |
| "num_tokens": 37477914.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 2.4648735396564008, |
| "epoch": 2.5269834204962724, |
| "grad_norm": 0.00738525390625, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 3.3722, |
| "mean_token_accuracy": 0.44036708902567623, |
| "num_tokens": 37746690.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 2.5168155536055563, |
| "epoch": 2.544786914431957, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 3.4422, |
| "mean_token_accuracy": 0.43462312165647743, |
| "num_tokens": 38005792.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 2.518996875733137, |
| "epoch": 2.562590408367642, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 3.4453, |
| "mean_token_accuracy": 0.43033833540976046, |
| "num_tokens": 38269021.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 2.4837927713990213, |
| "epoch": 2.580393902303327, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 3.4046, |
| "mean_token_accuracy": 0.43629784174263475, |
| "num_tokens": 38545224.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 2.4778561271727084, |
| "epoch": 2.598197396239012, |
| "grad_norm": 0.007293701171875, |
| "learning_rate": 1.898e-05, |
| "loss": 3.4428, |
| "mean_token_accuracy": 0.4349894070997834, |
| "num_tokens": 38802290.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 2.4231295838952063, |
| "epoch": 2.6160008901746967, |
| "grad_norm": 0.009521484375, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 3.3682, |
| "mean_token_accuracy": 0.4430950226262212, |
| "num_tokens": 39065787.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 2.434224319458008, |
| "epoch": 2.6338043841103818, |
| "grad_norm": 0.007781982421875, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 3.3787, |
| "mean_token_accuracy": 0.43969413749873637, |
| "num_tokens": 39328517.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 2.4157699927687646, |
| "epoch": 2.6516078780460663, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 3.3348, |
| "mean_token_accuracy": 0.4441758098080754, |
| "num_tokens": 39600641.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 2.5363398104906083, |
| "epoch": 2.6694113719817514, |
| "grad_norm": 0.007537841796875, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 3.4932, |
| "mean_token_accuracy": 0.42894220296293495, |
| "num_tokens": 39857917.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 39857917.0, |
| "eval_chemistry_runtime": 42.5983, |
| "eval_chemistry_samples_per_second": 11.738, |
| "eval_chemistry_steps_per_second": 2.934, |
| "step": 1500 |
| }, |
| { |
| "entropy": 2.475891087204218, |
| "epoch": 2.6872148659174364, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 3.4641, |
| "mean_token_accuracy": 0.4327057808637619, |
| "num_tokens": 40119174.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 2.538855505734682, |
| "epoch": 2.7050183598531214, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 3.4886, |
| "mean_token_accuracy": 0.427852831967175, |
| "num_tokens": 40379054.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 2.465169349312782, |
| "epoch": 2.722821853788806, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 3.4114, |
| "mean_token_accuracy": 0.43687186017632484, |
| "num_tokens": 40650970.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 2.4815499387681483, |
| "epoch": 2.740625347724491, |
| "grad_norm": 0.00726318359375, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 3.4391, |
| "mean_token_accuracy": 0.43311921060085296, |
| "num_tokens": 40914339.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 2.4799845792353152, |
| "epoch": 2.7584288416601757, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 1.878e-05, |
| "loss": 3.419, |
| "mean_token_accuracy": 0.4352044124156237, |
| "num_tokens": 41181827.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 2.432544883340597, |
| "epoch": 2.7762323355958607, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 3.3587, |
| "mean_token_accuracy": 0.43972424473613503, |
| "num_tokens": 41448961.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 2.4766519904136657, |
| "epoch": 2.7940358295315457, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 3.4166, |
| "mean_token_accuracy": 0.43819848690181973, |
| "num_tokens": 41712243.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 2.45805629491806, |
| "epoch": 2.8118393234672303, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 3.3882, |
| "mean_token_accuracy": 0.4380352281033993, |
| "num_tokens": 41981755.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 2.3916926831007004, |
| "epoch": 2.8296428174029153, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 3.3124, |
| "mean_token_accuracy": 0.4474966680631042, |
| "num_tokens": 42258270.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 2.4504558242857457, |
| "epoch": 2.8474463113386, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 3.3973, |
| "mean_token_accuracy": 0.43714725859463216, |
| "num_tokens": 42529321.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 42529321.0, |
| "eval_chemistry_runtime": 42.5546, |
| "eval_chemistry_samples_per_second": 11.75, |
| "eval_chemistry_steps_per_second": 2.937, |
| "step": 1600 |
| }, |
| { |
| "entropy": 2.526197586208582, |
| "epoch": 2.865249805274285, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 3.4609, |
| "mean_token_accuracy": 0.42937238067388533, |
| "num_tokens": 42793729.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 2.4750846356153486, |
| "epoch": 2.88305329920997, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 3.4279, |
| "mean_token_accuracy": 0.432634992338717, |
| "num_tokens": 43067006.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 2.4470580220222473, |
| "epoch": 2.900856793145655, |
| "grad_norm": 0.0126953125, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 3.3994, |
| "mean_token_accuracy": 0.4387952284887433, |
| "num_tokens": 43333868.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 2.4646801233291624, |
| "epoch": 2.9186602870813396, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.858e-05, |
| "loss": 3.397, |
| "mean_token_accuracy": 0.43661086559295653, |
| "num_tokens": 43605151.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 2.4074466191232204, |
| "epoch": 2.9364637810170247, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 3.3384, |
| "mean_token_accuracy": 0.4428932035341859, |
| "num_tokens": 43863034.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 2.5186894349753857, |
| "epoch": 2.9542672749527092, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 3.4249, |
| "mean_token_accuracy": 0.43363551832735536, |
| "num_tokens": 44126796.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 2.5221563249826433, |
| "epoch": 2.9720707688883943, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 3.4755, |
| "mean_token_accuracy": 0.4296230224892497, |
| "num_tokens": 44382724.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 2.4261739112436773, |
| "epoch": 2.9898742628240793, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 3.347, |
| "mean_token_accuracy": 0.44398615062236785, |
| "num_tokens": 44651471.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 2.4371220088774157, |
| "epoch": 3.007121397574274, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 3.3525, |
| "mean_token_accuracy": 0.44014161959771186, |
| "num_tokens": 44909981.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 2.597546722739935, |
| "epoch": 3.0249248915099587, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 3.5249, |
| "mean_token_accuracy": 0.4250665778294206, |
| "num_tokens": 45176547.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 45176547.0, |
| "eval_chemistry_runtime": 42.5973, |
| "eval_chemistry_samples_per_second": 11.738, |
| "eval_chemistry_steps_per_second": 2.934, |
| "step": 1700 |
| }, |
| { |
| "entropy": 2.4016756273806097, |
| "epoch": 3.0427283854456437, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 3.3445, |
| "mean_token_accuracy": 0.44047501031309366, |
| "num_tokens": 45437181.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 2.4717557713389398, |
| "epoch": 3.0605318793813288, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 3.4273, |
| "mean_token_accuracy": 0.4353035241365433, |
| "num_tokens": 45702431.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 2.4562893763184546, |
| "epoch": 3.0783353733170133, |
| "grad_norm": 0.00738525390625, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 3.3755, |
| "mean_token_accuracy": 0.43956287633627655, |
| "num_tokens": 45975788.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 2.4700839944183826, |
| "epoch": 3.0961388672526984, |
| "grad_norm": 0.0103759765625, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 3.3978, |
| "mean_token_accuracy": 0.4417863454669714, |
| "num_tokens": 46251349.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 2.5270811855793, |
| "epoch": 3.1139423611883834, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 3.4516, |
| "mean_token_accuracy": 0.42940014600753784, |
| "num_tokens": 46516802.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 2.4189466029405593, |
| "epoch": 3.131745855124068, |
| "grad_norm": 0.007537841796875, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 3.3473, |
| "mean_token_accuracy": 0.446175323985517, |
| "num_tokens": 46790537.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 2.437445816397667, |
| "epoch": 3.149549349059753, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 3.3659, |
| "mean_token_accuracy": 0.44263256788253785, |
| "num_tokens": 47052877.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 2.477902936190367, |
| "epoch": 3.167352842995438, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 3.411, |
| "mean_token_accuracy": 0.43642533868551253, |
| "num_tokens": 47315587.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 2.4296828620135784, |
| "epoch": 3.1851563369311227, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 3.3685, |
| "mean_token_accuracy": 0.4402240352705121, |
| "num_tokens": 47578352.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 2.455223270505667, |
| "epoch": 3.2029598308668077, |
| "grad_norm": 0.01470947265625, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 3.4172, |
| "mean_token_accuracy": 0.43478179927915334, |
| "num_tokens": 47844623.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 47844623.0, |
| "eval_chemistry_runtime": 42.5947, |
| "eval_chemistry_samples_per_second": 11.739, |
| "eval_chemistry_steps_per_second": 2.935, |
| "step": 1800 |
| }, |
| { |
| "entropy": 2.4119900725781918, |
| "epoch": 3.2207633248024923, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 3.3467, |
| "mean_token_accuracy": 0.4440757390111685, |
| "num_tokens": 48112262.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 2.5602151431143283, |
| "epoch": 3.2385668187381773, |
| "grad_norm": 0.0103759765625, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 3.5068, |
| "mean_token_accuracy": 0.4251785816624761, |
| "num_tokens": 48369097.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 2.4663229435682297, |
| "epoch": 3.2563703126738623, |
| "grad_norm": 0.0078125, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 3.4226, |
| "mean_token_accuracy": 0.4367042934522033, |
| "num_tokens": 48628165.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 2.441531176120043, |
| "epoch": 3.274173806609547, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 3.3767, |
| "mean_token_accuracy": 0.43784170281142, |
| "num_tokens": 48898634.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 2.437831664085388, |
| "epoch": 3.291977300545232, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 3.3906, |
| "mean_token_accuracy": 0.43811208922415973, |
| "num_tokens": 49167839.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 2.4592106595635412, |
| "epoch": 3.309780794480917, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 3.3945, |
| "mean_token_accuracy": 0.43993463516235354, |
| "num_tokens": 49431378.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 2.4519283570349217, |
| "epoch": 3.3275842884166016, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 3.3876, |
| "mean_token_accuracy": 0.43461998719722034, |
| "num_tokens": 49699309.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 2.483532691001892, |
| "epoch": 3.3453877823522866, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 3.4364, |
| "mean_token_accuracy": 0.43375972770154475, |
| "num_tokens": 49972032.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 2.470829947292805, |
| "epoch": 3.3631912762879717, |
| "grad_norm": 0.0150146484375, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 3.4237, |
| "mean_token_accuracy": 0.4348238715901971, |
| "num_tokens": 50241367.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 2.503614215552807, |
| "epoch": 3.3809947702236562, |
| "grad_norm": 0.0074462890625, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 3.4753, |
| "mean_token_accuracy": 0.43145324755460024, |
| "num_tokens": 50511167.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 50511167.0, |
| "eval_chemistry_runtime": 42.5948, |
| "eval_chemistry_samples_per_second": 11.739, |
| "eval_chemistry_steps_per_second": 2.935, |
| "step": 1900 |
| }, |
| { |
| "entropy": 2.5126790434122084, |
| "epoch": 3.3987982641593413, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.798e-05, |
| "loss": 3.4633, |
| "mean_token_accuracy": 0.43145898953080175, |
| "num_tokens": 50776842.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 2.483235966414213, |
| "epoch": 3.4166017580950263, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 3.4276, |
| "mean_token_accuracy": 0.4340201187878847, |
| "num_tokens": 51040514.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 2.512890145927668, |
| "epoch": 3.434405252030711, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 3.4503, |
| "mean_token_accuracy": 0.431868121214211, |
| "num_tokens": 51300696.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 2.5034680627286434, |
| "epoch": 3.452208745966396, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 3.4761, |
| "mean_token_accuracy": 0.4282655959948897, |
| "num_tokens": 51558606.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 2.49245428070426, |
| "epoch": 3.470012239902081, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 3.4056, |
| "mean_token_accuracy": 0.434098650328815, |
| "num_tokens": 51827834.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 2.443667434155941, |
| "epoch": 3.4878157338377656, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 3.3762, |
| "mean_token_accuracy": 0.4413266645744443, |
| "num_tokens": 52100179.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 2.47061547935009, |
| "epoch": 3.5056192277734506, |
| "grad_norm": 0.007080078125, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 3.3948, |
| "mean_token_accuracy": 0.43764521069824697, |
| "num_tokens": 52368055.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 2.4235463075339796, |
| "epoch": 3.523422721709135, |
| "grad_norm": 0.0072021484375, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 3.392, |
| "mean_token_accuracy": 0.4394680192694068, |
| "num_tokens": 52637505.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 2.466986934840679, |
| "epoch": 3.54122621564482, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 3.3834, |
| "mean_token_accuracy": 0.43876779917627573, |
| "num_tokens": 52906194.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 2.3780635252594946, |
| "epoch": 3.5590297095805052, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 3.3102, |
| "mean_token_accuracy": 0.44717768765985966, |
| "num_tokens": 53172376.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_chemistry_entropy": 2.4951189584732054, |
| "eval_chemistry_loss": 3.512981414794922, |
| "eval_chemistry_mean_token_accuracy": 0.4329572098255157, |
| "eval_chemistry_num_tokens": 53172376.0, |
| "eval_chemistry_runtime": 42.5948, |
| "eval_chemistry_samples_per_second": 11.739, |
| "eval_chemistry_steps_per_second": 2.935, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 18, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.348963885285229e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|