{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.2628081854432822, "epoch": 0.008, "grad_norm": 0.0179443359375, "learning_rate": 1.8e-07, "loss": 2.1855, "mean_token_accuracy": 0.584391780383885, "num_tokens": 138146.0, "step": 10 }, { "entropy": 1.2761256650090218, "epoch": 0.016, "grad_norm": 0.0177001953125, "learning_rate": 3.8e-07, "loss": 2.2119, "mean_token_accuracy": 0.5769438467919826, "num_tokens": 280941.0, "step": 20 }, { "entropy": 1.30821207202971, "epoch": 0.024, "grad_norm": 0.0079345703125, "learning_rate": 5.800000000000001e-07, "loss": 2.2252, "mean_token_accuracy": 0.5782777238637209, "num_tokens": 419127.0, "step": 30 }, { "entropy": 1.2832397993654012, "epoch": 0.032, "grad_norm": 0.00787353515625, "learning_rate": 7.8e-07, "loss": 2.1845, "mean_token_accuracy": 0.5787389608100056, "num_tokens": 558056.0, "step": 40 }, { "entropy": 1.281408105790615, "epoch": 0.04, "grad_norm": 0.0125732421875, "learning_rate": 9.800000000000001e-07, "loss": 2.2214, "mean_token_accuracy": 0.5814869659021497, "num_tokens": 697556.0, "step": 50 }, { "entropy": 1.2824515115469695, "epoch": 0.048, "grad_norm": 0.0206298828125, "learning_rate": 1.1800000000000001e-06, "loss": 2.1897, "mean_token_accuracy": 0.5798849314451218, "num_tokens": 836753.0, "step": 60 }, { "entropy": 1.2973103038966656, "epoch": 0.056, "grad_norm": 0.0216064453125, "learning_rate": 1.3800000000000001e-06, "loss": 2.2424, "mean_token_accuracy": 0.5715558419004083, "num_tokens": 981959.0, "step": 70 }, { "entropy": 1.2914881024509668, "epoch": 0.064, "grad_norm": 0.00927734375, "learning_rate": 1.5800000000000001e-06, "loss": 2.2528, "mean_token_accuracy": 0.5714382914826274, "num_tokens": 1126446.0, "step": 80 }, { "entropy": 1.2911656297743321, "epoch": 0.072, "grad_norm": 0.0145263671875, "learning_rate": 1.7800000000000001e-06, "loss": 2.2655, "mean_token_accuracy": 0.5728934396989643, "num_tokens": 1273905.0, "step": 90 }, { "entropy": 1.3042505994439124, "epoch": 0.08, "grad_norm": 0.00927734375, "learning_rate": 1.98e-06, "loss": 2.2384, "mean_token_accuracy": 0.574522521905601, "num_tokens": 1413596.0, "step": 100 }, { "epoch": 0.08, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 1413596.0, "eval_coding_runtime": 96.113, "eval_coding_samples_per_second": 5.202, "eval_coding_steps_per_second": 2.601, "step": 100 }, { "epoch": 0.08, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 1413596.0, "eval_math_runtime": 55.2867, "eval_math_samples_per_second": 9.044, "eval_math_steps_per_second": 4.522, "step": 100 }, { "entropy": 1.2642712604254485, "epoch": 0.088, "grad_norm": 0.0072021484375, "learning_rate": 2.1800000000000003e-06, "loss": 2.1897, "mean_token_accuracy": 0.5820998327806592, "num_tokens": 1559450.0, "step": 110 }, { "entropy": 1.3127624299377203, "epoch": 0.096, "grad_norm": 0.0101318359375, "learning_rate": 2.38e-06, "loss": 2.2696, "mean_token_accuracy": 0.5712630080059171, "num_tokens": 1701193.0, "step": 120 }, { "entropy": 1.2562327686697246, "epoch": 0.104, "grad_norm": 0.0089111328125, "learning_rate": 2.5800000000000003e-06, "loss": 2.1593, "mean_token_accuracy": 0.5853730095550418, "num_tokens": 1840165.0, "step": 130 }, { "entropy": 1.3122376587241888, "epoch": 0.112, "grad_norm": 0.00982666015625, "learning_rate": 2.7800000000000005e-06, "loss": 2.272, "mean_token_accuracy": 0.5697157179936767, "num_tokens": 1979232.0, "step": 140 }, { "entropy": 1.296381675079465, "epoch": 0.12, "grad_norm": 0.0084228515625, "learning_rate": 2.9800000000000003e-06, "loss": 2.2493, "mean_token_accuracy": 0.5788527738302947, "num_tokens": 2125815.0, "step": 150 }, { "entropy": 1.299617338180542, "epoch": 0.128, "grad_norm": 0.015380859375, "learning_rate": 3.1800000000000005e-06, "loss": 2.2113, "mean_token_accuracy": 0.5759535003453493, "num_tokens": 2274177.0, "step": 160 }, { "entropy": 1.2860674735158681, "epoch": 0.136, "grad_norm": 0.00946044921875, "learning_rate": 3.3800000000000007e-06, "loss": 2.2092, "mean_token_accuracy": 0.574743240699172, "num_tokens": 2412435.0, "step": 170 }, { "entropy": 1.299461904540658, "epoch": 0.144, "grad_norm": 0.0115966796875, "learning_rate": 3.58e-06, "loss": 2.1965, "mean_token_accuracy": 0.5797095803543926, "num_tokens": 2548445.0, "step": 180 }, { "entropy": 1.259111713245511, "epoch": 0.152, "grad_norm": 0.0115966796875, "learning_rate": 3.7800000000000002e-06, "loss": 2.1452, "mean_token_accuracy": 0.5854375848546625, "num_tokens": 2686462.0, "step": 190 }, { "entropy": 1.2716298930346965, "epoch": 0.16, "grad_norm": 0.014892578125, "learning_rate": 3.980000000000001e-06, "loss": 2.2284, "mean_token_accuracy": 0.5798038769513368, "num_tokens": 2824418.0, "step": 200 }, { "epoch": 0.16, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 2824418.0, "eval_coding_runtime": 95.662, "eval_coding_samples_per_second": 5.227, "eval_coding_steps_per_second": 2.613, "step": 200 }, { "epoch": 0.16, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 2824418.0, "eval_math_runtime": 55.266, "eval_math_samples_per_second": 9.047, "eval_math_steps_per_second": 4.524, "step": 200 }, { "entropy": 1.29471963532269, "epoch": 0.168, "grad_norm": 0.021240234375, "learning_rate": 4.18e-06, "loss": 2.2605, "mean_token_accuracy": 0.5726646112278104, "num_tokens": 2966031.0, "step": 210 }, { "entropy": 1.295415598154068, "epoch": 0.176, "grad_norm": 0.01519775390625, "learning_rate": 4.38e-06, "loss": 2.1967, "mean_token_accuracy": 0.5751344438642263, "num_tokens": 3110313.0, "step": 220 }, { "entropy": 1.3239446703344584, "epoch": 0.184, "grad_norm": 0.0089111328125, "learning_rate": 4.58e-06, "loss": 2.2475, "mean_token_accuracy": 0.5708544872701168, "num_tokens": 3251139.0, "step": 230 }, { "entropy": 1.3012913078069688, "epoch": 0.192, "grad_norm": 0.015625, "learning_rate": 4.78e-06, "loss": 2.2591, "mean_token_accuracy": 0.5751949936151505, "num_tokens": 3394170.0, "step": 240 }, { "entropy": 1.2939962450414897, "epoch": 0.2, "grad_norm": 0.0081787109375, "learning_rate": 4.980000000000001e-06, "loss": 2.2271, "mean_token_accuracy": 0.5751136729493738, "num_tokens": 3533571.0, "step": 250 }, { "entropy": 1.3175367139279843, "epoch": 0.208, "grad_norm": 0.0162353515625, "learning_rate": 5.18e-06, "loss": 2.2477, "mean_token_accuracy": 0.5703525686636567, "num_tokens": 3673600.0, "step": 260 }, { "entropy": 1.2911786384880544, "epoch": 0.216, "grad_norm": 0.00897216796875, "learning_rate": 5.380000000000001e-06, "loss": 2.2607, "mean_token_accuracy": 0.5701368801295758, "num_tokens": 3817397.0, "step": 270 }, { "entropy": 1.312684351950884, "epoch": 0.224, "grad_norm": 0.012451171875, "learning_rate": 5.580000000000001e-06, "loss": 2.2961, "mean_token_accuracy": 0.5682467905804515, "num_tokens": 3953732.0, "step": 280 }, { "entropy": 1.276525003835559, "epoch": 0.232, "grad_norm": 0.009033203125, "learning_rate": 5.78e-06, "loss": 2.2433, "mean_token_accuracy": 0.5770688287913799, "num_tokens": 4098989.0, "step": 290 }, { "entropy": 1.2831152826547623, "epoch": 0.24, "grad_norm": 0.008056640625, "learning_rate": 5.98e-06, "loss": 2.2829, "mean_token_accuracy": 0.5744210470467805, "num_tokens": 4243655.0, "step": 300 }, { "epoch": 0.24, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 4243655.0, "eval_coding_runtime": 95.6217, "eval_coding_samples_per_second": 5.229, "eval_coding_steps_per_second": 2.614, "step": 300 }, { "epoch": 0.24, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 4243655.0, "eval_math_runtime": 55.2698, "eval_math_samples_per_second": 9.047, "eval_math_steps_per_second": 4.523, "step": 300 }, { "entropy": 1.2523589238524437, "epoch": 0.248, "grad_norm": 0.01129150390625, "learning_rate": 6.18e-06, "loss": 2.1458, "mean_token_accuracy": 0.5875966742634773, "num_tokens": 4387651.0, "step": 310 }, { "entropy": 1.2836483523249627, "epoch": 0.256, "grad_norm": 0.01708984375, "learning_rate": 6.380000000000001e-06, "loss": 2.2171, "mean_token_accuracy": 0.5771771775558591, "num_tokens": 4531471.0, "step": 320 }, { "entropy": 1.2900966588407754, "epoch": 0.264, "grad_norm": 0.0172119140625, "learning_rate": 6.5800000000000005e-06, "loss": 2.2279, "mean_token_accuracy": 0.577950318530202, "num_tokens": 4669389.0, "step": 330 }, { "entropy": 1.2982714001089335, "epoch": 0.272, "grad_norm": 0.00897216796875, "learning_rate": 6.780000000000001e-06, "loss": 2.2464, "mean_token_accuracy": 0.5756928954273463, "num_tokens": 4810284.0, "step": 340 }, { "entropy": 1.278377178311348, "epoch": 0.28, "grad_norm": 0.00616455078125, "learning_rate": 6.98e-06, "loss": 2.172, "mean_token_accuracy": 0.5843851236626506, "num_tokens": 4955375.0, "step": 350 }, { "entropy": 1.2722328055649996, "epoch": 0.288, "grad_norm": 0.015869140625, "learning_rate": 7.180000000000001e-06, "loss": 2.2143, "mean_token_accuracy": 0.5805238038301468, "num_tokens": 5095104.0, "step": 360 }, { "entropy": 1.2866631541401148, "epoch": 0.296, "grad_norm": 0.01318359375, "learning_rate": 7.3800000000000005e-06, "loss": 2.2041, "mean_token_accuracy": 0.5798180354759097, "num_tokens": 5240988.0, "step": 370 }, { "entropy": 1.2868602696806193, "epoch": 0.304, "grad_norm": 0.0230712890625, "learning_rate": 7.58e-06, "loss": 2.2193, "mean_token_accuracy": 0.578575404547155, "num_tokens": 5383732.0, "step": 380 }, { "entropy": 1.3216422777622938, "epoch": 0.312, "grad_norm": 0.0115966796875, "learning_rate": 7.78e-06, "loss": 2.2847, "mean_token_accuracy": 0.5677898772060871, "num_tokens": 5530666.0, "step": 390 }, { "entropy": 1.2590181913226843, "epoch": 0.32, "grad_norm": 0.01171875, "learning_rate": 7.980000000000002e-06, "loss": 2.1719, "mean_token_accuracy": 0.5848465450108051, "num_tokens": 5676334.0, "step": 400 }, { "epoch": 0.32, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 5676334.0, "eval_coding_runtime": 95.6468, "eval_coding_samples_per_second": 5.228, "eval_coding_steps_per_second": 2.614, "step": 400 }, { "epoch": 0.32, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 5676334.0, "eval_math_runtime": 55.2877, "eval_math_samples_per_second": 9.044, "eval_math_steps_per_second": 4.522, "step": 400 }, { "entropy": 1.3136477686464787, "epoch": 0.328, "grad_norm": 0.009521484375, "learning_rate": 8.18e-06, "loss": 2.2817, "mean_token_accuracy": 0.5644237477332353, "num_tokens": 5816488.0, "step": 410 }, { "entropy": 1.296512581780553, "epoch": 0.336, "grad_norm": 0.008544921875, "learning_rate": 8.380000000000001e-06, "loss": 2.2127, "mean_token_accuracy": 0.5752231312915683, "num_tokens": 5958480.0, "step": 420 }, { "entropy": 1.2964510951191186, "epoch": 0.344, "grad_norm": 0.00885009765625, "learning_rate": 8.580000000000001e-06, "loss": 2.2215, "mean_token_accuracy": 0.5778196537867188, "num_tokens": 6102081.0, "step": 430 }, { "entropy": 1.2612504959106445, "epoch": 0.352, "grad_norm": 0.0084228515625, "learning_rate": 8.78e-06, "loss": 2.1833, "mean_token_accuracy": 0.5839220907539129, "num_tokens": 6242161.0, "step": 440 }, { "entropy": 1.2856441132724286, "epoch": 0.36, "grad_norm": 0.0177001953125, "learning_rate": 8.98e-06, "loss": 2.2414, "mean_token_accuracy": 0.5754987616091967, "num_tokens": 6377327.0, "step": 450 }, { "entropy": 1.288279192149639, "epoch": 0.368, "grad_norm": 0.0111083984375, "learning_rate": 9.180000000000002e-06, "loss": 2.1968, "mean_token_accuracy": 0.580965013988316, "num_tokens": 6523679.0, "step": 460 }, { "entropy": 1.28649943433702, "epoch": 0.376, "grad_norm": 0.007659912109375, "learning_rate": 9.38e-06, "loss": 2.2039, "mean_token_accuracy": 0.5775518488138914, "num_tokens": 6669278.0, "step": 470 }, { "entropy": 1.2801726717501878, "epoch": 0.384, "grad_norm": 0.01068115234375, "learning_rate": 9.58e-06, "loss": 2.1868, "mean_token_accuracy": 0.5841207258403301, "num_tokens": 6810978.0, "step": 480 }, { "entropy": 1.3082288071513175, "epoch": 0.392, "grad_norm": 0.01025390625, "learning_rate": 9.780000000000001e-06, "loss": 2.259, "mean_token_accuracy": 0.5713578343391419, "num_tokens": 6953929.0, "step": 490 }, { "entropy": 1.3287261452525854, "epoch": 0.4, "grad_norm": 0.008056640625, "learning_rate": 9.980000000000001e-06, "loss": 2.2733, "mean_token_accuracy": 0.5687045747414231, "num_tokens": 7096903.0, "step": 500 }, { "epoch": 0.4, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 7096903.0, "eval_coding_runtime": 95.6459, "eval_coding_samples_per_second": 5.228, "eval_coding_steps_per_second": 2.614, "step": 500 }, { "epoch": 0.4, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 7096903.0, "eval_math_runtime": 54.9641, "eval_math_samples_per_second": 9.097, "eval_math_steps_per_second": 4.548, "step": 500 }, { "entropy": 1.294243635982275, "epoch": 0.408, "grad_norm": 0.016845703125, "learning_rate": 1.018e-05, "loss": 2.2428, "mean_token_accuracy": 0.5740567265078426, "num_tokens": 7239071.0, "step": 510 }, { "entropy": 1.293457629531622, "epoch": 0.416, "grad_norm": 0.0140380859375, "learning_rate": 1.038e-05, "loss": 2.2374, "mean_token_accuracy": 0.5770649960264563, "num_tokens": 7377181.0, "step": 520 }, { "entropy": 1.29749961681664, "epoch": 0.424, "grad_norm": 0.01806640625, "learning_rate": 1.0580000000000002e-05, "loss": 2.1821, "mean_token_accuracy": 0.5780813867226243, "num_tokens": 7513035.0, "step": 530 }, { "entropy": 1.2768937222659589, "epoch": 0.432, "grad_norm": 0.00970458984375, "learning_rate": 1.0780000000000002e-05, "loss": 2.2319, "mean_token_accuracy": 0.5752488367259503, "num_tokens": 7650523.0, "step": 540 }, { "entropy": 1.27384728230536, "epoch": 0.44, "grad_norm": 0.0169677734375, "learning_rate": 1.0980000000000002e-05, "loss": 2.1808, "mean_token_accuracy": 0.5803057106211782, "num_tokens": 7794006.0, "step": 550 }, { "entropy": 1.2649936627596616, "epoch": 0.448, "grad_norm": 0.00933837890625, "learning_rate": 1.1180000000000001e-05, "loss": 2.1932, "mean_token_accuracy": 0.5795159580186009, "num_tokens": 7936788.0, "step": 560 }, { "entropy": 1.2891030061990023, "epoch": 0.456, "grad_norm": 0.00909423828125, "learning_rate": 1.138e-05, "loss": 2.2876, "mean_token_accuracy": 0.5727174270898103, "num_tokens": 8081035.0, "step": 570 }, { "entropy": 1.3052824169397355, "epoch": 0.464, "grad_norm": 0.0169677734375, "learning_rate": 1.1580000000000001e-05, "loss": 2.2796, "mean_token_accuracy": 0.5737911343574524, "num_tokens": 8223147.0, "step": 580 }, { "entropy": 1.2902213044464588, "epoch": 0.472, "grad_norm": 0.008544921875, "learning_rate": 1.178e-05, "loss": 2.2199, "mean_token_accuracy": 0.5761593583971262, "num_tokens": 8367380.0, "step": 590 }, { "entropy": 1.2913669254630804, "epoch": 0.48, "grad_norm": 0.0157470703125, "learning_rate": 1.198e-05, "loss": 2.2032, "mean_token_accuracy": 0.581071512401104, "num_tokens": 8506030.0, "step": 600 }, { "epoch": 0.48, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 8506030.0, "eval_coding_runtime": 95.6191, "eval_coding_samples_per_second": 5.229, "eval_coding_steps_per_second": 2.615, "step": 600 }, { "epoch": 0.48, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 8506030.0, "eval_math_runtime": 55.258, "eval_math_samples_per_second": 9.048, "eval_math_steps_per_second": 4.524, "step": 600 }, { "entropy": 1.2780281711369752, "epoch": 0.488, "grad_norm": 0.0093994140625, "learning_rate": 1.218e-05, "loss": 2.1848, "mean_token_accuracy": 0.5809735087677836, "num_tokens": 8646384.0, "step": 610 }, { "entropy": 1.2823347095400095, "epoch": 0.496, "grad_norm": 0.0125732421875, "learning_rate": 1.2380000000000002e-05, "loss": 2.2158, "mean_token_accuracy": 0.5798389609903097, "num_tokens": 8788726.0, "step": 620 }, { "entropy": 1.3198146648705005, "epoch": 0.504, "grad_norm": 0.00946044921875, "learning_rate": 1.2580000000000002e-05, "loss": 2.301, "mean_token_accuracy": 0.5685374505817891, "num_tokens": 8934959.0, "step": 630 }, { "entropy": 1.2555946599692107, "epoch": 0.512, "grad_norm": 0.0177001953125, "learning_rate": 1.2780000000000001e-05, "loss": 2.1431, "mean_token_accuracy": 0.589201102592051, "num_tokens": 9078039.0, "step": 640 }, { "entropy": 1.2702158033847808, "epoch": 0.52, "grad_norm": 0.024169921875, "learning_rate": 1.2980000000000001e-05, "loss": 2.1965, "mean_token_accuracy": 0.5821724396198988, "num_tokens": 9220298.0, "step": 650 }, { "entropy": 1.277011101320386, "epoch": 0.528, "grad_norm": 0.0118408203125, "learning_rate": 1.3180000000000001e-05, "loss": 2.2097, "mean_token_accuracy": 0.5796491207554937, "num_tokens": 9373860.0, "step": 660 }, { "entropy": 1.3105232425034046, "epoch": 0.536, "grad_norm": 0.00811767578125, "learning_rate": 1.3380000000000002e-05, "loss": 2.2839, "mean_token_accuracy": 0.5706269716843962, "num_tokens": 9515962.0, "step": 670 }, { "entropy": 1.2719210147857667, "epoch": 0.544, "grad_norm": 0.011474609375, "learning_rate": 1.3580000000000002e-05, "loss": 2.2322, "mean_token_accuracy": 0.5780333954840898, "num_tokens": 9660940.0, "step": 680 }, { "entropy": 1.336498522385955, "epoch": 0.552, "grad_norm": 0.00921630859375, "learning_rate": 1.378e-05, "loss": 2.2697, "mean_token_accuracy": 0.5707717033103108, "num_tokens": 9793170.0, "step": 690 }, { "entropy": 1.2782492753118277, "epoch": 0.56, "grad_norm": 0.00927734375, "learning_rate": 1.398e-05, "loss": 2.1596, "mean_token_accuracy": 0.583821403235197, "num_tokens": 9932302.0, "step": 700 }, { "epoch": 0.56, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 9932302.0, "eval_coding_runtime": 95.6281, "eval_coding_samples_per_second": 5.229, "eval_coding_steps_per_second": 2.614, "step": 700 }, { "epoch": 0.56, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 9932302.0, "eval_math_runtime": 55.2345, "eval_math_samples_per_second": 9.052, "eval_math_steps_per_second": 4.526, "step": 700 }, { "entropy": 1.2765720319002867, "epoch": 0.568, "grad_norm": 0.0135498046875, "learning_rate": 1.418e-05, "loss": 2.1618, "mean_token_accuracy": 0.5836544273421168, "num_tokens": 10072362.0, "step": 710 }, { "entropy": 1.3300136230885982, "epoch": 0.576, "grad_norm": 0.0101318359375, "learning_rate": 1.4380000000000001e-05, "loss": 2.3456, "mean_token_accuracy": 0.5648340210318565, "num_tokens": 10215462.0, "step": 720 }, { "entropy": 1.2994835320860147, "epoch": 0.584, "grad_norm": 0.01416015625, "learning_rate": 1.4580000000000001e-05, "loss": 2.2414, "mean_token_accuracy": 0.5734376270323992, "num_tokens": 10354952.0, "step": 730 }, { "entropy": 1.2754272140562535, "epoch": 0.592, "grad_norm": 0.01361083984375, "learning_rate": 1.478e-05, "loss": 2.1968, "mean_token_accuracy": 0.5793214218690992, "num_tokens": 10504396.0, "step": 740 }, { "entropy": 1.2948825549334289, "epoch": 0.6, "grad_norm": 0.0167236328125, "learning_rate": 1.498e-05, "loss": 2.209, "mean_token_accuracy": 0.5778197368606925, "num_tokens": 10644641.0, "step": 750 }, { "entropy": 1.309036424010992, "epoch": 0.608, "grad_norm": 0.0142822265625, "learning_rate": 1.5180000000000002e-05, "loss": 2.2825, "mean_token_accuracy": 0.5746870728209614, "num_tokens": 10793126.0, "step": 760 }, { "entropy": 1.2668032895773649, "epoch": 0.616, "grad_norm": 0.01904296875, "learning_rate": 1.5380000000000002e-05, "loss": 2.1989, "mean_token_accuracy": 0.5822413913905621, "num_tokens": 10941399.0, "step": 770 }, { "entropy": 1.2946851935237647, "epoch": 0.624, "grad_norm": 0.0086669921875, "learning_rate": 1.5580000000000003e-05, "loss": 2.2376, "mean_token_accuracy": 0.574727876111865, "num_tokens": 11081768.0, "step": 780 }, { "entropy": 1.2720156278461219, "epoch": 0.632, "grad_norm": 0.00726318359375, "learning_rate": 1.578e-05, "loss": 2.2091, "mean_token_accuracy": 0.5791729586198926, "num_tokens": 11226419.0, "step": 790 }, { "entropy": 1.2966302383691073, "epoch": 0.64, "grad_norm": 0.0103759765625, "learning_rate": 1.5980000000000003e-05, "loss": 2.2476, "mean_token_accuracy": 0.5757431708276272, "num_tokens": 11370320.0, "step": 800 }, { "epoch": 0.64, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 11370320.0, "eval_coding_runtime": 95.6426, "eval_coding_samples_per_second": 5.228, "eval_coding_steps_per_second": 2.614, "step": 800 }, { "epoch": 0.64, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 11370320.0, "eval_math_runtime": 55.2405, "eval_math_samples_per_second": 9.051, "eval_math_steps_per_second": 4.526, "step": 800 }, { "entropy": 1.2639863461256027, "epoch": 0.648, "grad_norm": 0.01806640625, "learning_rate": 1.618e-05, "loss": 2.2448, "mean_token_accuracy": 0.5776080982759595, "num_tokens": 11515332.0, "step": 810 }, { "entropy": 1.2712122540920974, "epoch": 0.656, "grad_norm": 0.01318359375, "learning_rate": 1.638e-05, "loss": 2.1846, "mean_token_accuracy": 0.5818307081237435, "num_tokens": 11657835.0, "step": 820 }, { "entropy": 1.3265905980020762, "epoch": 0.664, "grad_norm": 0.01007080078125, "learning_rate": 1.658e-05, "loss": 2.2425, "mean_token_accuracy": 0.5686553552746773, "num_tokens": 11799325.0, "step": 830 }, { "entropy": 1.291335643082857, "epoch": 0.672, "grad_norm": 0.0152587890625, "learning_rate": 1.6780000000000002e-05, "loss": 2.2767, "mean_token_accuracy": 0.5738575926050544, "num_tokens": 11949262.0, "step": 840 }, { "entropy": 1.2961845736950637, "epoch": 0.68, "grad_norm": 0.0164794921875, "learning_rate": 1.698e-05, "loss": 2.2309, "mean_token_accuracy": 0.5745207356289029, "num_tokens": 12090029.0, "step": 850 }, { "entropy": 1.301833227276802, "epoch": 0.688, "grad_norm": 0.011962890625, "learning_rate": 1.718e-05, "loss": 2.2309, "mean_token_accuracy": 0.5745828906074166, "num_tokens": 12227640.0, "step": 860 }, { "entropy": 1.3095486916601657, "epoch": 0.696, "grad_norm": 0.0087890625, "learning_rate": 1.7380000000000003e-05, "loss": 2.2881, "mean_token_accuracy": 0.5654223879799247, "num_tokens": 12371560.0, "step": 870 }, { "entropy": 1.2842330873012542, "epoch": 0.704, "grad_norm": 0.01055908203125, "learning_rate": 1.758e-05, "loss": 2.2148, "mean_token_accuracy": 0.5752653010189533, "num_tokens": 12516641.0, "step": 880 }, { "entropy": 1.3211415249854326, "epoch": 0.712, "grad_norm": 0.00799560546875, "learning_rate": 1.7780000000000003e-05, "loss": 2.2159, "mean_token_accuracy": 0.571710304543376, "num_tokens": 12656623.0, "step": 890 }, { "entropy": 1.2885905254632235, "epoch": 0.72, "grad_norm": 0.0201416015625, "learning_rate": 1.798e-05, "loss": 2.2299, "mean_token_accuracy": 0.5760715741664171, "num_tokens": 12793343.0, "step": 900 }, { "epoch": 0.72, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 12793343.0, "eval_coding_runtime": 95.6394, "eval_coding_samples_per_second": 5.228, "eval_coding_steps_per_second": 2.614, "step": 900 }, { "epoch": 0.72, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 12793343.0, "eval_math_runtime": 55.2679, "eval_math_samples_per_second": 9.047, "eval_math_steps_per_second": 4.523, "step": 900 }, { "entropy": 1.298223714157939, "epoch": 0.728, "grad_norm": 0.012939453125, "learning_rate": 1.8180000000000002e-05, "loss": 2.298, "mean_token_accuracy": 0.5706229966133833, "num_tokens": 12932479.0, "step": 910 }, { "entropy": 1.2950578905642032, "epoch": 0.736, "grad_norm": 0.01239013671875, "learning_rate": 1.8380000000000004e-05, "loss": 2.234, "mean_token_accuracy": 0.5742258787155151, "num_tokens": 13077981.0, "step": 920 }, { "entropy": 1.302649038657546, "epoch": 0.744, "grad_norm": 0.0211181640625, "learning_rate": 1.858e-05, "loss": 2.2467, "mean_token_accuracy": 0.5736184434965252, "num_tokens": 13217888.0, "step": 930 }, { "entropy": 1.2875377755612134, "epoch": 0.752, "grad_norm": 0.015869140625, "learning_rate": 1.878e-05, "loss": 2.2132, "mean_token_accuracy": 0.575059812888503, "num_tokens": 13358957.0, "step": 940 }, { "entropy": 1.2615925706923008, "epoch": 0.76, "grad_norm": 0.0096435546875, "learning_rate": 1.898e-05, "loss": 2.2597, "mean_token_accuracy": 0.5791866419836879, "num_tokens": 13510850.0, "step": 950 }, { "entropy": 1.269580041617155, "epoch": 0.768, "grad_norm": 0.0115966796875, "learning_rate": 1.918e-05, "loss": 2.2027, "mean_token_accuracy": 0.5817427391186356, "num_tokens": 13653412.0, "step": 960 }, { "entropy": 1.3158201549202204, "epoch": 0.776, "grad_norm": 0.00885009765625, "learning_rate": 1.938e-05, "loss": 2.2614, "mean_token_accuracy": 0.5728385889902711, "num_tokens": 13794252.0, "step": 970 }, { "entropy": 1.2957237258553504, "epoch": 0.784, "grad_norm": 0.00909423828125, "learning_rate": 1.9580000000000002e-05, "loss": 2.2335, "mean_token_accuracy": 0.5786556912586093, "num_tokens": 13940856.0, "step": 980 }, { "entropy": 1.285663042962551, "epoch": 0.792, "grad_norm": 0.020263671875, "learning_rate": 1.978e-05, "loss": 2.2096, "mean_token_accuracy": 0.5792105477303267, "num_tokens": 14086117.0, "step": 990 }, { "entropy": 1.3126300945878029, "epoch": 0.8, "grad_norm": 0.00958251953125, "learning_rate": 1.9980000000000002e-05, "loss": 2.3165, "mean_token_accuracy": 0.5648046223446727, "num_tokens": 14230754.0, "step": 1000 }, { "epoch": 0.8, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 14230754.0, "eval_coding_runtime": 96.0895, "eval_coding_samples_per_second": 5.203, "eval_coding_steps_per_second": 2.602, "step": 1000 }, { "epoch": 0.8, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 14230754.0, "eval_math_runtime": 54.9639, "eval_math_samples_per_second": 9.097, "eval_math_steps_per_second": 4.548, "step": 1000 }, { "entropy": 1.29276928268373, "epoch": 0.808, "grad_norm": 0.0126953125, "learning_rate": 1.9980000000000002e-05, "loss": 2.2326, "mean_token_accuracy": 0.5718907188624144, "num_tokens": 14374548.0, "step": 1010 }, { "entropy": 1.3095116015523671, "epoch": 0.816, "grad_norm": 0.01318359375, "learning_rate": 1.995777777777778e-05, "loss": 2.2316, "mean_token_accuracy": 0.5726707259193062, "num_tokens": 14519893.0, "step": 1020 }, { "entropy": 1.2931621059775353, "epoch": 0.824, "grad_norm": 0.0120849609375, "learning_rate": 1.9935555555555557e-05, "loss": 2.2255, "mean_token_accuracy": 0.5791132707148791, "num_tokens": 14664170.0, "step": 1030 }, { "entropy": 1.3060086965560913, "epoch": 0.832, "grad_norm": 0.0087890625, "learning_rate": 1.9913333333333335e-05, "loss": 2.2366, "mean_token_accuracy": 0.5756228402256965, "num_tokens": 14805088.0, "step": 1040 }, { "entropy": 1.29172132499516, "epoch": 0.84, "grad_norm": 0.00775146484375, "learning_rate": 1.9891111111111112e-05, "loss": 2.2138, "mean_token_accuracy": 0.5763768455013633, "num_tokens": 14944711.0, "step": 1050 }, { "entropy": 1.3122717272490263, "epoch": 0.848, "grad_norm": 0.00994873046875, "learning_rate": 1.986888888888889e-05, "loss": 2.2005, "mean_token_accuracy": 0.5786851424723863, "num_tokens": 15086306.0, "step": 1060 }, { "entropy": 1.3352035760879517, "epoch": 0.856, "grad_norm": 0.01043701171875, "learning_rate": 1.9846666666666668e-05, "loss": 2.3066, "mean_token_accuracy": 0.5666656039655209, "num_tokens": 15228872.0, "step": 1070 }, { "entropy": 1.2667071226984263, "epoch": 0.864, "grad_norm": 0.01251220703125, "learning_rate": 1.9824444444444445e-05, "loss": 2.1787, "mean_token_accuracy": 0.5817643126472831, "num_tokens": 15370985.0, "step": 1080 }, { "entropy": 1.3015164621174335, "epoch": 0.872, "grad_norm": 0.0103759765625, "learning_rate": 1.9802222222222226e-05, "loss": 2.2554, "mean_token_accuracy": 0.5708118518814445, "num_tokens": 15513360.0, "step": 1090 }, { "entropy": 1.3232146605849266, "epoch": 0.88, "grad_norm": 0.01025390625, "learning_rate": 1.978e-05, "loss": 2.222, "mean_token_accuracy": 0.5751912448555231, "num_tokens": 15653836.0, "step": 1100 }, { "epoch": 0.88, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 15653836.0, "eval_coding_runtime": 95.6833, "eval_coding_samples_per_second": 5.226, "eval_coding_steps_per_second": 2.613, "step": 1100 }, { "epoch": 0.88, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 15653836.0, "eval_math_runtime": 55.2235, "eval_math_samples_per_second": 9.054, "eval_math_steps_per_second": 4.527, "step": 1100 }, { "entropy": 1.2944145441055297, "epoch": 0.888, "grad_norm": 0.013427734375, "learning_rate": 1.975777777777778e-05, "loss": 2.2076, "mean_token_accuracy": 0.579839687794447, "num_tokens": 15791102.0, "step": 1110 }, { "entropy": 1.3253925543278455, "epoch": 0.896, "grad_norm": 0.00970458984375, "learning_rate": 1.9735555555555556e-05, "loss": 2.2785, "mean_token_accuracy": 0.5721883840858937, "num_tokens": 15932179.0, "step": 1120 }, { "entropy": 1.2849542308598756, "epoch": 0.904, "grad_norm": 0.033203125, "learning_rate": 1.9713333333333337e-05, "loss": 2.2034, "mean_token_accuracy": 0.576960513740778, "num_tokens": 16074409.0, "step": 1130 }, { "entropy": 1.2730964332818986, "epoch": 0.912, "grad_norm": 0.0089111328125, "learning_rate": 1.969111111111111e-05, "loss": 2.1835, "mean_token_accuracy": 0.5821036446839571, "num_tokens": 16219842.0, "step": 1140 }, { "entropy": 1.2843450970947743, "epoch": 0.92, "grad_norm": 0.009765625, "learning_rate": 1.9668888888888892e-05, "loss": 2.1674, "mean_token_accuracy": 0.5814145233482122, "num_tokens": 16359249.0, "step": 1150 }, { "entropy": 1.2754333186894655, "epoch": 0.928, "grad_norm": 0.011474609375, "learning_rate": 1.9646666666666666e-05, "loss": 2.176, "mean_token_accuracy": 0.5827303070574998, "num_tokens": 16499842.0, "step": 1160 }, { "entropy": 1.301246513798833, "epoch": 0.936, "grad_norm": 0.010498046875, "learning_rate": 1.9624444444444447e-05, "loss": 2.2071, "mean_token_accuracy": 0.5731328945606947, "num_tokens": 16640499.0, "step": 1170 }, { "entropy": 1.2947144400328399, "epoch": 0.944, "grad_norm": 0.01239013671875, "learning_rate": 1.9602222222222225e-05, "loss": 2.2016, "mean_token_accuracy": 0.5806368073448539, "num_tokens": 16781882.0, "step": 1180 }, { "entropy": 1.2729844357818365, "epoch": 0.952, "grad_norm": 0.00811767578125, "learning_rate": 1.9580000000000002e-05, "loss": 2.157, "mean_token_accuracy": 0.5873358538374305, "num_tokens": 16922913.0, "step": 1190 }, { "entropy": 1.3273278027772903, "epoch": 0.96, "grad_norm": 0.0133056640625, "learning_rate": 1.955777777777778e-05, "loss": 2.2697, "mean_token_accuracy": 0.5692729642614722, "num_tokens": 17067407.0, "step": 1200 }, { "epoch": 0.96, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 17067407.0, "eval_coding_runtime": 95.6335, "eval_coding_samples_per_second": 5.228, "eval_coding_steps_per_second": 2.614, "step": 1200 }, { "epoch": 0.96, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 17067407.0, "eval_math_runtime": 55.2304, "eval_math_samples_per_second": 9.053, "eval_math_steps_per_second": 4.526, "step": 1200 }, { "entropy": 1.2900268983095884, "epoch": 0.968, "grad_norm": 0.008056640625, "learning_rate": 1.9535555555555557e-05, "loss": 2.1958, "mean_token_accuracy": 0.5768731584772467, "num_tokens": 17211153.0, "step": 1210 }, { "entropy": 1.30126980766654, "epoch": 0.976, "grad_norm": 0.0245361328125, "learning_rate": 1.9513333333333335e-05, "loss": 2.2313, "mean_token_accuracy": 0.5746001824736595, "num_tokens": 17350994.0, "step": 1220 }, { "entropy": 1.294421959668398, "epoch": 0.984, "grad_norm": 0.018310546875, "learning_rate": 1.9491111111111113e-05, "loss": 2.2481, "mean_token_accuracy": 0.5734764914959669, "num_tokens": 17492960.0, "step": 1230 }, { "entropy": 1.3181488186120986, "epoch": 0.992, "grad_norm": 0.030517578125, "learning_rate": 1.946888888888889e-05, "loss": 2.3008, "mean_token_accuracy": 0.5649250147864222, "num_tokens": 17637514.0, "step": 1240 }, { "entropy": 1.3004115797579288, "epoch": 1.0, "grad_norm": 0.009033203125, "learning_rate": 1.9446666666666668e-05, "loss": 2.2079, "mean_token_accuracy": 0.5754020158201456, "num_tokens": 17780432.0, "step": 1250 }, { "entropy": 1.2847444698214532, "epoch": 1.008, "grad_norm": 0.017333984375, "learning_rate": 1.9424444444444446e-05, "loss": 2.1845, "mean_token_accuracy": 0.5801492627710104, "num_tokens": 17926570.0, "step": 1260 }, { "entropy": 1.2932130094617604, "epoch": 1.016, "grad_norm": 0.0189208984375, "learning_rate": 1.9402222222222223e-05, "loss": 2.2089, "mean_token_accuracy": 0.5793235875666142, "num_tokens": 18070722.0, "step": 1270 }, { "entropy": 1.2891677081584931, "epoch": 1.024, "grad_norm": 0.0081787109375, "learning_rate": 1.938e-05, "loss": 2.2189, "mean_token_accuracy": 0.5802426967769861, "num_tokens": 18207652.0, "step": 1280 }, { "entropy": 1.3298017375171185, "epoch": 1.032, "grad_norm": 0.00958251953125, "learning_rate": 1.935777777777778e-05, "loss": 2.2569, "mean_token_accuracy": 0.5690007250756025, "num_tokens": 18349030.0, "step": 1290 }, { "entropy": 1.2888877458870411, "epoch": 1.04, "grad_norm": 0.01336669921875, "learning_rate": 1.9335555555555556e-05, "loss": 2.2469, "mean_token_accuracy": 0.5769451957195997, "num_tokens": 18484931.0, "step": 1300 }, { "epoch": 1.04, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 18484931.0, "eval_coding_runtime": 95.6088, "eval_coding_samples_per_second": 5.23, "eval_coding_steps_per_second": 2.615, "step": 1300 }, { "epoch": 1.04, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 18484931.0, "eval_math_runtime": 55.238, "eval_math_samples_per_second": 9.052, "eval_math_steps_per_second": 4.526, "step": 1300 }, { "entropy": 1.3006268568336963, "epoch": 1.048, "grad_norm": 0.01495361328125, "learning_rate": 1.9313333333333334e-05, "loss": 2.2672, "mean_token_accuracy": 0.5714229056611657, "num_tokens": 18630034.0, "step": 1310 }, { "entropy": 1.279826521128416, "epoch": 1.056, "grad_norm": 0.0164794921875, "learning_rate": 1.9291111111111115e-05, "loss": 2.2346, "mean_token_accuracy": 0.5772927038371563, "num_tokens": 18773457.0, "step": 1320 }, { "entropy": 1.311249329522252, "epoch": 1.064, "grad_norm": 0.0126953125, "learning_rate": 1.926888888888889e-05, "loss": 2.3087, "mean_token_accuracy": 0.5693608418107032, "num_tokens": 18918389.0, "step": 1330 }, { "entropy": 1.2683462664484977, "epoch": 1.072, "grad_norm": 0.0140380859375, "learning_rate": 1.924666666666667e-05, "loss": 2.225, "mean_token_accuracy": 0.5798949740827084, "num_tokens": 19055365.0, "step": 1340 }, { "entropy": 1.3234039679169656, "epoch": 1.08, "grad_norm": 0.0096435546875, "learning_rate": 1.9224444444444444e-05, "loss": 2.2342, "mean_token_accuracy": 0.572484284453094, "num_tokens": 19202006.0, "step": 1350 }, { "entropy": 1.2823971390724183, "epoch": 1.088, "grad_norm": 0.00909423828125, "learning_rate": 1.9202222222222225e-05, "loss": 2.2072, "mean_token_accuracy": 0.5814019978046417, "num_tokens": 19345730.0, "step": 1360 }, { "entropy": 1.328789808228612, "epoch": 1.096, "grad_norm": 0.008544921875, "learning_rate": 1.918e-05, "loss": 2.274, "mean_token_accuracy": 0.5689424321055412, "num_tokens": 19494084.0, "step": 1370 }, { "entropy": 1.2878280565142632, "epoch": 1.104, "grad_norm": 0.01165771484375, "learning_rate": 1.915777777777778e-05, "loss": 2.2215, "mean_token_accuracy": 0.5786661734804511, "num_tokens": 19637390.0, "step": 1380 }, { "entropy": 1.2910890705883502, "epoch": 1.112, "grad_norm": 0.018310546875, "learning_rate": 1.9135555555555555e-05, "loss": 2.2246, "mean_token_accuracy": 0.5769055737182498, "num_tokens": 19777388.0, "step": 1390 }, { "entropy": 1.2642895836383103, "epoch": 1.12, "grad_norm": 0.01123046875, "learning_rate": 1.9113333333333336e-05, "loss": 2.2314, "mean_token_accuracy": 0.5781507005915045, "num_tokens": 19923914.0, "step": 1400 }, { "epoch": 1.12, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 19923914.0, "eval_coding_runtime": 95.6535, "eval_coding_samples_per_second": 5.227, "eval_coding_steps_per_second": 2.614, "step": 1400 }, { "epoch": 1.12, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 19923914.0, "eval_math_runtime": 55.1965, "eval_math_samples_per_second": 9.059, "eval_math_steps_per_second": 4.529, "step": 1400 }, { "entropy": 1.3085532050579787, "epoch": 1.1280000000000001, "grad_norm": 0.01422119140625, "learning_rate": 1.9091111111111113e-05, "loss": 2.322, "mean_token_accuracy": 0.5657644450664521, "num_tokens": 20061623.0, "step": 1410 }, { "entropy": 1.2904092378914356, "epoch": 1.1360000000000001, "grad_norm": 0.017333984375, "learning_rate": 1.906888888888889e-05, "loss": 2.2418, "mean_token_accuracy": 0.5722202086821199, "num_tokens": 20201892.0, "step": 1420 }, { "entropy": 1.297779395803809, "epoch": 1.144, "grad_norm": 0.0078125, "learning_rate": 1.904666666666667e-05, "loss": 2.2403, "mean_token_accuracy": 0.5758582351729273, "num_tokens": 20343948.0, "step": 1430 }, { "entropy": 1.302032370120287, "epoch": 1.152, "grad_norm": 0.00811767578125, "learning_rate": 1.9024444444444446e-05, "loss": 2.2752, "mean_token_accuracy": 0.57330860812217, "num_tokens": 20490282.0, "step": 1440 }, { "entropy": 1.2779104180634022, "epoch": 1.16, "grad_norm": 0.0203857421875, "learning_rate": 1.9002222222222224e-05, "loss": 2.1719, "mean_token_accuracy": 0.5854279346764087, "num_tokens": 20641737.0, "step": 1450 }, { "entropy": 1.2778663296252488, "epoch": 1.168, "grad_norm": 0.025146484375, "learning_rate": 1.898e-05, "loss": 2.173, "mean_token_accuracy": 0.582743788510561, "num_tokens": 20785786.0, "step": 1460 }, { "entropy": 1.3081208609044552, "epoch": 1.176, "grad_norm": 0.015380859375, "learning_rate": 1.895777777777778e-05, "loss": 2.2775, "mean_token_accuracy": 0.5689503744244575, "num_tokens": 20928880.0, "step": 1470 }, { "entropy": 1.2751742608845233, "epoch": 1.184, "grad_norm": 0.0086669921875, "learning_rate": 1.8935555555555556e-05, "loss": 2.2062, "mean_token_accuracy": 0.5821780592203141, "num_tokens": 21074205.0, "step": 1480 }, { "entropy": 1.27163728736341, "epoch": 1.192, "grad_norm": 0.017822265625, "learning_rate": 1.8913333333333334e-05, "loss": 2.1817, "mean_token_accuracy": 0.5830474335700273, "num_tokens": 21222288.0, "step": 1490 }, { "entropy": 1.2738530434668065, "epoch": 1.2, "grad_norm": 0.011962890625, "learning_rate": 1.8891111111111115e-05, "loss": 2.1801, "mean_token_accuracy": 0.5803947079926729, "num_tokens": 21369159.0, "step": 1500 }, { "epoch": 1.2, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 21369159.0, "eval_coding_runtime": 95.695, "eval_coding_samples_per_second": 5.225, "eval_coding_steps_per_second": 2.612, "step": 1500 }, { "epoch": 1.2, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 21369159.0, "eval_math_runtime": 54.7644, "eval_math_samples_per_second": 9.13, "eval_math_steps_per_second": 4.565, "step": 1500 }, { "entropy": 1.3021067466586829, "epoch": 1.208, "grad_norm": 0.01422119140625, "learning_rate": 1.886888888888889e-05, "loss": 2.1707, "mean_token_accuracy": 0.5797995328903198, "num_tokens": 21511015.0, "step": 1510 }, { "entropy": 1.303629470616579, "epoch": 1.216, "grad_norm": 0.01251220703125, "learning_rate": 1.884666666666667e-05, "loss": 2.1907, "mean_token_accuracy": 0.577221317216754, "num_tokens": 21649178.0, "step": 1520 }, { "entropy": 1.2927171494811773, "epoch": 1.224, "grad_norm": 0.022216796875, "learning_rate": 1.8824444444444445e-05, "loss": 2.2467, "mean_token_accuracy": 0.5752095308154821, "num_tokens": 21790802.0, "step": 1530 }, { "entropy": 1.2608089979737997, "epoch": 1.232, "grad_norm": 0.01416015625, "learning_rate": 1.8802222222222226e-05, "loss": 2.1654, "mean_token_accuracy": 0.58616862334311, "num_tokens": 21930239.0, "step": 1540 }, { "entropy": 1.279138757288456, "epoch": 1.24, "grad_norm": 0.0164794921875, "learning_rate": 1.878e-05, "loss": 2.1775, "mean_token_accuracy": 0.579110798984766, "num_tokens": 22075270.0, "step": 1550 }, { "entropy": 1.2945632576942443, "epoch": 1.248, "grad_norm": 0.015380859375, "learning_rate": 1.875777777777778e-05, "loss": 2.2213, "mean_token_accuracy": 0.5734076691791415, "num_tokens": 22216387.0, "step": 1560 }, { "entropy": 1.314554738625884, "epoch": 1.256, "grad_norm": 0.021484375, "learning_rate": 1.873555555555556e-05, "loss": 2.2745, "mean_token_accuracy": 0.5707367097958922, "num_tokens": 22359813.0, "step": 1570 }, { "entropy": 1.32604789622128, "epoch": 1.264, "grad_norm": 0.011962890625, "learning_rate": 1.8713333333333336e-05, "loss": 2.25, "mean_token_accuracy": 0.5707180803641677, "num_tokens": 22501002.0, "step": 1580 }, { "entropy": 1.280422120168805, "epoch": 1.272, "grad_norm": 0.008056640625, "learning_rate": 1.8691111111111114e-05, "loss": 2.1828, "mean_token_accuracy": 0.581519584544003, "num_tokens": 22634652.0, "step": 1590 }, { "entropy": 1.2957364562898874, "epoch": 1.28, "grad_norm": 0.0185546875, "learning_rate": 1.866888888888889e-05, "loss": 2.2264, "mean_token_accuracy": 0.5763478111475706, "num_tokens": 22779682.0, "step": 1600 }, { "epoch": 1.28, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 22779682.0, "eval_coding_runtime": 95.6936, "eval_coding_samples_per_second": 5.225, "eval_coding_steps_per_second": 2.613, "step": 1600 }, { "epoch": 1.28, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 22779682.0, "eval_math_runtime": 55.2565, "eval_math_samples_per_second": 9.049, "eval_math_steps_per_second": 4.524, "step": 1600 }, { "entropy": 1.2870917547494174, "epoch": 1.288, "grad_norm": 0.01287841796875, "learning_rate": 1.864666666666667e-05, "loss": 2.1935, "mean_token_accuracy": 0.5807379197329283, "num_tokens": 22919374.0, "step": 1610 }, { "entropy": 1.268484988436103, "epoch": 1.296, "grad_norm": 0.01123046875, "learning_rate": 1.8624444444444446e-05, "loss": 2.183, "mean_token_accuracy": 0.5854335069656372, "num_tokens": 23057744.0, "step": 1620 }, { "entropy": 1.2904878597706557, "epoch": 1.304, "grad_norm": 0.0120849609375, "learning_rate": 1.8602222222222224e-05, "loss": 2.2618, "mean_token_accuracy": 0.5753191851079464, "num_tokens": 23201590.0, "step": 1630 }, { "entropy": 1.3022067859768867, "epoch": 1.312, "grad_norm": 0.0281982421875, "learning_rate": 1.858e-05, "loss": 2.2992, "mean_token_accuracy": 0.5704338105395437, "num_tokens": 23344644.0, "step": 1640 }, { "entropy": 1.3129540402442217, "epoch": 1.32, "grad_norm": 0.007171630859375, "learning_rate": 1.855777777777778e-05, "loss": 2.2085, "mean_token_accuracy": 0.5713952157646418, "num_tokens": 23485056.0, "step": 1650 }, { "entropy": 1.3017708092927933, "epoch": 1.328, "grad_norm": 0.008056640625, "learning_rate": 1.8535555555555557e-05, "loss": 2.1954, "mean_token_accuracy": 0.5752975823357701, "num_tokens": 23622405.0, "step": 1660 }, { "entropy": 1.289170829206705, "epoch": 1.336, "grad_norm": 0.00732421875, "learning_rate": 1.8513333333333335e-05, "loss": 2.2301, "mean_token_accuracy": 0.5756044138222933, "num_tokens": 23760641.0, "step": 1670 }, { "entropy": 1.262322897836566, "epoch": 1.3439999999999999, "grad_norm": 0.023193359375, "learning_rate": 1.8491111111111112e-05, "loss": 2.1401, "mean_token_accuracy": 0.5876206021755934, "num_tokens": 23899771.0, "step": 1680 }, { "entropy": 1.275562158226967, "epoch": 1.3519999999999999, "grad_norm": 0.00958251953125, "learning_rate": 1.846888888888889e-05, "loss": 2.1845, "mean_token_accuracy": 0.5808436365798115, "num_tokens": 24041995.0, "step": 1690 }, { "entropy": 1.2879225231707097, "epoch": 1.3599999999999999, "grad_norm": 0.010498046875, "learning_rate": 1.8446666666666667e-05, "loss": 2.2334, "mean_token_accuracy": 0.5763995088636875, "num_tokens": 24187023.0, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 24187023.0, "eval_coding_runtime": 95.7407, "eval_coding_samples_per_second": 5.222, "eval_coding_steps_per_second": 2.611, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 24187023.0, "eval_math_runtime": 55.2772, "eval_math_samples_per_second": 9.045, "eval_math_steps_per_second": 4.523, "step": 1700 }, { "entropy": 1.2914761129766703, "epoch": 1.3679999999999999, "grad_norm": 0.007598876953125, "learning_rate": 1.842444444444445e-05, "loss": 2.2328, "mean_token_accuracy": 0.5740274475887418, "num_tokens": 24325844.0, "step": 1710 }, { "entropy": 1.353976671770215, "epoch": 1.376, "grad_norm": 0.01495361328125, "learning_rate": 1.8402222222222223e-05, "loss": 2.3016, "mean_token_accuracy": 0.5633362045511603, "num_tokens": 24466132.0, "step": 1720 }, { "entropy": 1.2960197813808918, "epoch": 1.384, "grad_norm": 0.00830078125, "learning_rate": 1.8380000000000004e-05, "loss": 2.2201, "mean_token_accuracy": 0.5783739054575563, "num_tokens": 24606147.0, "step": 1730 }, { "entropy": 1.2861021481454373, "epoch": 1.392, "grad_norm": 0.00933837890625, "learning_rate": 1.8357777777777778e-05, "loss": 2.1876, "mean_token_accuracy": 0.5815011257305741, "num_tokens": 24748043.0, "step": 1740 }, { "entropy": 1.268753108009696, "epoch": 1.4, "grad_norm": 0.0189208984375, "learning_rate": 1.833555555555556e-05, "loss": 2.2244, "mean_token_accuracy": 0.5804416090250015, "num_tokens": 24894293.0, "step": 1750 }, { "entropy": 1.2929484587162734, "epoch": 1.408, "grad_norm": 0.021240234375, "learning_rate": 1.8313333333333333e-05, "loss": 2.2083, "mean_token_accuracy": 0.575320690125227, "num_tokens": 25036674.0, "step": 1760 }, { "entropy": 1.2982839655131102, "epoch": 1.416, "grad_norm": 0.021484375, "learning_rate": 1.8291111111111114e-05, "loss": 2.2784, "mean_token_accuracy": 0.5741079377010465, "num_tokens": 25178935.0, "step": 1770 }, { "entropy": 1.2535257052630187, "epoch": 1.424, "grad_norm": 0.007110595703125, "learning_rate": 1.8268888888888888e-05, "loss": 2.1463, "mean_token_accuracy": 0.5878499284386635, "num_tokens": 25324579.0, "step": 1780 }, { "entropy": 1.286781796067953, "epoch": 1.432, "grad_norm": 0.0107421875, "learning_rate": 1.824666666666667e-05, "loss": 2.2089, "mean_token_accuracy": 0.576385061070323, "num_tokens": 25466485.0, "step": 1790 }, { "entropy": 1.3003978207707405, "epoch": 1.44, "grad_norm": 0.016357421875, "learning_rate": 1.8224444444444447e-05, "loss": 2.2481, "mean_token_accuracy": 0.5751635741442442, "num_tokens": 25606824.0, "step": 1800 }, { "epoch": 1.44, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 25606824.0, "eval_coding_runtime": 95.6673, "eval_coding_samples_per_second": 5.226, "eval_coding_steps_per_second": 2.613, "step": 1800 }, { "epoch": 1.44, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 25606824.0, "eval_math_runtime": 55.2224, "eval_math_samples_per_second": 9.054, "eval_math_steps_per_second": 4.527, "step": 1800 }, { "entropy": 1.2908053163439035, "epoch": 1.448, "grad_norm": 0.00799560546875, "learning_rate": 1.8202222222222225e-05, "loss": 2.1957, "mean_token_accuracy": 0.5789953164756299, "num_tokens": 25745729.0, "step": 1810 }, { "entropy": 1.273376001417637, "epoch": 1.456, "grad_norm": 0.01336669921875, "learning_rate": 1.8180000000000002e-05, "loss": 2.1926, "mean_token_accuracy": 0.5779400140047073, "num_tokens": 25886396.0, "step": 1820 }, { "entropy": 1.2855514109134674, "epoch": 1.464, "grad_norm": 0.010498046875, "learning_rate": 1.815777777777778e-05, "loss": 2.2374, "mean_token_accuracy": 0.5767241906374693, "num_tokens": 26026227.0, "step": 1830 }, { "entropy": 1.2807271871715784, "epoch": 1.472, "grad_norm": 0.00823974609375, "learning_rate": 1.8135555555555557e-05, "loss": 2.1749, "mean_token_accuracy": 0.5822155047208071, "num_tokens": 26163618.0, "step": 1840 }, { "entropy": 1.3182869713753462, "epoch": 1.48, "grad_norm": 0.01318359375, "learning_rate": 1.8113333333333335e-05, "loss": 2.2544, "mean_token_accuracy": 0.5738100804388523, "num_tokens": 26299086.0, "step": 1850 }, { "entropy": 1.3167364660650491, "epoch": 1.488, "grad_norm": 0.01043701171875, "learning_rate": 1.8091111111111113e-05, "loss": 2.2364, "mean_token_accuracy": 0.5735991884022951, "num_tokens": 26438338.0, "step": 1860 }, { "entropy": 1.2824474953114986, "epoch": 1.496, "grad_norm": 0.0125732421875, "learning_rate": 1.806888888888889e-05, "loss": 2.2469, "mean_token_accuracy": 0.5744533261284233, "num_tokens": 26583001.0, "step": 1870 }, { "entropy": 1.277822620794177, "epoch": 1.504, "grad_norm": 0.018798828125, "learning_rate": 1.8046666666666668e-05, "loss": 2.2308, "mean_token_accuracy": 0.5790910394862294, "num_tokens": 26729255.0, "step": 1880 }, { "entropy": 1.2929816491901875, "epoch": 1.512, "grad_norm": 0.00946044921875, "learning_rate": 1.8024444444444445e-05, "loss": 2.2212, "mean_token_accuracy": 0.5757774453610182, "num_tokens": 26870139.0, "step": 1890 }, { "entropy": 1.28971153460443, "epoch": 1.52, "grad_norm": 0.010498046875, "learning_rate": 1.8002222222222223e-05, "loss": 2.2155, "mean_token_accuracy": 0.5779576146975159, "num_tokens": 27017935.0, "step": 1900 }, { "epoch": 1.52, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 27017935.0, "eval_coding_runtime": 95.8867, "eval_coding_samples_per_second": 5.214, "eval_coding_steps_per_second": 2.607, "step": 1900 }, { "epoch": 1.52, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 27017935.0, "eval_math_runtime": 55.1777, "eval_math_samples_per_second": 9.062, "eval_math_steps_per_second": 4.531, "step": 1900 }, { "entropy": 1.331830918043852, "epoch": 1.528, "grad_norm": 0.009033203125, "learning_rate": 1.798e-05, "loss": 2.2721, "mean_token_accuracy": 0.5682261852547527, "num_tokens": 27159322.0, "step": 1910 }, { "entropy": 1.2588583029806615, "epoch": 1.536, "grad_norm": 0.00848388671875, "learning_rate": 1.7957777777777778e-05, "loss": 2.1588, "mean_token_accuracy": 0.5847917202860117, "num_tokens": 27306339.0, "step": 1920 }, { "entropy": 1.2904524333775043, "epoch": 1.544, "grad_norm": 0.01312255859375, "learning_rate": 1.7935555555555556e-05, "loss": 2.2221, "mean_token_accuracy": 0.5756975743919611, "num_tokens": 27445653.0, "step": 1930 }, { "entropy": 1.2871752437204123, "epoch": 1.552, "grad_norm": 0.00830078125, "learning_rate": 1.7913333333333337e-05, "loss": 2.22, "mean_token_accuracy": 0.5743231695145369, "num_tokens": 27591959.0, "step": 1940 }, { "entropy": 1.2857405241578816, "epoch": 1.56, "grad_norm": 0.0101318359375, "learning_rate": 1.789111111111111e-05, "loss": 2.2188, "mean_token_accuracy": 0.5781919835135341, "num_tokens": 27736783.0, "step": 1950 }, { "entropy": 1.2675248205661773, "epoch": 1.568, "grad_norm": 0.009033203125, "learning_rate": 1.7868888888888892e-05, "loss": 2.2334, "mean_token_accuracy": 0.5779768938198686, "num_tokens": 27884398.0, "step": 1960 }, { "entropy": 1.3143243368715047, "epoch": 1.576, "grad_norm": 0.01068115234375, "learning_rate": 1.7846666666666666e-05, "loss": 2.2206, "mean_token_accuracy": 0.5736559564247727, "num_tokens": 28025205.0, "step": 1970 }, { "entropy": 1.2710258401930332, "epoch": 1.584, "grad_norm": 0.00933837890625, "learning_rate": 1.7824444444444447e-05, "loss": 2.1864, "mean_token_accuracy": 0.5820634830743074, "num_tokens": 28171274.0, "step": 1980 }, { "entropy": 1.2678415119647979, "epoch": 1.592, "grad_norm": 0.0164794921875, "learning_rate": 1.780222222222222e-05, "loss": 2.2105, "mean_token_accuracy": 0.5788672368973493, "num_tokens": 28311058.0, "step": 1990 }, { "entropy": 1.2863474819809197, "epoch": 1.6, "grad_norm": 0.0166015625, "learning_rate": 1.7780000000000003e-05, "loss": 2.2279, "mean_token_accuracy": 0.5786767426878214, "num_tokens": 28457624.0, "step": 2000 }, { "epoch": 1.6, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 28457624.0, "eval_coding_runtime": 95.6104, "eval_coding_samples_per_second": 5.23, "eval_coding_steps_per_second": 2.615, "step": 2000 }, { "epoch": 1.6, "eval_math_entropy": 1.2986711366176604, "eval_math_loss": 2.2429049015045166, "eval_math_mean_token_accuracy": 0.5744417759180069, "eval_math_num_tokens": 28457624.0, "eval_math_runtime": 54.9191, "eval_math_samples_per_second": 9.104, "eval_math_steps_per_second": 4.552, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.32515376937045e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }