| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.6, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.2628081854432822, |
| "epoch": 0.008, |
| "grad_norm": 0.0179443359375, |
| "learning_rate": 1.8e-07, |
| "loss": 2.1855, |
| "mean_token_accuracy": 0.584391780383885, |
| "num_tokens": 138146.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.2761256650090218, |
| "epoch": 0.016, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 3.8e-07, |
| "loss": 2.2119, |
| "mean_token_accuracy": 0.5769438467919826, |
| "num_tokens": 280941.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.30821207202971, |
| "epoch": 0.024, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 2.2252, |
| "mean_token_accuracy": 0.5782777238637209, |
| "num_tokens": 419127.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.2832397993654012, |
| "epoch": 0.032, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 7.8e-07, |
| "loss": 2.1845, |
| "mean_token_accuracy": 0.5787389608100056, |
| "num_tokens": 558056.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.281408105790615, |
| "epoch": 0.04, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 2.2214, |
| "mean_token_accuracy": 0.5814869659021497, |
| "num_tokens": 697556.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.2824515115469695, |
| "epoch": 0.048, |
| "grad_norm": 0.0206298828125, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 2.1897, |
| "mean_token_accuracy": 0.5798849314451218, |
| "num_tokens": 836753.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.2973103038966656, |
| "epoch": 0.056, |
| "grad_norm": 0.0216064453125, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 2.2424, |
| "mean_token_accuracy": 0.5715558419004083, |
| "num_tokens": 981959.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.2914881024509668, |
| "epoch": 0.064, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 2.2528, |
| "mean_token_accuracy": 0.5714382914826274, |
| "num_tokens": 1126446.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.2911656297743321, |
| "epoch": 0.072, |
| "grad_norm": 0.0145263671875, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 2.2655, |
| "mean_token_accuracy": 0.5728934396989643, |
| "num_tokens": 1273905.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.3042505994439124, |
| "epoch": 0.08, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.98e-06, |
| "loss": 2.2384, |
| "mean_token_accuracy": 0.574522521905601, |
| "num_tokens": 1413596.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 1413596.0, |
| "eval_coding_runtime": 96.113, |
| "eval_coding_samples_per_second": 5.202, |
| "eval_coding_steps_per_second": 2.601, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 1413596.0, |
| "eval_math_runtime": 55.2867, |
| "eval_math_samples_per_second": 9.044, |
| "eval_math_steps_per_second": 4.522, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.2642712604254485, |
| "epoch": 0.088, |
| "grad_norm": 0.0072021484375, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 2.1897, |
| "mean_token_accuracy": 0.5820998327806592, |
| "num_tokens": 1559450.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.3127624299377203, |
| "epoch": 0.096, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 2.38e-06, |
| "loss": 2.2696, |
| "mean_token_accuracy": 0.5712630080059171, |
| "num_tokens": 1701193.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.2562327686697246, |
| "epoch": 0.104, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 2.1593, |
| "mean_token_accuracy": 0.5853730095550418, |
| "num_tokens": 1840165.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.3122376587241888, |
| "epoch": 0.112, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 2.272, |
| "mean_token_accuracy": 0.5697157179936767, |
| "num_tokens": 1979232.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.296381675079465, |
| "epoch": 0.12, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 2.2493, |
| "mean_token_accuracy": 0.5788527738302947, |
| "num_tokens": 2125815.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.299617338180542, |
| "epoch": 0.128, |
| "grad_norm": 0.015380859375, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 2.2113, |
| "mean_token_accuracy": 0.5759535003453493, |
| "num_tokens": 2274177.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.2860674735158681, |
| "epoch": 0.136, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 2.2092, |
| "mean_token_accuracy": 0.574743240699172, |
| "num_tokens": 2412435.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.299461904540658, |
| "epoch": 0.144, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 3.58e-06, |
| "loss": 2.1965, |
| "mean_token_accuracy": 0.5797095803543926, |
| "num_tokens": 2548445.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.259111713245511, |
| "epoch": 0.152, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 2.1452, |
| "mean_token_accuracy": 0.5854375848546625, |
| "num_tokens": 2686462.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.2716298930346965, |
| "epoch": 0.16, |
| "grad_norm": 0.014892578125, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 2.2284, |
| "mean_token_accuracy": 0.5798038769513368, |
| "num_tokens": 2824418.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 2824418.0, |
| "eval_coding_runtime": 95.662, |
| "eval_coding_samples_per_second": 5.227, |
| "eval_coding_steps_per_second": 2.613, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 2824418.0, |
| "eval_math_runtime": 55.266, |
| "eval_math_samples_per_second": 9.047, |
| "eval_math_steps_per_second": 4.524, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.29471963532269, |
| "epoch": 0.168, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 4.18e-06, |
| "loss": 2.2605, |
| "mean_token_accuracy": 0.5726646112278104, |
| "num_tokens": 2966031.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.295415598154068, |
| "epoch": 0.176, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 4.38e-06, |
| "loss": 2.1967, |
| "mean_token_accuracy": 0.5751344438642263, |
| "num_tokens": 3110313.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.3239446703344584, |
| "epoch": 0.184, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 4.58e-06, |
| "loss": 2.2475, |
| "mean_token_accuracy": 0.5708544872701168, |
| "num_tokens": 3251139.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.3012913078069688, |
| "epoch": 0.192, |
| "grad_norm": 0.015625, |
| "learning_rate": 4.78e-06, |
| "loss": 2.2591, |
| "mean_token_accuracy": 0.5751949936151505, |
| "num_tokens": 3394170.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.2939962450414897, |
| "epoch": 0.2, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 2.2271, |
| "mean_token_accuracy": 0.5751136729493738, |
| "num_tokens": 3533571.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.3175367139279843, |
| "epoch": 0.208, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 5.18e-06, |
| "loss": 2.2477, |
| "mean_token_accuracy": 0.5703525686636567, |
| "num_tokens": 3673600.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.2911786384880544, |
| "epoch": 0.216, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 2.2607, |
| "mean_token_accuracy": 0.5701368801295758, |
| "num_tokens": 3817397.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.312684351950884, |
| "epoch": 0.224, |
| "grad_norm": 0.012451171875, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 2.2961, |
| "mean_token_accuracy": 0.5682467905804515, |
| "num_tokens": 3953732.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.276525003835559, |
| "epoch": 0.232, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 5.78e-06, |
| "loss": 2.2433, |
| "mean_token_accuracy": 0.5770688287913799, |
| "num_tokens": 4098989.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.2831152826547623, |
| "epoch": 0.24, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 5.98e-06, |
| "loss": 2.2829, |
| "mean_token_accuracy": 0.5744210470467805, |
| "num_tokens": 4243655.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 4243655.0, |
| "eval_coding_runtime": 95.6217, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 4243655.0, |
| "eval_math_runtime": 55.2698, |
| "eval_math_samples_per_second": 9.047, |
| "eval_math_steps_per_second": 4.523, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.2523589238524437, |
| "epoch": 0.248, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 6.18e-06, |
| "loss": 2.1458, |
| "mean_token_accuracy": 0.5875966742634773, |
| "num_tokens": 4387651.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.2836483523249627, |
| "epoch": 0.256, |
| "grad_norm": 0.01708984375, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 2.2171, |
| "mean_token_accuracy": 0.5771771775558591, |
| "num_tokens": 4531471.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.2900966588407754, |
| "epoch": 0.264, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 2.2279, |
| "mean_token_accuracy": 0.577950318530202, |
| "num_tokens": 4669389.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.2982714001089335, |
| "epoch": 0.272, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 2.2464, |
| "mean_token_accuracy": 0.5756928954273463, |
| "num_tokens": 4810284.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.278377178311348, |
| "epoch": 0.28, |
| "grad_norm": 0.00616455078125, |
| "learning_rate": 6.98e-06, |
| "loss": 2.172, |
| "mean_token_accuracy": 0.5843851236626506, |
| "num_tokens": 4955375.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.2722328055649996, |
| "epoch": 0.288, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 2.2143, |
| "mean_token_accuracy": 0.5805238038301468, |
| "num_tokens": 5095104.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.2866631541401148, |
| "epoch": 0.296, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 2.2041, |
| "mean_token_accuracy": 0.5798180354759097, |
| "num_tokens": 5240988.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.2868602696806193, |
| "epoch": 0.304, |
| "grad_norm": 0.0230712890625, |
| "learning_rate": 7.58e-06, |
| "loss": 2.2193, |
| "mean_token_accuracy": 0.578575404547155, |
| "num_tokens": 5383732.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.3216422777622938, |
| "epoch": 0.312, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 7.78e-06, |
| "loss": 2.2847, |
| "mean_token_accuracy": 0.5677898772060871, |
| "num_tokens": 5530666.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.2590181913226843, |
| "epoch": 0.32, |
| "grad_norm": 0.01171875, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 2.1719, |
| "mean_token_accuracy": 0.5848465450108051, |
| "num_tokens": 5676334.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 5676334.0, |
| "eval_coding_runtime": 95.6468, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 5676334.0, |
| "eval_math_runtime": 55.2877, |
| "eval_math_samples_per_second": 9.044, |
| "eval_math_steps_per_second": 4.522, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.3136477686464787, |
| "epoch": 0.328, |
| "grad_norm": 0.009521484375, |
| "learning_rate": 8.18e-06, |
| "loss": 2.2817, |
| "mean_token_accuracy": 0.5644237477332353, |
| "num_tokens": 5816488.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.296512581780553, |
| "epoch": 0.336, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 2.2127, |
| "mean_token_accuracy": 0.5752231312915683, |
| "num_tokens": 5958480.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.2964510951191186, |
| "epoch": 0.344, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 2.2215, |
| "mean_token_accuracy": 0.5778196537867188, |
| "num_tokens": 6102081.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.2612504959106445, |
| "epoch": 0.352, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 8.78e-06, |
| "loss": 2.1833, |
| "mean_token_accuracy": 0.5839220907539129, |
| "num_tokens": 6242161.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.2856441132724286, |
| "epoch": 0.36, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 8.98e-06, |
| "loss": 2.2414, |
| "mean_token_accuracy": 0.5754987616091967, |
| "num_tokens": 6377327.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.288279192149639, |
| "epoch": 0.368, |
| "grad_norm": 0.0111083984375, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 2.1968, |
| "mean_token_accuracy": 0.580965013988316, |
| "num_tokens": 6523679.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.28649943433702, |
| "epoch": 0.376, |
| "grad_norm": 0.007659912109375, |
| "learning_rate": 9.38e-06, |
| "loss": 2.2039, |
| "mean_token_accuracy": 0.5775518488138914, |
| "num_tokens": 6669278.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.2801726717501878, |
| "epoch": 0.384, |
| "grad_norm": 0.01068115234375, |
| "learning_rate": 9.58e-06, |
| "loss": 2.1868, |
| "mean_token_accuracy": 0.5841207258403301, |
| "num_tokens": 6810978.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.3082288071513175, |
| "epoch": 0.392, |
| "grad_norm": 0.01025390625, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 2.259, |
| "mean_token_accuracy": 0.5713578343391419, |
| "num_tokens": 6953929.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.3287261452525854, |
| "epoch": 0.4, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 2.2733, |
| "mean_token_accuracy": 0.5687045747414231, |
| "num_tokens": 7096903.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 7096903.0, |
| "eval_coding_runtime": 95.6459, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 7096903.0, |
| "eval_math_runtime": 54.9641, |
| "eval_math_samples_per_second": 9.097, |
| "eval_math_steps_per_second": 4.548, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.294243635982275, |
| "epoch": 0.408, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 1.018e-05, |
| "loss": 2.2428, |
| "mean_token_accuracy": 0.5740567265078426, |
| "num_tokens": 7239071.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.293457629531622, |
| "epoch": 0.416, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.038e-05, |
| "loss": 2.2374, |
| "mean_token_accuracy": 0.5770649960264563, |
| "num_tokens": 7377181.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.29749961681664, |
| "epoch": 0.424, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 2.1821, |
| "mean_token_accuracy": 0.5780813867226243, |
| "num_tokens": 7513035.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.2768937222659589, |
| "epoch": 0.432, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 2.2319, |
| "mean_token_accuracy": 0.5752488367259503, |
| "num_tokens": 7650523.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.27384728230536, |
| "epoch": 0.44, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 2.1808, |
| "mean_token_accuracy": 0.5803057106211782, |
| "num_tokens": 7794006.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.2649936627596616, |
| "epoch": 0.448, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 2.1932, |
| "mean_token_accuracy": 0.5795159580186009, |
| "num_tokens": 7936788.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.2891030061990023, |
| "epoch": 0.456, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.138e-05, |
| "loss": 2.2876, |
| "mean_token_accuracy": 0.5727174270898103, |
| "num_tokens": 8081035.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.3052824169397355, |
| "epoch": 0.464, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 2.2796, |
| "mean_token_accuracy": 0.5737911343574524, |
| "num_tokens": 8223147.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.2902213044464588, |
| "epoch": 0.472, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.178e-05, |
| "loss": 2.2199, |
| "mean_token_accuracy": 0.5761593583971262, |
| "num_tokens": 8367380.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.2913669254630804, |
| "epoch": 0.48, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.198e-05, |
| "loss": 2.2032, |
| "mean_token_accuracy": 0.581071512401104, |
| "num_tokens": 8506030.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 8506030.0, |
| "eval_coding_runtime": 95.6191, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 8506030.0, |
| "eval_math_runtime": 55.258, |
| "eval_math_samples_per_second": 9.048, |
| "eval_math_steps_per_second": 4.524, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.2780281711369752, |
| "epoch": 0.488, |
| "grad_norm": 0.0093994140625, |
| "learning_rate": 1.218e-05, |
| "loss": 2.1848, |
| "mean_token_accuracy": 0.5809735087677836, |
| "num_tokens": 8646384.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.2823347095400095, |
| "epoch": 0.496, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 2.2158, |
| "mean_token_accuracy": 0.5798389609903097, |
| "num_tokens": 8788726.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.3198146648705005, |
| "epoch": 0.504, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 2.301, |
| "mean_token_accuracy": 0.5685374505817891, |
| "num_tokens": 8934959.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.2555946599692107, |
| "epoch": 0.512, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 2.1431, |
| "mean_token_accuracy": 0.589201102592051, |
| "num_tokens": 9078039.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.2702158033847808, |
| "epoch": 0.52, |
| "grad_norm": 0.024169921875, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 2.1965, |
| "mean_token_accuracy": 0.5821724396198988, |
| "num_tokens": 9220298.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.277011101320386, |
| "epoch": 0.528, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 2.2097, |
| "mean_token_accuracy": 0.5796491207554937, |
| "num_tokens": 9373860.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.3105232425034046, |
| "epoch": 0.536, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 2.2839, |
| "mean_token_accuracy": 0.5706269716843962, |
| "num_tokens": 9515962.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.2719210147857667, |
| "epoch": 0.544, |
| "grad_norm": 0.011474609375, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 2.2322, |
| "mean_token_accuracy": 0.5780333954840898, |
| "num_tokens": 9660940.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.336498522385955, |
| "epoch": 0.552, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.378e-05, |
| "loss": 2.2697, |
| "mean_token_accuracy": 0.5707717033103108, |
| "num_tokens": 9793170.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.2782492753118277, |
| "epoch": 0.56, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.398e-05, |
| "loss": 2.1596, |
| "mean_token_accuracy": 0.583821403235197, |
| "num_tokens": 9932302.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 9932302.0, |
| "eval_coding_runtime": 95.6281, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 9932302.0, |
| "eval_math_runtime": 55.2345, |
| "eval_math_samples_per_second": 9.052, |
| "eval_math_steps_per_second": 4.526, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.2765720319002867, |
| "epoch": 0.568, |
| "grad_norm": 0.0135498046875, |
| "learning_rate": 1.418e-05, |
| "loss": 2.1618, |
| "mean_token_accuracy": 0.5836544273421168, |
| "num_tokens": 10072362.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.3300136230885982, |
| "epoch": 0.576, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 2.3456, |
| "mean_token_accuracy": 0.5648340210318565, |
| "num_tokens": 10215462.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.2994835320860147, |
| "epoch": 0.584, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 2.2414, |
| "mean_token_accuracy": 0.5734376270323992, |
| "num_tokens": 10354952.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.2754272140562535, |
| "epoch": 0.592, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 1.478e-05, |
| "loss": 2.1968, |
| "mean_token_accuracy": 0.5793214218690992, |
| "num_tokens": 10504396.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.2948825549334289, |
| "epoch": 0.6, |
| "grad_norm": 0.0167236328125, |
| "learning_rate": 1.498e-05, |
| "loss": 2.209, |
| "mean_token_accuracy": 0.5778197368606925, |
| "num_tokens": 10644641.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.309036424010992, |
| "epoch": 0.608, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 2.2825, |
| "mean_token_accuracy": 0.5746870728209614, |
| "num_tokens": 10793126.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.2668032895773649, |
| "epoch": 0.616, |
| "grad_norm": 0.01904296875, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 2.1989, |
| "mean_token_accuracy": 0.5822413913905621, |
| "num_tokens": 10941399.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.2946851935237647, |
| "epoch": 0.624, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 2.2376, |
| "mean_token_accuracy": 0.574727876111865, |
| "num_tokens": 11081768.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.2720156278461219, |
| "epoch": 0.632, |
| "grad_norm": 0.00726318359375, |
| "learning_rate": 1.578e-05, |
| "loss": 2.2091, |
| "mean_token_accuracy": 0.5791729586198926, |
| "num_tokens": 11226419.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.2966302383691073, |
| "epoch": 0.64, |
| "grad_norm": 0.0103759765625, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 2.2476, |
| "mean_token_accuracy": 0.5757431708276272, |
| "num_tokens": 11370320.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 11370320.0, |
| "eval_coding_runtime": 95.6426, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 11370320.0, |
| "eval_math_runtime": 55.2405, |
| "eval_math_samples_per_second": 9.051, |
| "eval_math_steps_per_second": 4.526, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.2639863461256027, |
| "epoch": 0.648, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.618e-05, |
| "loss": 2.2448, |
| "mean_token_accuracy": 0.5776080982759595, |
| "num_tokens": 11515332.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.2712122540920974, |
| "epoch": 0.656, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.638e-05, |
| "loss": 2.1846, |
| "mean_token_accuracy": 0.5818307081237435, |
| "num_tokens": 11657835.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.3265905980020762, |
| "epoch": 0.664, |
| "grad_norm": 0.01007080078125, |
| "learning_rate": 1.658e-05, |
| "loss": 2.2425, |
| "mean_token_accuracy": 0.5686553552746773, |
| "num_tokens": 11799325.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.291335643082857, |
| "epoch": 0.672, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 2.2767, |
| "mean_token_accuracy": 0.5738575926050544, |
| "num_tokens": 11949262.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 1.2961845736950637, |
| "epoch": 0.68, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.698e-05, |
| "loss": 2.2309, |
| "mean_token_accuracy": 0.5745207356289029, |
| "num_tokens": 12090029.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.301833227276802, |
| "epoch": 0.688, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 1.718e-05, |
| "loss": 2.2309, |
| "mean_token_accuracy": 0.5745828906074166, |
| "num_tokens": 12227640.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.3095486916601657, |
| "epoch": 0.696, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 2.2881, |
| "mean_token_accuracy": 0.5654223879799247, |
| "num_tokens": 12371560.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 1.2842330873012542, |
| "epoch": 0.704, |
| "grad_norm": 0.01055908203125, |
| "learning_rate": 1.758e-05, |
| "loss": 2.2148, |
| "mean_token_accuracy": 0.5752653010189533, |
| "num_tokens": 12516641.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 1.3211415249854326, |
| "epoch": 0.712, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 2.2159, |
| "mean_token_accuracy": 0.571710304543376, |
| "num_tokens": 12656623.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.2885905254632235, |
| "epoch": 0.72, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 1.798e-05, |
| "loss": 2.2299, |
| "mean_token_accuracy": 0.5760715741664171, |
| "num_tokens": 12793343.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 12793343.0, |
| "eval_coding_runtime": 95.6394, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 12793343.0, |
| "eval_math_runtime": 55.2679, |
| "eval_math_samples_per_second": 9.047, |
| "eval_math_steps_per_second": 4.523, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.298223714157939, |
| "epoch": 0.728, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 2.298, |
| "mean_token_accuracy": 0.5706229966133833, |
| "num_tokens": 12932479.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.2950578905642032, |
| "epoch": 0.736, |
| "grad_norm": 0.01239013671875, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 2.234, |
| "mean_token_accuracy": 0.5742258787155151, |
| "num_tokens": 13077981.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.302649038657546, |
| "epoch": 0.744, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 1.858e-05, |
| "loss": 2.2467, |
| "mean_token_accuracy": 0.5736184434965252, |
| "num_tokens": 13217888.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 1.2875377755612134, |
| "epoch": 0.752, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 1.878e-05, |
| "loss": 2.2132, |
| "mean_token_accuracy": 0.575059812888503, |
| "num_tokens": 13358957.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.2615925706923008, |
| "epoch": 0.76, |
| "grad_norm": 0.0096435546875, |
| "learning_rate": 1.898e-05, |
| "loss": 2.2597, |
| "mean_token_accuracy": 0.5791866419836879, |
| "num_tokens": 13510850.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 1.269580041617155, |
| "epoch": 0.768, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 1.918e-05, |
| "loss": 2.2027, |
| "mean_token_accuracy": 0.5817427391186356, |
| "num_tokens": 13653412.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 1.3158201549202204, |
| "epoch": 0.776, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.938e-05, |
| "loss": 2.2614, |
| "mean_token_accuracy": 0.5728385889902711, |
| "num_tokens": 13794252.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.2957237258553504, |
| "epoch": 0.784, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 2.2335, |
| "mean_token_accuracy": 0.5786556912586093, |
| "num_tokens": 13940856.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 1.285663042962551, |
| "epoch": 0.792, |
| "grad_norm": 0.020263671875, |
| "learning_rate": 1.978e-05, |
| "loss": 2.2096, |
| "mean_token_accuracy": 0.5792105477303267, |
| "num_tokens": 14086117.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 1.3126300945878029, |
| "epoch": 0.8, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 2.3165, |
| "mean_token_accuracy": 0.5648046223446727, |
| "num_tokens": 14230754.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 14230754.0, |
| "eval_coding_runtime": 96.0895, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 14230754.0, |
| "eval_math_runtime": 54.9639, |
| "eval_math_samples_per_second": 9.097, |
| "eval_math_steps_per_second": 4.548, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.29276928268373, |
| "epoch": 0.808, |
| "grad_norm": 0.0126953125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 2.2326, |
| "mean_token_accuracy": 0.5718907188624144, |
| "num_tokens": 14374548.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.3095116015523671, |
| "epoch": 0.816, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 2.2316, |
| "mean_token_accuracy": 0.5726707259193062, |
| "num_tokens": 14519893.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 1.2931621059775353, |
| "epoch": 0.824, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 2.2255, |
| "mean_token_accuracy": 0.5791132707148791, |
| "num_tokens": 14664170.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 1.3060086965560913, |
| "epoch": 0.832, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 2.2366, |
| "mean_token_accuracy": 0.5756228402256965, |
| "num_tokens": 14805088.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 1.29172132499516, |
| "epoch": 0.84, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 2.2138, |
| "mean_token_accuracy": 0.5763768455013633, |
| "num_tokens": 14944711.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 1.3122717272490263, |
| "epoch": 0.848, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 2.2005, |
| "mean_token_accuracy": 0.5786851424723863, |
| "num_tokens": 15086306.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 1.3352035760879517, |
| "epoch": 0.856, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 2.3066, |
| "mean_token_accuracy": 0.5666656039655209, |
| "num_tokens": 15228872.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 1.2667071226984263, |
| "epoch": 0.864, |
| "grad_norm": 0.01251220703125, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 2.1787, |
| "mean_token_accuracy": 0.5817643126472831, |
| "num_tokens": 15370985.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 1.3015164621174335, |
| "epoch": 0.872, |
| "grad_norm": 0.0103759765625, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 2.2554, |
| "mean_token_accuracy": 0.5708118518814445, |
| "num_tokens": 15513360.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 1.3232146605849266, |
| "epoch": 0.88, |
| "grad_norm": 0.01025390625, |
| "learning_rate": 1.978e-05, |
| "loss": 2.222, |
| "mean_token_accuracy": 0.5751912448555231, |
| "num_tokens": 15653836.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 15653836.0, |
| "eval_coding_runtime": 95.6833, |
| "eval_coding_samples_per_second": 5.226, |
| "eval_coding_steps_per_second": 2.613, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 15653836.0, |
| "eval_math_runtime": 55.2235, |
| "eval_math_samples_per_second": 9.054, |
| "eval_math_steps_per_second": 4.527, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.2944145441055297, |
| "epoch": 0.888, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 2.2076, |
| "mean_token_accuracy": 0.579839687794447, |
| "num_tokens": 15791102.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 1.3253925543278455, |
| "epoch": 0.896, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 2.2785, |
| "mean_token_accuracy": 0.5721883840858937, |
| "num_tokens": 15932179.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 1.2849542308598756, |
| "epoch": 0.904, |
| "grad_norm": 0.033203125, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 2.2034, |
| "mean_token_accuracy": 0.576960513740778, |
| "num_tokens": 16074409.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 1.2730964332818986, |
| "epoch": 0.912, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 2.1835, |
| "mean_token_accuracy": 0.5821036446839571, |
| "num_tokens": 16219842.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 1.2843450970947743, |
| "epoch": 0.92, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 2.1674, |
| "mean_token_accuracy": 0.5814145233482122, |
| "num_tokens": 16359249.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 1.2754333186894655, |
| "epoch": 0.928, |
| "grad_norm": 0.011474609375, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 2.176, |
| "mean_token_accuracy": 0.5827303070574998, |
| "num_tokens": 16499842.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 1.301246513798833, |
| "epoch": 0.936, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 2.2071, |
| "mean_token_accuracy": 0.5731328945606947, |
| "num_tokens": 16640499.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 1.2947144400328399, |
| "epoch": 0.944, |
| "grad_norm": 0.01239013671875, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 2.2016, |
| "mean_token_accuracy": 0.5806368073448539, |
| "num_tokens": 16781882.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 1.2729844357818365, |
| "epoch": 0.952, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 2.157, |
| "mean_token_accuracy": 0.5873358538374305, |
| "num_tokens": 16922913.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 1.3273278027772903, |
| "epoch": 0.96, |
| "grad_norm": 0.0133056640625, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 2.2697, |
| "mean_token_accuracy": 0.5692729642614722, |
| "num_tokens": 17067407.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 17067407.0, |
| "eval_coding_runtime": 95.6335, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 17067407.0, |
| "eval_math_runtime": 55.2304, |
| "eval_math_samples_per_second": 9.053, |
| "eval_math_steps_per_second": 4.526, |
| "step": 1200 |
| }, |
| { |
| "entropy": 1.2900268983095884, |
| "epoch": 0.968, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 2.1958, |
| "mean_token_accuracy": 0.5768731584772467, |
| "num_tokens": 17211153.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 1.30126980766654, |
| "epoch": 0.976, |
| "grad_norm": 0.0245361328125, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 2.2313, |
| "mean_token_accuracy": 0.5746001824736595, |
| "num_tokens": 17350994.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 1.294421959668398, |
| "epoch": 0.984, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 2.2481, |
| "mean_token_accuracy": 0.5734764914959669, |
| "num_tokens": 17492960.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 1.3181488186120986, |
| "epoch": 0.992, |
| "grad_norm": 0.030517578125, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 2.3008, |
| "mean_token_accuracy": 0.5649250147864222, |
| "num_tokens": 17637514.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 1.3004115797579288, |
| "epoch": 1.0, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 2.2079, |
| "mean_token_accuracy": 0.5754020158201456, |
| "num_tokens": 17780432.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 1.2847444698214532, |
| "epoch": 1.008, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 2.1845, |
| "mean_token_accuracy": 0.5801492627710104, |
| "num_tokens": 17926570.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 1.2932130094617604, |
| "epoch": 1.016, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 2.2089, |
| "mean_token_accuracy": 0.5793235875666142, |
| "num_tokens": 18070722.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 1.2891677081584931, |
| "epoch": 1.024, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.938e-05, |
| "loss": 2.2189, |
| "mean_token_accuracy": 0.5802426967769861, |
| "num_tokens": 18207652.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 1.3298017375171185, |
| "epoch": 1.032, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 2.2569, |
| "mean_token_accuracy": 0.5690007250756025, |
| "num_tokens": 18349030.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 1.2888877458870411, |
| "epoch": 1.04, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 2.2469, |
| "mean_token_accuracy": 0.5769451957195997, |
| "num_tokens": 18484931.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 18484931.0, |
| "eval_coding_runtime": 95.6088, |
| "eval_coding_samples_per_second": 5.23, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 18484931.0, |
| "eval_math_runtime": 55.238, |
| "eval_math_samples_per_second": 9.052, |
| "eval_math_steps_per_second": 4.526, |
| "step": 1300 |
| }, |
| { |
| "entropy": 1.3006268568336963, |
| "epoch": 1.048, |
| "grad_norm": 0.01495361328125, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 2.2672, |
| "mean_token_accuracy": 0.5714229056611657, |
| "num_tokens": 18630034.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 1.279826521128416, |
| "epoch": 1.056, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 2.2346, |
| "mean_token_accuracy": 0.5772927038371563, |
| "num_tokens": 18773457.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 1.311249329522252, |
| "epoch": 1.064, |
| "grad_norm": 0.0126953125, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 2.3087, |
| "mean_token_accuracy": 0.5693608418107032, |
| "num_tokens": 18918389.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 1.2683462664484977, |
| "epoch": 1.072, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 2.225, |
| "mean_token_accuracy": 0.5798949740827084, |
| "num_tokens": 19055365.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 1.3234039679169656, |
| "epoch": 1.08, |
| "grad_norm": 0.0096435546875, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 2.2342, |
| "mean_token_accuracy": 0.572484284453094, |
| "num_tokens": 19202006.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 1.2823971390724183, |
| "epoch": 1.088, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 2.2072, |
| "mean_token_accuracy": 0.5814019978046417, |
| "num_tokens": 19345730.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 1.328789808228612, |
| "epoch": 1.096, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.918e-05, |
| "loss": 2.274, |
| "mean_token_accuracy": 0.5689424321055412, |
| "num_tokens": 19494084.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 1.2878280565142632, |
| "epoch": 1.104, |
| "grad_norm": 0.01165771484375, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 2.2215, |
| "mean_token_accuracy": 0.5786661734804511, |
| "num_tokens": 19637390.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 1.2910890705883502, |
| "epoch": 1.112, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 2.2246, |
| "mean_token_accuracy": 0.5769055737182498, |
| "num_tokens": 19777388.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 1.2642895836383103, |
| "epoch": 1.12, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 2.2314, |
| "mean_token_accuracy": 0.5781507005915045, |
| "num_tokens": 19923914.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 19923914.0, |
| "eval_coding_runtime": 95.6535, |
| "eval_coding_samples_per_second": 5.227, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 19923914.0, |
| "eval_math_runtime": 55.1965, |
| "eval_math_samples_per_second": 9.059, |
| "eval_math_steps_per_second": 4.529, |
| "step": 1400 |
| }, |
| { |
| "entropy": 1.3085532050579787, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 2.322, |
| "mean_token_accuracy": 0.5657644450664521, |
| "num_tokens": 20061623.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 1.2904092378914356, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 2.2418, |
| "mean_token_accuracy": 0.5722202086821199, |
| "num_tokens": 20201892.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 1.297779395803809, |
| "epoch": 1.144, |
| "grad_norm": 0.0078125, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 2.2403, |
| "mean_token_accuracy": 0.5758582351729273, |
| "num_tokens": 20343948.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 1.302032370120287, |
| "epoch": 1.152, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 2.2752, |
| "mean_token_accuracy": 0.57330860812217, |
| "num_tokens": 20490282.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 1.2779104180634022, |
| "epoch": 1.16, |
| "grad_norm": 0.0203857421875, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 2.1719, |
| "mean_token_accuracy": 0.5854279346764087, |
| "num_tokens": 20641737.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 1.2778663296252488, |
| "epoch": 1.168, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 1.898e-05, |
| "loss": 2.173, |
| "mean_token_accuracy": 0.582743788510561, |
| "num_tokens": 20785786.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 1.3081208609044552, |
| "epoch": 1.176, |
| "grad_norm": 0.015380859375, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 2.2775, |
| "mean_token_accuracy": 0.5689503744244575, |
| "num_tokens": 20928880.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 1.2751742608845233, |
| "epoch": 1.184, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 2.2062, |
| "mean_token_accuracy": 0.5821780592203141, |
| "num_tokens": 21074205.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 1.27163728736341, |
| "epoch": 1.192, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 2.1817, |
| "mean_token_accuracy": 0.5830474335700273, |
| "num_tokens": 21222288.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 1.2738530434668065, |
| "epoch": 1.2, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 2.1801, |
| "mean_token_accuracy": 0.5803947079926729, |
| "num_tokens": 21369159.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 21369159.0, |
| "eval_coding_runtime": 95.695, |
| "eval_coding_samples_per_second": 5.225, |
| "eval_coding_steps_per_second": 2.612, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 21369159.0, |
| "eval_math_runtime": 54.7644, |
| "eval_math_samples_per_second": 9.13, |
| "eval_math_steps_per_second": 4.565, |
| "step": 1500 |
| }, |
| { |
| "entropy": 1.3021067466586829, |
| "epoch": 1.208, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 2.1707, |
| "mean_token_accuracy": 0.5797995328903198, |
| "num_tokens": 21511015.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 1.303629470616579, |
| "epoch": 1.216, |
| "grad_norm": 0.01251220703125, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 2.1907, |
| "mean_token_accuracy": 0.577221317216754, |
| "num_tokens": 21649178.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 1.2927171494811773, |
| "epoch": 1.224, |
| "grad_norm": 0.022216796875, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 2.2467, |
| "mean_token_accuracy": 0.5752095308154821, |
| "num_tokens": 21790802.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 1.2608089979737997, |
| "epoch": 1.232, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 2.1654, |
| "mean_token_accuracy": 0.58616862334311, |
| "num_tokens": 21930239.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 1.279138757288456, |
| "epoch": 1.24, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.878e-05, |
| "loss": 2.1775, |
| "mean_token_accuracy": 0.579110798984766, |
| "num_tokens": 22075270.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 1.2945632576942443, |
| "epoch": 1.248, |
| "grad_norm": 0.015380859375, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 2.2213, |
| "mean_token_accuracy": 0.5734076691791415, |
| "num_tokens": 22216387.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 1.314554738625884, |
| "epoch": 1.256, |
| "grad_norm": 0.021484375, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 2.2745, |
| "mean_token_accuracy": 0.5707367097958922, |
| "num_tokens": 22359813.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 1.32604789622128, |
| "epoch": 1.264, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 2.25, |
| "mean_token_accuracy": 0.5707180803641677, |
| "num_tokens": 22501002.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 1.280422120168805, |
| "epoch": 1.272, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 2.1828, |
| "mean_token_accuracy": 0.581519584544003, |
| "num_tokens": 22634652.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 1.2957364562898874, |
| "epoch": 1.28, |
| "grad_norm": 0.0185546875, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 2.2264, |
| "mean_token_accuracy": 0.5763478111475706, |
| "num_tokens": 22779682.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 22779682.0, |
| "eval_coding_runtime": 95.6936, |
| "eval_coding_samples_per_second": 5.225, |
| "eval_coding_steps_per_second": 2.613, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 22779682.0, |
| "eval_math_runtime": 55.2565, |
| "eval_math_samples_per_second": 9.049, |
| "eval_math_steps_per_second": 4.524, |
| "step": 1600 |
| }, |
| { |
| "entropy": 1.2870917547494174, |
| "epoch": 1.288, |
| "grad_norm": 0.01287841796875, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 2.1935, |
| "mean_token_accuracy": 0.5807379197329283, |
| "num_tokens": 22919374.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 1.268484988436103, |
| "epoch": 1.296, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 2.183, |
| "mean_token_accuracy": 0.5854335069656372, |
| "num_tokens": 23057744.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 1.2904878597706557, |
| "epoch": 1.304, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 2.2618, |
| "mean_token_accuracy": 0.5753191851079464, |
| "num_tokens": 23201590.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 1.3022067859768867, |
| "epoch": 1.312, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 1.858e-05, |
| "loss": 2.2992, |
| "mean_token_accuracy": 0.5704338105395437, |
| "num_tokens": 23344644.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 1.3129540402442217, |
| "epoch": 1.32, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 2.2085, |
| "mean_token_accuracy": 0.5713952157646418, |
| "num_tokens": 23485056.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 1.3017708092927933, |
| "epoch": 1.328, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 2.1954, |
| "mean_token_accuracy": 0.5752975823357701, |
| "num_tokens": 23622405.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 1.289170829206705, |
| "epoch": 1.336, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 2.2301, |
| "mean_token_accuracy": 0.5756044138222933, |
| "num_tokens": 23760641.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 1.262322897836566, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.023193359375, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 2.1401, |
| "mean_token_accuracy": 0.5876206021755934, |
| "num_tokens": 23899771.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 1.275562158226967, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 2.1845, |
| "mean_token_accuracy": 0.5808436365798115, |
| "num_tokens": 24041995.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 1.2879225231707097, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 2.2334, |
| "mean_token_accuracy": 0.5763995088636875, |
| "num_tokens": 24187023.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 24187023.0, |
| "eval_coding_runtime": 95.7407, |
| "eval_coding_samples_per_second": 5.222, |
| "eval_coding_steps_per_second": 2.611, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 24187023.0, |
| "eval_math_runtime": 55.2772, |
| "eval_math_samples_per_second": 9.045, |
| "eval_math_steps_per_second": 4.523, |
| "step": 1700 |
| }, |
| { |
| "entropy": 1.2914761129766703, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 2.2328, |
| "mean_token_accuracy": 0.5740274475887418, |
| "num_tokens": 24325844.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 1.353976671770215, |
| "epoch": 1.376, |
| "grad_norm": 0.01495361328125, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 2.3016, |
| "mean_token_accuracy": 0.5633362045511603, |
| "num_tokens": 24466132.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 1.2960197813808918, |
| "epoch": 1.384, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 2.2201, |
| "mean_token_accuracy": 0.5783739054575563, |
| "num_tokens": 24606147.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 1.2861021481454373, |
| "epoch": 1.392, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 2.1876, |
| "mean_token_accuracy": 0.5815011257305741, |
| "num_tokens": 24748043.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 1.268753108009696, |
| "epoch": 1.4, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 2.2244, |
| "mean_token_accuracy": 0.5804416090250015, |
| "num_tokens": 24894293.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 1.2929484587162734, |
| "epoch": 1.408, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 2.2083, |
| "mean_token_accuracy": 0.575320690125227, |
| "num_tokens": 25036674.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 1.2982839655131102, |
| "epoch": 1.416, |
| "grad_norm": 0.021484375, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 2.2784, |
| "mean_token_accuracy": 0.5741079377010465, |
| "num_tokens": 25178935.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 1.2535257052630187, |
| "epoch": 1.424, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 2.1463, |
| "mean_token_accuracy": 0.5878499284386635, |
| "num_tokens": 25324579.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 1.286781796067953, |
| "epoch": 1.432, |
| "grad_norm": 0.0107421875, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 2.2089, |
| "mean_token_accuracy": 0.576385061070323, |
| "num_tokens": 25466485.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 1.3003978207707405, |
| "epoch": 1.44, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 2.2481, |
| "mean_token_accuracy": 0.5751635741442442, |
| "num_tokens": 25606824.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 25606824.0, |
| "eval_coding_runtime": 95.6673, |
| "eval_coding_samples_per_second": 5.226, |
| "eval_coding_steps_per_second": 2.613, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 25606824.0, |
| "eval_math_runtime": 55.2224, |
| "eval_math_samples_per_second": 9.054, |
| "eval_math_steps_per_second": 4.527, |
| "step": 1800 |
| }, |
| { |
| "entropy": 1.2908053163439035, |
| "epoch": 1.448, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 2.1957, |
| "mean_token_accuracy": 0.5789953164756299, |
| "num_tokens": 25745729.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 1.273376001417637, |
| "epoch": 1.456, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 2.1926, |
| "mean_token_accuracy": 0.5779400140047073, |
| "num_tokens": 25886396.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 1.2855514109134674, |
| "epoch": 1.464, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 2.2374, |
| "mean_token_accuracy": 0.5767241906374693, |
| "num_tokens": 26026227.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 1.2807271871715784, |
| "epoch": 1.472, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 2.1749, |
| "mean_token_accuracy": 0.5822155047208071, |
| "num_tokens": 26163618.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 1.3182869713753462, |
| "epoch": 1.48, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 2.2544, |
| "mean_token_accuracy": 0.5738100804388523, |
| "num_tokens": 26299086.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 1.3167364660650491, |
| "epoch": 1.488, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 2.2364, |
| "mean_token_accuracy": 0.5735991884022951, |
| "num_tokens": 26438338.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 1.2824474953114986, |
| "epoch": 1.496, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 2.2469, |
| "mean_token_accuracy": 0.5744533261284233, |
| "num_tokens": 26583001.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 1.277822620794177, |
| "epoch": 1.504, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 2.2308, |
| "mean_token_accuracy": 0.5790910394862294, |
| "num_tokens": 26729255.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 1.2929816491901875, |
| "epoch": 1.512, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 2.2212, |
| "mean_token_accuracy": 0.5757774453610182, |
| "num_tokens": 26870139.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 1.28971153460443, |
| "epoch": 1.52, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 2.2155, |
| "mean_token_accuracy": 0.5779576146975159, |
| "num_tokens": 27017935.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 27017935.0, |
| "eval_coding_runtime": 95.8867, |
| "eval_coding_samples_per_second": 5.214, |
| "eval_coding_steps_per_second": 2.607, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 27017935.0, |
| "eval_math_runtime": 55.1777, |
| "eval_math_samples_per_second": 9.062, |
| "eval_math_steps_per_second": 4.531, |
| "step": 1900 |
| }, |
| { |
| "entropy": 1.331830918043852, |
| "epoch": 1.528, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.798e-05, |
| "loss": 2.2721, |
| "mean_token_accuracy": 0.5682261852547527, |
| "num_tokens": 27159322.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 1.2588583029806615, |
| "epoch": 1.536, |
| "grad_norm": 0.00848388671875, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 2.1588, |
| "mean_token_accuracy": 0.5847917202860117, |
| "num_tokens": 27306339.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 1.2904524333775043, |
| "epoch": 1.544, |
| "grad_norm": 0.01312255859375, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 2.2221, |
| "mean_token_accuracy": 0.5756975743919611, |
| "num_tokens": 27445653.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 1.2871752437204123, |
| "epoch": 1.552, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 2.22, |
| "mean_token_accuracy": 0.5743231695145369, |
| "num_tokens": 27591959.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 1.2857405241578816, |
| "epoch": 1.56, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 2.2188, |
| "mean_token_accuracy": 0.5781919835135341, |
| "num_tokens": 27736783.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 1.2675248205661773, |
| "epoch": 1.568, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 2.2334, |
| "mean_token_accuracy": 0.5779768938198686, |
| "num_tokens": 27884398.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 1.3143243368715047, |
| "epoch": 1.576, |
| "grad_norm": 0.01068115234375, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 2.2206, |
| "mean_token_accuracy": 0.5736559564247727, |
| "num_tokens": 28025205.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 1.2710258401930332, |
| "epoch": 1.584, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 2.1864, |
| "mean_token_accuracy": 0.5820634830743074, |
| "num_tokens": 28171274.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 1.2678415119647979, |
| "epoch": 1.592, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 2.2105, |
| "mean_token_accuracy": 0.5788672368973493, |
| "num_tokens": 28311058.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 1.2863474819809197, |
| "epoch": 1.6, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 2.2279, |
| "mean_token_accuracy": 0.5786767426878214, |
| "num_tokens": 28457624.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 28457624.0, |
| "eval_coding_runtime": 95.6104, |
| "eval_coding_samples_per_second": 5.23, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_math_entropy": 1.2986711366176604, |
| "eval_math_loss": 2.2429049015045166, |
| "eval_math_mean_token_accuracy": 0.5744417759180069, |
| "eval_math_num_tokens": 28457624.0, |
| "eval_math_runtime": 54.9191, |
| "eval_math_samples_per_second": 9.104, |
| "eval_math_steps_per_second": 4.552, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.32515376937045e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|