{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6, "eval_steps": 100, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7581225659698247, "epoch": 0.004, "grad_norm": 101.5, "learning_rate": 1.8e-07, "loss": 18.1141, "mean_token_accuracy": 0.7393306585028767, "num_tokens": 72542.0, "step": 10 }, { "entropy": 0.7520309612154961, "epoch": 0.008, "grad_norm": 162.0, "learning_rate": 3.8e-07, "loss": 18.2441, "mean_token_accuracy": 0.7392390985041857, "num_tokens": 138146.0, "step": 20 }, { "entropy": 0.741816989146173, "epoch": 0.012, "grad_norm": 92.5, "learning_rate": 5.800000000000001e-07, "loss": 17.8539, "mean_token_accuracy": 0.7405363619327545, "num_tokens": 208409.0, "step": 30 }, { "entropy": 0.7636421315371991, "epoch": 0.016, "grad_norm": 88.0, "learning_rate": 7.8e-07, "loss": 18.142, "mean_token_accuracy": 0.7400334309786558, "num_tokens": 280941.0, "step": 40 }, { "entropy": 0.7905357209965587, "epoch": 0.02, "grad_norm": 108.0, "learning_rate": 9.800000000000001e-07, "loss": 18.1041, "mean_token_accuracy": 0.7354144401848316, "num_tokens": 349731.0, "step": 50 }, { "entropy": 0.8223027169704438, "epoch": 0.024, "grad_norm": 112.5, "learning_rate": 1.1800000000000001e-06, "loss": 18.4599, "mean_token_accuracy": 0.731158659607172, "num_tokens": 419127.0, "step": 60 }, { "entropy": 0.7908559931442142, "epoch": 0.028, "grad_norm": 106.0, "learning_rate": 1.3800000000000001e-06, "loss": 17.5053, "mean_token_accuracy": 0.7440328601747751, "num_tokens": 489016.0, "step": 70 }, { "entropy": 0.8229548260569572, "epoch": 0.032, "grad_norm": 86.0, "learning_rate": 1.5800000000000001e-06, "loss": 17.3714, "mean_token_accuracy": 0.7438380591571331, "num_tokens": 558056.0, "step": 80 }, { "entropy": 0.84512098710984, "epoch": 0.036, "grad_norm": 100.5, "learning_rate": 1.7800000000000001e-06, "loss": 17.2456, "mean_token_accuracy": 0.7478047309443354, "num_tokens": 627804.0, "step": 90 }, { "entropy": 0.8738276792690158, "epoch": 0.04, "grad_norm": 67.0, "learning_rate": 1.98e-06, "loss": 16.6896, "mean_token_accuracy": 0.748936639726162, "num_tokens": 697556.0, "step": 100 }, { "epoch": 0.04, "eval_biology_entropy": 1.1379984815716744, "eval_biology_loss": 1.2692387104034424, "eval_biology_mean_token_accuracy": 0.6880569371581078, "eval_biology_num_tokens": 697556.0, "eval_biology_runtime": 40.5573, "eval_biology_samples_per_second": 12.328, "eval_biology_steps_per_second": 12.328, "step": 100 }, { "epoch": 0.04, "eval_math_entropy": 0.9095822716951371, "eval_math_loss": 1.0899367332458496, "eval_math_mean_token_accuracy": 0.7395683622956276, "eval_math_num_tokens": 697556.0, "eval_math_runtime": 48.0541, "eval_math_samples_per_second": 10.405, "eval_math_steps_per_second": 10.405, "step": 100 }, { "entropy": 0.9193563124164938, "epoch": 0.044, "grad_norm": 88.5, "learning_rate": 2.1800000000000003e-06, "loss": 17.3763, "mean_token_accuracy": 0.7420263335108757, "num_tokens": 765262.0, "step": 110 }, { "entropy": 0.8906641118228436, "epoch": 0.048, "grad_norm": 78.0, "learning_rate": 2.38e-06, "loss": 16.2142, "mean_token_accuracy": 0.7552934620529413, "num_tokens": 836753.0, "step": 120 }, { "entropy": 0.9178624864667654, "epoch": 0.052, "grad_norm": 70.0, "learning_rate": 2.5800000000000003e-06, "loss": 16.3594, "mean_token_accuracy": 0.7507667759433388, "num_tokens": 909947.0, "step": 130 }, { "entropy": 0.9556385111063719, "epoch": 0.056, "grad_norm": 70.0, "learning_rate": 2.7800000000000005e-06, "loss": 16.6438, "mean_token_accuracy": 0.748611630871892, "num_tokens": 981959.0, "step": 140 }, { "entropy": 0.9541467605158687, "epoch": 0.06, "grad_norm": 86.0, "learning_rate": 2.9800000000000003e-06, "loss": 16.5304, "mean_token_accuracy": 0.7486465960741043, "num_tokens": 1053624.0, "step": 150 }, { "entropy": 0.9960285406559706, "epoch": 0.064, "grad_norm": 78.0, "learning_rate": 3.1800000000000005e-06, "loss": 16.9435, "mean_token_accuracy": 0.7448480188846588, "num_tokens": 1126446.0, "step": 160 }, { "entropy": 0.9677659103646874, "epoch": 0.068, "grad_norm": 106.5, "learning_rate": 3.3800000000000007e-06, "loss": 16.1013, "mean_token_accuracy": 0.7561902595683933, "num_tokens": 1196286.0, "step": 170 }, { "entropy": 0.9270429328083992, "epoch": 0.072, "grad_norm": 62.0, "learning_rate": 3.58e-06, "loss": 15.3741, "mean_token_accuracy": 0.763100459985435, "num_tokens": 1273905.0, "step": 180 }, { "entropy": 0.9678281666710973, "epoch": 0.076, "grad_norm": 51.0, "learning_rate": 3.7800000000000002e-06, "loss": 15.8618, "mean_token_accuracy": 0.7534618094563484, "num_tokens": 1346454.0, "step": 190 }, { "entropy": 0.9695806819945574, "epoch": 0.08, "grad_norm": 61.25, "learning_rate": 3.980000000000001e-06, "loss": 15.7874, "mean_token_accuracy": 0.7560878146439791, "num_tokens": 1413596.0, "step": 200 }, { "epoch": 0.08, "eval_biology_entropy": 1.214613477051258, "eval_biology_loss": 1.2630934715270996, "eval_biology_mean_token_accuracy": 0.6885634814500808, "eval_biology_num_tokens": 1413596.0, "eval_biology_runtime": 40.4294, "eval_biology_samples_per_second": 12.367, "eval_biology_steps_per_second": 12.367, "step": 200 }, { "epoch": 0.08, "eval_math_entropy": 0.9574190213680267, "eval_math_loss": 0.9871246218681335, "eval_math_mean_token_accuracy": 0.7557044946551323, "eval_math_num_tokens": 1413596.0, "eval_math_runtime": 48.0895, "eval_math_samples_per_second": 10.397, "eval_math_steps_per_second": 10.397, "step": 200 }, { "entropy": 0.8888338362798095, "epoch": 0.084, "grad_norm": 61.25, "learning_rate": 4.18e-06, "loss": 14.623, "mean_token_accuracy": 0.7706882227212191, "num_tokens": 1485147.0, "step": 210 }, { "entropy": 0.906145141646266, "epoch": 0.088, "grad_norm": 53.5, "learning_rate": 4.38e-06, "loss": 14.8128, "mean_token_accuracy": 0.7661576565355063, "num_tokens": 1559450.0, "step": 220 }, { "entropy": 0.95110107883811, "epoch": 0.092, "grad_norm": 63.25, "learning_rate": 4.58e-06, "loss": 15.2457, "mean_token_accuracy": 0.761219710484147, "num_tokens": 1631535.0, "step": 230 }, { "entropy": 0.9411580273881555, "epoch": 0.096, "grad_norm": 51.0, "learning_rate": 4.78e-06, "loss": 14.938, "mean_token_accuracy": 0.7646324578672647, "num_tokens": 1701193.0, "step": 240 }, { "entropy": 0.9254088591784239, "epoch": 0.1, "grad_norm": 56.25, "learning_rate": 4.980000000000001e-06, "loss": 14.6029, "mean_token_accuracy": 0.7678503680974245, "num_tokens": 1773596.0, "step": 250 }, { "entropy": 0.9214777417480946, "epoch": 0.104, "grad_norm": 54.0, "learning_rate": 5.18e-06, "loss": 15.043, "mean_token_accuracy": 0.7618352189660073, "num_tokens": 1840165.0, "step": 260 }, { "entropy": 0.9363527970388532, "epoch": 0.108, "grad_norm": 111.5, "learning_rate": 5.380000000000001e-06, "loss": 15.6052, "mean_token_accuracy": 0.7564919672906398, "num_tokens": 1908149.0, "step": 270 }, { "entropy": 0.9113753678277134, "epoch": 0.112, "grad_norm": 53.0, "learning_rate": 5.580000000000001e-06, "loss": 14.4645, "mean_token_accuracy": 0.7736385434865951, "num_tokens": 1979232.0, "step": 280 }, { "entropy": 0.916975767724216, "epoch": 0.116, "grad_norm": 54.5, "learning_rate": 5.78e-06, "loss": 15.0318, "mean_token_accuracy": 0.7656373519450426, "num_tokens": 2052039.0, "step": 290 }, { "entropy": 0.915022067911923, "epoch": 0.12, "grad_norm": 46.25, "learning_rate": 5.98e-06, "loss": 14.7236, "mean_token_accuracy": 0.7691222932189703, "num_tokens": 2125815.0, "step": 300 }, { "epoch": 0.12, "eval_biology_entropy": 1.2286294117569923, "eval_biology_loss": 1.268429160118103, "eval_biology_mean_token_accuracy": 0.6861296361088752, "eval_biology_num_tokens": 2125815.0, "eval_biology_runtime": 40.4544, "eval_biology_samples_per_second": 12.36, "eval_biology_steps_per_second": 12.36, "step": 300 }, { "epoch": 0.12, "eval_math_entropy": 0.9302992832064628, "eval_math_loss": 0.9273415803909302, "eval_math_mean_token_accuracy": 0.7664831226468086, "eval_math_num_tokens": 2125815.0, "eval_math_runtime": 48.047, "eval_math_samples_per_second": 10.406, "eval_math_steps_per_second": 10.406, "step": 300 }, { "entropy": 0.9336849415674806, "epoch": 0.124, "grad_norm": 55.25, "learning_rate": 6.18e-06, "loss": 14.9674, "mean_token_accuracy": 0.7661335166543722, "num_tokens": 2201991.0, "step": 310 }, { "entropy": 0.8756070390343667, "epoch": 0.128, "grad_norm": 52.25, "learning_rate": 6.380000000000001e-06, "loss": 14.0, "mean_token_accuracy": 0.7786776568740607, "num_tokens": 2274177.0, "step": 320 }, { "entropy": 0.892756593041122, "epoch": 0.132, "grad_norm": 45.75, "learning_rate": 6.5800000000000005e-06, "loss": 14.1896, "mean_token_accuracy": 0.7731969062238931, "num_tokens": 2342173.0, "step": 330 }, { "entropy": 0.8899674592539668, "epoch": 0.136, "grad_norm": 56.5, "learning_rate": 6.780000000000001e-06, "loss": 14.2167, "mean_token_accuracy": 0.7683363553136587, "num_tokens": 2412435.0, "step": 340 }, { "entropy": 0.9237651886418462, "epoch": 0.14, "grad_norm": 49.5, "learning_rate": 6.98e-06, "loss": 14.7203, "mean_token_accuracy": 0.7684664275497198, "num_tokens": 2481105.0, "step": 350 }, { "entropy": 0.8494639024138451, "epoch": 0.144, "grad_norm": 49.0, "learning_rate": 7.180000000000001e-06, "loss": 13.6956, "mean_token_accuracy": 0.7817885402590037, "num_tokens": 2548445.0, "step": 360 }, { "entropy": 0.8341110894456506, "epoch": 0.148, "grad_norm": 44.0, "learning_rate": 7.3800000000000005e-06, "loss": 13.2223, "mean_token_accuracy": 0.789648124948144, "num_tokens": 2621364.0, "step": 370 }, { "entropy": 0.8608193326741457, "epoch": 0.152, "grad_norm": 52.25, "learning_rate": 7.58e-06, "loss": 13.8861, "mean_token_accuracy": 0.7754677496850491, "num_tokens": 2686462.0, "step": 380 }, { "entropy": 0.8447565982118249, "epoch": 0.156, "grad_norm": 50.0, "learning_rate": 7.78e-06, "loss": 13.5199, "mean_token_accuracy": 0.7812942665070295, "num_tokens": 2754241.0, "step": 390 }, { "entropy": 0.8711243279278278, "epoch": 0.16, "grad_norm": 79.0, "learning_rate": 7.980000000000002e-06, "loss": 14.1276, "mean_token_accuracy": 0.7742310840636492, "num_tokens": 2824418.0, "step": 400 }, { "epoch": 0.16, "eval_biology_entropy": 1.22758097666502, "eval_biology_loss": 1.2762199640274048, "eval_biology_mean_token_accuracy": 0.6836535354852676, "eval_biology_num_tokens": 2824418.0, "eval_biology_runtime": 40.7281, "eval_biology_samples_per_second": 12.277, "eval_biology_steps_per_second": 12.277, "step": 400 }, { "epoch": 0.16, "eval_math_entropy": 0.9047691844701767, "eval_math_loss": 0.8841754198074341, "eval_math_mean_token_accuracy": 0.775034920334816, "eval_math_num_tokens": 2824418.0, "eval_math_runtime": 47.9197, "eval_math_samples_per_second": 10.434, "eval_math_steps_per_second": 10.434, "step": 400 }, { "entropy": 0.8859184484928846, "epoch": 0.164, "grad_norm": 127.0, "learning_rate": 8.18e-06, "loss": 14.142, "mean_token_accuracy": 0.7765266574919224, "num_tokens": 2895667.0, "step": 410 }, { "entropy": 0.8501209424808621, "epoch": 0.168, "grad_norm": 40.0, "learning_rate": 8.380000000000001e-06, "loss": 13.8435, "mean_token_accuracy": 0.7774271417409182, "num_tokens": 2966031.0, "step": 420 }, { "entropy": 0.8510567891411484, "epoch": 0.172, "grad_norm": 45.75, "learning_rate": 8.580000000000001e-06, "loss": 13.3673, "mean_token_accuracy": 0.7858892437070608, "num_tokens": 3039780.0, "step": 430 }, { "entropy": 0.8355857279151678, "epoch": 0.176, "grad_norm": 44.5, "learning_rate": 8.78e-06, "loss": 13.7383, "mean_token_accuracy": 0.7784004233777523, "num_tokens": 3110313.0, "step": 440 }, { "entropy": 0.8662491450086236, "epoch": 0.18, "grad_norm": 43.75, "learning_rate": 8.98e-06, "loss": 13.6485, "mean_token_accuracy": 0.7806661173701286, "num_tokens": 3179963.0, "step": 450 }, { "entropy": 0.8596813324838877, "epoch": 0.184, "grad_norm": 46.0, "learning_rate": 9.180000000000002e-06, "loss": 13.7717, "mean_token_accuracy": 0.7796813258901238, "num_tokens": 3251139.0, "step": 460 }, { "entropy": 0.8454847941175103, "epoch": 0.188, "grad_norm": 56.25, "learning_rate": 9.38e-06, "loss": 13.4682, "mean_token_accuracy": 0.7803339377045632, "num_tokens": 3323519.0, "step": 470 }, { "entropy": 0.8228314617648721, "epoch": 0.192, "grad_norm": 49.75, "learning_rate": 9.58e-06, "loss": 13.3842, "mean_token_accuracy": 0.7827629748731851, "num_tokens": 3394170.0, "step": 480 }, { "entropy": 0.8164506537839771, "epoch": 0.196, "grad_norm": 54.25, "learning_rate": 9.780000000000001e-06, "loss": 13.1965, "mean_token_accuracy": 0.7878578577190638, "num_tokens": 3465341.0, "step": 490 }, { "entropy": 0.8838530456647277, "epoch": 0.2, "grad_norm": 51.0, "learning_rate": 9.980000000000001e-06, "loss": 13.9771, "mean_token_accuracy": 0.7734297584742308, "num_tokens": 3533571.0, "step": 500 }, { "epoch": 0.2, "eval_biology_entropy": 1.2344867270588875, "eval_biology_loss": 1.2817195653915405, "eval_biology_mean_token_accuracy": 0.6832386125922203, "eval_biology_num_tokens": 3533571.0, "eval_biology_runtime": 40.4092, "eval_biology_samples_per_second": 12.373, "eval_biology_steps_per_second": 12.373, "step": 500 }, { "epoch": 0.2, "eval_math_entropy": 0.8546719763875008, "eval_math_loss": 0.8547734618186951, "eval_math_mean_token_accuracy": 0.7804626071453095, "eval_math_num_tokens": 3533571.0, "eval_math_runtime": 48.0961, "eval_math_samples_per_second": 10.396, "eval_math_steps_per_second": 10.396, "step": 500 }, { "entropy": 0.8578155474737287, "epoch": 0.204, "grad_norm": 45.75, "learning_rate": 1.018e-05, "loss": 13.8097, "mean_token_accuracy": 0.776990968361497, "num_tokens": 3602983.0, "step": 510 }, { "entropy": 0.8306718476116657, "epoch": 0.208, "grad_norm": 50.25, "learning_rate": 1.038e-05, "loss": 13.4186, "mean_token_accuracy": 0.7863588180392981, "num_tokens": 3673600.0, "step": 520 }, { "entropy": 0.8772016573697329, "epoch": 0.212, "grad_norm": 45.75, "learning_rate": 1.0580000000000002e-05, "loss": 13.9353, "mean_token_accuracy": 0.7783012766391039, "num_tokens": 3743641.0, "step": 530 }, { "entropy": 0.8009312467649579, "epoch": 0.216, "grad_norm": 55.25, "learning_rate": 1.0780000000000002e-05, "loss": 12.9356, "mean_token_accuracy": 0.7893661856651306, "num_tokens": 3817397.0, "step": 540 }, { "entropy": 0.8427618410438299, "epoch": 0.22, "grad_norm": 51.0, "learning_rate": 1.0980000000000002e-05, "loss": 13.3831, "mean_token_accuracy": 0.7832117900252342, "num_tokens": 3886866.0, "step": 550 }, { "entropy": 0.8758323887363076, "epoch": 0.224, "grad_norm": 41.5, "learning_rate": 1.1180000000000001e-05, "loss": 13.8448, "mean_token_accuracy": 0.7745687130838632, "num_tokens": 3953732.0, "step": 560 }, { "entropy": 0.8352954359725118, "epoch": 0.228, "grad_norm": 41.75, "learning_rate": 1.138e-05, "loss": 13.4861, "mean_token_accuracy": 0.7828061260282994, "num_tokens": 4025795.0, "step": 570 }, { "entropy": 0.7947553901001811, "epoch": 0.232, "grad_norm": 43.25, "learning_rate": 1.1580000000000001e-05, "loss": 12.8366, "mean_token_accuracy": 0.7880385994911194, "num_tokens": 4098989.0, "step": 580 }, { "entropy": 0.8285046070814133, "epoch": 0.236, "grad_norm": 39.25, "learning_rate": 1.178e-05, "loss": 13.0789, "mean_token_accuracy": 0.7872951343655586, "num_tokens": 4170878.0, "step": 590 }, { "entropy": 0.7863880429416895, "epoch": 0.24, "grad_norm": 45.0, "learning_rate": 1.198e-05, "loss": 12.7479, "mean_token_accuracy": 0.7928395148366689, "num_tokens": 4243655.0, "step": 600 }, { "epoch": 0.24, "eval_biology_entropy": 1.2514844969511032, "eval_biology_loss": 1.2855682373046875, "eval_biology_mean_token_accuracy": 0.6817903207540512, "eval_biology_num_tokens": 4243655.0, "eval_biology_runtime": 40.7639, "eval_biology_samples_per_second": 12.266, "eval_biology_steps_per_second": 12.266, "step": 600 }, { "epoch": 0.24, "eval_math_entropy": 0.843876126229763, "eval_math_loss": 0.8332411646842957, "eval_math_mean_token_accuracy": 0.783966186761856, "eval_math_num_tokens": 4243655.0, "eval_math_runtime": 48.1965, "eval_math_samples_per_second": 10.374, "eval_math_steps_per_second": 10.374, "step": 600 }, { "entropy": 0.7838549023494125, "epoch": 0.244, "grad_norm": 41.75, "learning_rate": 1.218e-05, "loss": 12.4042, "mean_token_accuracy": 0.7971927735954523, "num_tokens": 4317321.0, "step": 610 }, { "entropy": 0.7797296568751335, "epoch": 0.248, "grad_norm": 51.75, "learning_rate": 1.2380000000000002e-05, "loss": 12.5895, "mean_token_accuracy": 0.7916548330336809, "num_tokens": 4387651.0, "step": 620 }, { "entropy": 0.8094759196043014, "epoch": 0.252, "grad_norm": 84.5, "learning_rate": 1.2580000000000002e-05, "loss": 12.9446, "mean_token_accuracy": 0.7892590902745724, "num_tokens": 4461909.0, "step": 630 }, { "entropy": 0.7797345662489533, "epoch": 0.256, "grad_norm": 62.0, "learning_rate": 1.2780000000000001e-05, "loss": 12.462, "mean_token_accuracy": 0.7935512393712998, "num_tokens": 4531471.0, "step": 640 }, { "entropy": 0.7911159221082926, "epoch": 0.26, "grad_norm": 40.0, "learning_rate": 1.2980000000000001e-05, "loss": 12.8515, "mean_token_accuracy": 0.7928617469966411, "num_tokens": 4600757.0, "step": 650 }, { "entropy": 0.8169831104576588, "epoch": 0.264, "grad_norm": 49.5, "learning_rate": 1.3180000000000001e-05, "loss": 13.117, "mean_token_accuracy": 0.7893466159701348, "num_tokens": 4669389.0, "step": 660 }, { "entropy": 0.8287884982302784, "epoch": 0.268, "grad_norm": 55.25, "learning_rate": 1.3380000000000002e-05, "loss": 13.3618, "mean_token_accuracy": 0.7853043630719185, "num_tokens": 4740492.0, "step": 670 }, { "entropy": 0.7938818972557783, "epoch": 0.272, "grad_norm": 42.5, "learning_rate": 1.3580000000000002e-05, "loss": 12.6092, "mean_token_accuracy": 0.7941738195717335, "num_tokens": 4810284.0, "step": 680 }, { "entropy": 0.7964617364108563, "epoch": 0.276, "grad_norm": 37.25, "learning_rate": 1.378e-05, "loss": 12.6495, "mean_token_accuracy": 0.7921258114278317, "num_tokens": 4885701.0, "step": 690 }, { "entropy": 0.7932526333257556, "epoch": 0.28, "grad_norm": 48.5, "learning_rate": 1.398e-05, "loss": 12.8488, "mean_token_accuracy": 0.7890512771904469, "num_tokens": 4955375.0, "step": 700 }, { "epoch": 0.28, "eval_biology_entropy": 1.2677810794115068, "eval_biology_loss": 1.2936662435531616, "eval_biology_mean_token_accuracy": 0.6814599596261978, "eval_biology_num_tokens": 4955375.0, "eval_biology_runtime": 40.9028, "eval_biology_samples_per_second": 12.224, "eval_biology_steps_per_second": 12.224, "step": 700 }, { "epoch": 0.28, "eval_math_entropy": 0.8274374225139618, "eval_math_loss": 0.8181362748146057, "eval_math_mean_token_accuracy": 0.7868153600692749, "eval_math_num_tokens": 4955375.0, "eval_math_runtime": 48.2074, "eval_math_samples_per_second": 10.372, "eval_math_steps_per_second": 10.372, "step": 700 }, { "entropy": 0.7877025598660111, "epoch": 0.284, "grad_norm": 39.25, "learning_rate": 1.418e-05, "loss": 12.6293, "mean_token_accuracy": 0.7944098394364119, "num_tokens": 5025638.0, "step": 710 }, { "entropy": 0.7564179511740804, "epoch": 0.288, "grad_norm": 38.75, "learning_rate": 1.4380000000000001e-05, "loss": 12.0294, "mean_token_accuracy": 0.8003660906106234, "num_tokens": 5095104.0, "step": 720 }, { "entropy": 0.7647898152470589, "epoch": 0.292, "grad_norm": 56.0, "learning_rate": 1.4580000000000001e-05, "loss": 12.4713, "mean_token_accuracy": 0.7947716873139143, "num_tokens": 5167049.0, "step": 730 }, { "entropy": 0.8119000235572458, "epoch": 0.296, "grad_norm": 36.75, "learning_rate": 1.478e-05, "loss": 13.0034, "mean_token_accuracy": 0.7876484204083681, "num_tokens": 5240988.0, "step": 740 }, { "entropy": 0.7802535065449774, "epoch": 0.3, "grad_norm": 40.5, "learning_rate": 1.498e-05, "loss": 12.4371, "mean_token_accuracy": 0.795463678240776, "num_tokens": 5312258.0, "step": 750 }, { "entropy": 0.8087782407179475, "epoch": 0.304, "grad_norm": 44.75, "learning_rate": 1.5180000000000002e-05, "loss": 12.9661, "mean_token_accuracy": 0.7857579983770847, "num_tokens": 5383732.0, "step": 760 }, { "entropy": 0.8243673056364059, "epoch": 0.308, "grad_norm": 39.5, "learning_rate": 1.5380000000000002e-05, "loss": 13.1334, "mean_token_accuracy": 0.7867084331810474, "num_tokens": 5456650.0, "step": 770 }, { "entropy": 0.7901528097689152, "epoch": 0.312, "grad_norm": 40.0, "learning_rate": 1.5580000000000003e-05, "loss": 12.6608, "mean_token_accuracy": 0.7905277730897069, "num_tokens": 5530666.0, "step": 780 }, { "entropy": 0.7515985878184438, "epoch": 0.316, "grad_norm": 44.75, "learning_rate": 1.578e-05, "loss": 11.9655, "mean_token_accuracy": 0.8011779848486185, "num_tokens": 5605269.0, "step": 790 }, { "entropy": 0.7314818985760212, "epoch": 0.32, "grad_norm": 42.5, "learning_rate": 1.5980000000000003e-05, "loss": 11.7244, "mean_token_accuracy": 0.8059452049434185, "num_tokens": 5676334.0, "step": 800 }, { "epoch": 0.32, "eval_biology_entropy": 1.2663928886651994, "eval_biology_loss": 1.2994052171707153, "eval_biology_mean_token_accuracy": 0.6803199350833893, "eval_biology_num_tokens": 5676334.0, "eval_biology_runtime": 40.7632, "eval_biology_samples_per_second": 12.266, "eval_biology_steps_per_second": 12.266, "step": 800 }, { "epoch": 0.32, "eval_math_entropy": 0.7805578476190567, "eval_math_loss": 0.8004927635192871, "eval_math_mean_token_accuracy": 0.7903167794942856, "eval_math_num_tokens": 5676334.0, "eval_math_runtime": 48.1783, "eval_math_samples_per_second": 10.378, "eval_math_steps_per_second": 10.378, "step": 800 }, { "entropy": 0.7661854576319456, "epoch": 0.324, "grad_norm": 40.5, "learning_rate": 1.618e-05, "loss": 12.5767, "mean_token_accuracy": 0.7944763712584972, "num_tokens": 5747779.0, "step": 810 }, { "entropy": 0.803387301787734, "epoch": 0.328, "grad_norm": 42.0, "learning_rate": 1.638e-05, "loss": 12.825, "mean_token_accuracy": 0.7907922245562077, "num_tokens": 5816488.0, "step": 820 }, { "entropy": 0.8049221463501454, "epoch": 0.332, "grad_norm": 44.0, "learning_rate": 1.658e-05, "loss": 12.9669, "mean_token_accuracy": 0.7854703675955534, "num_tokens": 5887180.0, "step": 830 }, { "entropy": 0.8102014064788818, "epoch": 0.336, "grad_norm": 38.5, "learning_rate": 1.6780000000000002e-05, "loss": 12.9814, "mean_token_accuracy": 0.7886964596807957, "num_tokens": 5958480.0, "step": 840 }, { "entropy": 0.7344874950125814, "epoch": 0.34, "grad_norm": 50.25, "learning_rate": 1.698e-05, "loss": 11.7609, "mean_token_accuracy": 0.8062219381332397, "num_tokens": 6031020.0, "step": 850 }, { "entropy": 0.802255941182375, "epoch": 0.344, "grad_norm": 39.0, "learning_rate": 1.718e-05, "loss": 12.8439, "mean_token_accuracy": 0.7877463910728693, "num_tokens": 6102081.0, "step": 860 }, { "entropy": 0.7980229431763292, "epoch": 0.348, "grad_norm": 36.25, "learning_rate": 1.7380000000000003e-05, "loss": 12.7774, "mean_token_accuracy": 0.7900870595127344, "num_tokens": 6172871.0, "step": 870 }, { "entropy": 0.7401213280856609, "epoch": 0.352, "grad_norm": 40.75, "learning_rate": 1.758e-05, "loss": 11.9409, "mean_token_accuracy": 0.801040057092905, "num_tokens": 6242161.0, "step": 880 }, { "entropy": 0.8095649138092995, "epoch": 0.356, "grad_norm": 46.5, "learning_rate": 1.7780000000000003e-05, "loss": 12.9134, "mean_token_accuracy": 0.7872339155524969, "num_tokens": 6312187.0, "step": 890 }, { "entropy": 0.7853369496762752, "epoch": 0.36, "grad_norm": 37.75, "learning_rate": 1.798e-05, "loss": 12.4062, "mean_token_accuracy": 0.7960268270224333, "num_tokens": 6377327.0, "step": 900 }, { "epoch": 0.36, "eval_biology_entropy": 1.250610340833664, "eval_biology_loss": 1.3047173023223877, "eval_biology_mean_token_accuracy": 0.6794998321533203, "eval_biology_num_tokens": 6377327.0, "eval_biology_runtime": 40.5105, "eval_biology_samples_per_second": 12.342, "eval_biology_steps_per_second": 12.342, "step": 900 }, { "epoch": 0.36, "eval_math_entropy": 0.773322255730629, "eval_math_loss": 0.789590060710907, "eval_math_mean_token_accuracy": 0.7914780566692352, "eval_math_num_tokens": 6377327.0, "eval_math_runtime": 48.04, "eval_math_samples_per_second": 10.408, "eval_math_steps_per_second": 10.408, "step": 900 }, { "entropy": 0.763472201116383, "epoch": 0.364, "grad_norm": 41.5, "learning_rate": 1.8180000000000002e-05, "loss": 12.3288, "mean_token_accuracy": 0.7959023587405681, "num_tokens": 6453254.0, "step": 910 }, { "entropy": 0.7712945869192481, "epoch": 0.368, "grad_norm": 40.25, "learning_rate": 1.8380000000000004e-05, "loss": 12.2854, "mean_token_accuracy": 0.795868669450283, "num_tokens": 6523679.0, "step": 920 }, { "entropy": 0.7641221083700657, "epoch": 0.372, "grad_norm": 37.25, "learning_rate": 1.858e-05, "loss": 12.2867, "mean_token_accuracy": 0.7949323236942292, "num_tokens": 6594651.0, "step": 930 }, { "entropy": 0.7518373979255557, "epoch": 0.376, "grad_norm": 42.0, "learning_rate": 1.878e-05, "loss": 12.1179, "mean_token_accuracy": 0.798585458099842, "num_tokens": 6669278.0, "step": 940 }, { "entropy": 0.7543134400621057, "epoch": 0.38, "grad_norm": 33.75, "learning_rate": 1.898e-05, "loss": 12.023, "mean_token_accuracy": 0.7991646058857441, "num_tokens": 6740491.0, "step": 950 }, { "entropy": 0.7203706834465265, "epoch": 0.384, "grad_norm": 35.5, "learning_rate": 1.918e-05, "loss": 11.6634, "mean_token_accuracy": 0.8036353968083858, "num_tokens": 6810978.0, "step": 960 }, { "entropy": 0.7761434537358582, "epoch": 0.388, "grad_norm": 39.5, "learning_rate": 1.938e-05, "loss": 12.4387, "mean_token_accuracy": 0.7944561909884215, "num_tokens": 6882077.0, "step": 970 }, { "entropy": 0.7785217678174376, "epoch": 0.392, "grad_norm": 33.75, "learning_rate": 1.9580000000000002e-05, "loss": 12.607, "mean_token_accuracy": 0.7944069147109986, "num_tokens": 6953929.0, "step": 980 }, { "entropy": 0.8195465696975589, "epoch": 0.396, "grad_norm": 65.5, "learning_rate": 1.978e-05, "loss": 13.2202, "mean_token_accuracy": 0.7838506288826466, "num_tokens": 7023835.0, "step": 990 }, { "entropy": 0.8217529645189643, "epoch": 0.4, "grad_norm": 39.75, "learning_rate": 1.9980000000000002e-05, "loss": 13.1698, "mean_token_accuracy": 0.7848756689578295, "num_tokens": 7096903.0, "step": 1000 }, { "epoch": 0.4, "eval_biology_entropy": 1.266352553486824, "eval_biology_loss": 1.312301516532898, "eval_biology_mean_token_accuracy": 0.6775026069879532, "eval_biology_num_tokens": 7096903.0, "eval_biology_runtime": 40.7343, "eval_biology_samples_per_second": 12.275, "eval_biology_steps_per_second": 12.275, "step": 1000 }, { "epoch": 0.4, "eval_math_entropy": 0.7719556384682655, "eval_math_loss": 0.7821868062019348, "eval_math_mean_token_accuracy": 0.7932658619880676, "eval_math_num_tokens": 7096903.0, "eval_math_runtime": 47.8946, "eval_math_samples_per_second": 10.44, "eval_math_steps_per_second": 10.44, "step": 1000 }, { "entropy": 0.7681734091602266, "epoch": 0.404, "grad_norm": 50.25, "learning_rate": 1.9980000000000002e-05, "loss": 12.3128, "mean_token_accuracy": 0.7956426542252302, "num_tokens": 7165970.0, "step": 1010 }, { "entropy": 0.7375578137114644, "epoch": 0.408, "grad_norm": 39.5, "learning_rate": 1.995777777777778e-05, "loss": 12.0172, "mean_token_accuracy": 0.7997171815484763, "num_tokens": 7239071.0, "step": 1020 }, { "entropy": 0.7803538812324404, "epoch": 0.412, "grad_norm": 35.75, "learning_rate": 1.9935555555555557e-05, "loss": 12.3805, "mean_token_accuracy": 0.7939646884799003, "num_tokens": 7308430.0, "step": 1030 }, { "entropy": 0.7741465013474226, "epoch": 0.416, "grad_norm": 35.5, "learning_rate": 1.9913333333333335e-05, "loss": 12.3671, "mean_token_accuracy": 0.7934583045542241, "num_tokens": 7377181.0, "step": 1040 }, { "entropy": 0.7638113724067808, "epoch": 0.42, "grad_norm": 90.5, "learning_rate": 1.9891111111111112e-05, "loss": 12.3505, "mean_token_accuracy": 0.7913536671549082, "num_tokens": 7443045.0, "step": 1050 }, { "entropy": 0.763846804574132, "epoch": 0.424, "grad_norm": 30.5, "learning_rate": 1.986888888888889e-05, "loss": 12.2045, "mean_token_accuracy": 0.7992881424725056, "num_tokens": 7513035.0, "step": 1060 }, { "entropy": 0.7383330492302775, "epoch": 0.428, "grad_norm": 34.0, "learning_rate": 1.9846666666666668e-05, "loss": 11.9274, "mean_token_accuracy": 0.80244723521173, "num_tokens": 7580676.0, "step": 1070 }, { "entropy": 0.7183827603235841, "epoch": 0.432, "grad_norm": 37.25, "learning_rate": 1.9824444444444445e-05, "loss": 11.5217, "mean_token_accuracy": 0.8065667234361171, "num_tokens": 7650523.0, "step": 1080 }, { "entropy": 0.784936741553247, "epoch": 0.436, "grad_norm": 60.5, "learning_rate": 1.9802222222222226e-05, "loss": 12.7959, "mean_token_accuracy": 0.7918864544481039, "num_tokens": 7725087.0, "step": 1090 }, { "entropy": 0.7581901378929615, "epoch": 0.44, "grad_norm": 35.75, "learning_rate": 1.978e-05, "loss": 12.0708, "mean_token_accuracy": 0.7991419725120068, "num_tokens": 7794006.0, "step": 1100 }, { "epoch": 0.44, "eval_biology_entropy": 1.3096306604146957, "eval_biology_loss": 1.3199082612991333, "eval_biology_mean_token_accuracy": 0.6769449328184127, "eval_biology_num_tokens": 7794006.0, "eval_biology_runtime": 41.8088, "eval_biology_samples_per_second": 11.959, "eval_biology_steps_per_second": 11.959, "step": 1100 }, { "epoch": 0.44, "eval_math_entropy": 0.7593404217660427, "eval_math_loss": 0.7711546421051025, "eval_math_mean_token_accuracy": 0.7947946254014969, "eval_math_num_tokens": 7794006.0, "eval_math_runtime": 48.5633, "eval_math_samples_per_second": 10.296, "eval_math_steps_per_second": 10.296, "step": 1100 }, { "entropy": 0.7571122448891401, "epoch": 0.444, "grad_norm": 35.0, "learning_rate": 1.975777777777778e-05, "loss": 12.1206, "mean_token_accuracy": 0.7960348557680845, "num_tokens": 7869588.0, "step": 1110 }, { "entropy": 0.7115281330421567, "epoch": 0.448, "grad_norm": 39.75, "learning_rate": 1.9735555555555556e-05, "loss": 11.603, "mean_token_accuracy": 0.8045209631323814, "num_tokens": 7936788.0, "step": 1120 }, { "entropy": 0.7751916225999593, "epoch": 0.452, "grad_norm": 44.5, "learning_rate": 1.9713333333333337e-05, "loss": 12.2553, "mean_token_accuracy": 0.7963129185140133, "num_tokens": 8007062.0, "step": 1130 }, { "entropy": 0.7358326924964785, "epoch": 0.456, "grad_norm": 35.75, "learning_rate": 1.969111111111111e-05, "loss": 11.8682, "mean_token_accuracy": 0.8016852129250764, "num_tokens": 8081035.0, "step": 1140 }, { "entropy": 0.748681915551424, "epoch": 0.46, "grad_norm": 38.75, "learning_rate": 1.9668888888888892e-05, "loss": 12.0865, "mean_token_accuracy": 0.8016125675290823, "num_tokens": 8152005.0, "step": 1150 }, { "entropy": 0.7732457166537643, "epoch": 0.464, "grad_norm": 30.625, "learning_rate": 1.9646666666666666e-05, "loss": 12.3834, "mean_token_accuracy": 0.7960082966834306, "num_tokens": 8223147.0, "step": 1160 }, { "entropy": 0.7652292996644974, "epoch": 0.468, "grad_norm": 47.0, "learning_rate": 1.9624444444444447e-05, "loss": 12.5645, "mean_token_accuracy": 0.7940644320100546, "num_tokens": 8296261.0, "step": 1170 }, { "entropy": 0.7421515263617039, "epoch": 0.472, "grad_norm": 45.75, "learning_rate": 1.9602222222222225e-05, "loss": 11.76, "mean_token_accuracy": 0.8024631194770336, "num_tokens": 8367380.0, "step": 1180 }, { "entropy": 0.74504639133811, "epoch": 0.476, "grad_norm": 40.25, "learning_rate": 1.9580000000000002e-05, "loss": 11.8789, "mean_token_accuracy": 0.798084694892168, "num_tokens": 8438675.0, "step": 1190 }, { "entropy": 0.7464414104819298, "epoch": 0.48, "grad_norm": 34.75, "learning_rate": 1.955777777777778e-05, "loss": 12.21, "mean_token_accuracy": 0.7978132009506226, "num_tokens": 8506030.0, "step": 1200 }, { "epoch": 0.48, "eval_biology_entropy": 1.3110252878069877, "eval_biology_loss": 1.324716567993164, "eval_biology_mean_token_accuracy": 0.6763759871721268, "eval_biology_num_tokens": 8506030.0, "eval_biology_runtime": 67.8224, "eval_biology_samples_per_second": 7.372, "eval_biology_steps_per_second": 7.372, "step": 1200 }, { "epoch": 0.48, "eval_math_entropy": 0.7597530280947685, "eval_math_loss": 0.7603512406349182, "eval_math_mean_token_accuracy": 0.7970666073560715, "eval_math_num_tokens": 8506030.0, "eval_math_runtime": 49.7375, "eval_math_samples_per_second": 10.053, "eval_math_steps_per_second": 10.053, "step": 1200 }, { "entropy": 0.7492084631696343, "epoch": 0.484, "grad_norm": 35.25, "learning_rate": 1.9535555555555557e-05, "loss": 11.91, "mean_token_accuracy": 0.8018037732690573, "num_tokens": 8576915.0, "step": 1210 }, { "entropy": 0.7045143865048885, "epoch": 0.488, "grad_norm": 33.5, "learning_rate": 1.9513333333333335e-05, "loss": 11.3318, "mean_token_accuracy": 0.8072403989732265, "num_tokens": 8646384.0, "step": 1220 }, { "entropy": 0.7107147205621004, "epoch": 0.492, "grad_norm": 34.5, "learning_rate": 1.9491111111111113e-05, "loss": 11.4898, "mean_token_accuracy": 0.8078423041850329, "num_tokens": 8718547.0, "step": 1230 }, { "entropy": 0.7382461177185178, "epoch": 0.496, "grad_norm": 34.0, "learning_rate": 1.946888888888889e-05, "loss": 11.8211, "mean_token_accuracy": 0.8013716965913773, "num_tokens": 8788726.0, "step": 1240 }, { "entropy": 0.7180610377341509, "epoch": 0.5, "grad_norm": 32.0, "learning_rate": 1.9446666666666668e-05, "loss": 11.6359, "mean_token_accuracy": 0.8073719006031752, "num_tokens": 8861341.0, "step": 1250 }, { "entropy": 0.7594908589497209, "epoch": 0.504, "grad_norm": 33.25, "learning_rate": 1.9424444444444446e-05, "loss": 12.1494, "mean_token_accuracy": 0.7983963489532471, "num_tokens": 8934959.0, "step": 1260 }, { "entropy": 0.6865442413836718, "epoch": 0.508, "grad_norm": 47.0, "learning_rate": 1.9402222222222223e-05, "loss": 11.0336, "mean_token_accuracy": 0.8123888235539198, "num_tokens": 9008794.0, "step": 1270 }, { "entropy": 0.736488389596343, "epoch": 0.512, "grad_norm": 36.0, "learning_rate": 1.938e-05, "loss": 11.8236, "mean_token_accuracy": 0.80280033480376, "num_tokens": 9078039.0, "step": 1280 }, { "entropy": 0.7221866660751403, "epoch": 0.516, "grad_norm": 38.5, "learning_rate": 1.935777777777778e-05, "loss": 11.5549, "mean_token_accuracy": 0.8055313553661108, "num_tokens": 9148271.0, "step": 1290 }, { "entropy": 0.7004529038444162, "epoch": 0.52, "grad_norm": 35.5, "learning_rate": 1.9335555555555556e-05, "loss": 11.4288, "mean_token_accuracy": 0.8069275438785553, "num_tokens": 9220298.0, "step": 1300 }, { "epoch": 0.52, "eval_biology_entropy": 1.2943904809951783, "eval_biology_loss": 1.3324984312057495, "eval_biology_mean_token_accuracy": 0.67458989828825, "eval_biology_num_tokens": 9220298.0, "eval_biology_runtime": 40.8416, "eval_biology_samples_per_second": 12.242, "eval_biology_steps_per_second": 12.242, "step": 1300 }, { "epoch": 0.52, "eval_math_entropy": 0.7494015108048916, "eval_math_loss": 0.7516281604766846, "eval_math_mean_token_accuracy": 0.799598550081253, "eval_math_num_tokens": 9220298.0, "eval_math_runtime": 48.3706, "eval_math_samples_per_second": 10.337, "eval_math_steps_per_second": 10.337, "step": 1300 }, { "entropy": 0.7174371790140868, "epoch": 0.524, "grad_norm": 28.375, "learning_rate": 1.9313333333333334e-05, "loss": 11.5, "mean_token_accuracy": 0.8083903137594461, "num_tokens": 9299112.0, "step": 1310 }, { "entropy": 0.7080350153148174, "epoch": 0.528, "grad_norm": 30.75, "learning_rate": 1.9291111111111115e-05, "loss": 11.26, "mean_token_accuracy": 0.8113009247928857, "num_tokens": 9373860.0, "step": 1320 }, { "entropy": 0.7306012082844973, "epoch": 0.532, "grad_norm": 44.5, "learning_rate": 1.926888888888889e-05, "loss": 11.8475, "mean_token_accuracy": 0.8006449308246374, "num_tokens": 9446235.0, "step": 1330 }, { "entropy": 0.7229157039895654, "epoch": 0.536, "grad_norm": 37.25, "learning_rate": 1.924666666666667e-05, "loss": 11.6284, "mean_token_accuracy": 0.8080755949020386, "num_tokens": 9515962.0, "step": 1340 }, { "entropy": 0.7307992668822407, "epoch": 0.54, "grad_norm": 41.25, "learning_rate": 1.9224444444444444e-05, "loss": 11.6765, "mean_token_accuracy": 0.8042573977261782, "num_tokens": 9587715.0, "step": 1350 }, { "entropy": 0.7034009215421975, "epoch": 0.544, "grad_norm": 35.75, "learning_rate": 1.9202222222222225e-05, "loss": 11.7562, "mean_token_accuracy": 0.8090484531596303, "num_tokens": 9660940.0, "step": 1360 }, { "entropy": 0.7848182378336788, "epoch": 0.548, "grad_norm": 36.25, "learning_rate": 1.918e-05, "loss": 12.6939, "mean_token_accuracy": 0.7890455313026905, "num_tokens": 9727806.0, "step": 1370 }, { "entropy": 0.7604333622381091, "epoch": 0.552, "grad_norm": 39.75, "learning_rate": 1.915777777777778e-05, "loss": 12.152, "mean_token_accuracy": 0.7959740109741688, "num_tokens": 9793170.0, "step": 1380 }, { "entropy": 0.7220884849317372, "epoch": 0.556, "grad_norm": 37.0, "learning_rate": 1.9135555555555555e-05, "loss": 11.6033, "mean_token_accuracy": 0.8036061096936464, "num_tokens": 9860484.0, "step": 1390 }, { "entropy": 0.704200503975153, "epoch": 0.56, "grad_norm": 29.375, "learning_rate": 1.9113333333333336e-05, "loss": 11.1518, "mean_token_accuracy": 0.8113750666379929, "num_tokens": 9932302.0, "step": 1400 }, { "epoch": 0.56, "eval_biology_entropy": 1.2828473488092422, "eval_biology_loss": 1.3331190347671509, "eval_biology_mean_token_accuracy": 0.6747415412068367, "eval_biology_num_tokens": 9932302.0, "eval_biology_runtime": 45.9485, "eval_biology_samples_per_second": 10.882, "eval_biology_steps_per_second": 10.882, "step": 1400 }, { "epoch": 0.56, "eval_math_entropy": 0.7451034058630467, "eval_math_loss": 0.7451240420341492, "eval_math_mean_token_accuracy": 0.8007836003303528, "eval_math_num_tokens": 9932302.0, "eval_math_runtime": 56.801, "eval_math_samples_per_second": 8.803, "eval_math_steps_per_second": 8.803, "step": 1400 }, { "entropy": 0.7115965198725462, "epoch": 0.564, "grad_norm": 34.25, "learning_rate": 1.9091111111111113e-05, "loss": 11.5991, "mean_token_accuracy": 0.8047812730073929, "num_tokens": 10001923.0, "step": 1410 }, { "entropy": 0.7081295600160956, "epoch": 0.568, "grad_norm": 32.75, "learning_rate": 1.906888888888889e-05, "loss": 11.2206, "mean_token_accuracy": 0.8095470562577247, "num_tokens": 10072362.0, "step": 1420 }, { "entropy": 0.7141612969338894, "epoch": 0.572, "grad_norm": 30.375, "learning_rate": 1.904666666666667e-05, "loss": 11.6816, "mean_token_accuracy": 0.8019938107579947, "num_tokens": 10140236.0, "step": 1430 }, { "entropy": 0.7410996645689011, "epoch": 0.576, "grad_norm": 32.5, "learning_rate": 1.9024444444444446e-05, "loss": 11.7841, "mean_token_accuracy": 0.8034846507012844, "num_tokens": 10215462.0, "step": 1440 }, { "entropy": 0.7485123924911022, "epoch": 0.58, "grad_norm": 43.25, "learning_rate": 1.9002222222222224e-05, "loss": 11.9787, "mean_token_accuracy": 0.798300775885582, "num_tokens": 10286386.0, "step": 1450 }, { "entropy": 0.7154224492609501, "epoch": 0.584, "grad_norm": 32.5, "learning_rate": 1.898e-05, "loss": 11.5298, "mean_token_accuracy": 0.8057492181658745, "num_tokens": 10354952.0, "step": 1460 }, { "entropy": 0.7272289376705885, "epoch": 0.588, "grad_norm": 40.25, "learning_rate": 1.895777777777778e-05, "loss": 11.652, "mean_token_accuracy": 0.8025094602257014, "num_tokens": 10425215.0, "step": 1470 }, { "entropy": 0.7166844455525279, "epoch": 0.592, "grad_norm": 30.625, "learning_rate": 1.8935555555555556e-05, "loss": 11.4116, "mean_token_accuracy": 0.8082372698932886, "num_tokens": 10504396.0, "step": 1480 }, { "entropy": 0.7413169213570654, "epoch": 0.596, "grad_norm": 32.0, "learning_rate": 1.8913333333333334e-05, "loss": 11.959, "mean_token_accuracy": 0.7982309013605118, "num_tokens": 10576150.0, "step": 1490 }, { "entropy": 0.7089840611442924, "epoch": 0.6, "grad_norm": 35.0, "learning_rate": 1.8891111111111115e-05, "loss": 11.6029, "mean_token_accuracy": 0.8050658291205763, "num_tokens": 10644641.0, "step": 1500 }, { "epoch": 0.6, "eval_biology_entropy": 1.301250998735428, "eval_biology_loss": 1.3366436958312988, "eval_biology_mean_token_accuracy": 0.6739367877840996, "eval_biology_num_tokens": 10644641.0, "eval_biology_runtime": 40.559, "eval_biology_samples_per_second": 12.328, "eval_biology_steps_per_second": 12.328, "step": 1500 }, { "epoch": 0.6, "eval_math_entropy": 0.7238052343726158, "eval_math_loss": 0.7381152510643005, "eval_math_mean_token_accuracy": 0.8017449014186859, "eval_math_num_tokens": 10644641.0, "eval_math_runtime": 48.6353, "eval_math_samples_per_second": 10.281, "eval_math_steps_per_second": 10.281, "step": 1500 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.13769657469834e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }