| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7581225659698247, |
| "epoch": 0.004, |
| "grad_norm": 101.5, |
| "learning_rate": 1.8e-07, |
| "loss": 18.1141, |
| "mean_token_accuracy": 0.7393306585028767, |
| "num_tokens": 72542.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.7520309612154961, |
| "epoch": 0.008, |
| "grad_norm": 162.0, |
| "learning_rate": 3.8e-07, |
| "loss": 18.2441, |
| "mean_token_accuracy": 0.7392390985041857, |
| "num_tokens": 138146.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.741816989146173, |
| "epoch": 0.012, |
| "grad_norm": 92.5, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 17.8539, |
| "mean_token_accuracy": 0.7405363619327545, |
| "num_tokens": 208409.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.7636421315371991, |
| "epoch": 0.016, |
| "grad_norm": 88.0, |
| "learning_rate": 7.8e-07, |
| "loss": 18.142, |
| "mean_token_accuracy": 0.7400334309786558, |
| "num_tokens": 280941.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.7905357209965587, |
| "epoch": 0.02, |
| "grad_norm": 108.0, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 18.1041, |
| "mean_token_accuracy": 0.7354144401848316, |
| "num_tokens": 349731.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.8223027169704438, |
| "epoch": 0.024, |
| "grad_norm": 112.5, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 18.4599, |
| "mean_token_accuracy": 0.731158659607172, |
| "num_tokens": 419127.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.7908559931442142, |
| "epoch": 0.028, |
| "grad_norm": 106.0, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 17.5053, |
| "mean_token_accuracy": 0.7440328601747751, |
| "num_tokens": 489016.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8229548260569572, |
| "epoch": 0.032, |
| "grad_norm": 86.0, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 17.3714, |
| "mean_token_accuracy": 0.7438380591571331, |
| "num_tokens": 558056.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.84512098710984, |
| "epoch": 0.036, |
| "grad_norm": 100.5, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 17.2456, |
| "mean_token_accuracy": 0.7478047309443354, |
| "num_tokens": 627804.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8738276792690158, |
| "epoch": 0.04, |
| "grad_norm": 67.0, |
| "learning_rate": 1.98e-06, |
| "loss": 16.6896, |
| "mean_token_accuracy": 0.748936639726162, |
| "num_tokens": 697556.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_biology_entropy": 1.1379984815716744, |
| "eval_biology_loss": 1.2692387104034424, |
| "eval_biology_mean_token_accuracy": 0.6880569371581078, |
| "eval_biology_num_tokens": 697556.0, |
| "eval_biology_runtime": 40.5573, |
| "eval_biology_samples_per_second": 12.328, |
| "eval_biology_steps_per_second": 12.328, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_math_entropy": 0.9095822716951371, |
| "eval_math_loss": 1.0899367332458496, |
| "eval_math_mean_token_accuracy": 0.7395683622956276, |
| "eval_math_num_tokens": 697556.0, |
| "eval_math_runtime": 48.0541, |
| "eval_math_samples_per_second": 10.405, |
| "eval_math_steps_per_second": 10.405, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.9193563124164938, |
| "epoch": 0.044, |
| "grad_norm": 88.5, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 17.3763, |
| "mean_token_accuracy": 0.7420263335108757, |
| "num_tokens": 765262.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8906641118228436, |
| "epoch": 0.048, |
| "grad_norm": 78.0, |
| "learning_rate": 2.38e-06, |
| "loss": 16.2142, |
| "mean_token_accuracy": 0.7552934620529413, |
| "num_tokens": 836753.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9178624864667654, |
| "epoch": 0.052, |
| "grad_norm": 70.0, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 16.3594, |
| "mean_token_accuracy": 0.7507667759433388, |
| "num_tokens": 909947.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.9556385111063719, |
| "epoch": 0.056, |
| "grad_norm": 70.0, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 16.6438, |
| "mean_token_accuracy": 0.748611630871892, |
| "num_tokens": 981959.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.9541467605158687, |
| "epoch": 0.06, |
| "grad_norm": 86.0, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 16.5304, |
| "mean_token_accuracy": 0.7486465960741043, |
| "num_tokens": 1053624.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9960285406559706, |
| "epoch": 0.064, |
| "grad_norm": 78.0, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 16.9435, |
| "mean_token_accuracy": 0.7448480188846588, |
| "num_tokens": 1126446.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.9677659103646874, |
| "epoch": 0.068, |
| "grad_norm": 106.5, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 16.1013, |
| "mean_token_accuracy": 0.7561902595683933, |
| "num_tokens": 1196286.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9270429328083992, |
| "epoch": 0.072, |
| "grad_norm": 62.0, |
| "learning_rate": 3.58e-06, |
| "loss": 15.3741, |
| "mean_token_accuracy": 0.763100459985435, |
| "num_tokens": 1273905.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.9678281666710973, |
| "epoch": 0.076, |
| "grad_norm": 51.0, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 15.8618, |
| "mean_token_accuracy": 0.7534618094563484, |
| "num_tokens": 1346454.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.9695806819945574, |
| "epoch": 0.08, |
| "grad_norm": 61.25, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 15.7874, |
| "mean_token_accuracy": 0.7560878146439791, |
| "num_tokens": 1413596.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_biology_entropy": 1.214613477051258, |
| "eval_biology_loss": 1.2630934715270996, |
| "eval_biology_mean_token_accuracy": 0.6885634814500808, |
| "eval_biology_num_tokens": 1413596.0, |
| "eval_biology_runtime": 40.4294, |
| "eval_biology_samples_per_second": 12.367, |
| "eval_biology_steps_per_second": 12.367, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_math_entropy": 0.9574190213680267, |
| "eval_math_loss": 0.9871246218681335, |
| "eval_math_mean_token_accuracy": 0.7557044946551323, |
| "eval_math_num_tokens": 1413596.0, |
| "eval_math_runtime": 48.0895, |
| "eval_math_samples_per_second": 10.397, |
| "eval_math_steps_per_second": 10.397, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.8888338362798095, |
| "epoch": 0.084, |
| "grad_norm": 61.25, |
| "learning_rate": 4.18e-06, |
| "loss": 14.623, |
| "mean_token_accuracy": 0.7706882227212191, |
| "num_tokens": 1485147.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.906145141646266, |
| "epoch": 0.088, |
| "grad_norm": 53.5, |
| "learning_rate": 4.38e-06, |
| "loss": 14.8128, |
| "mean_token_accuracy": 0.7661576565355063, |
| "num_tokens": 1559450.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.95110107883811, |
| "epoch": 0.092, |
| "grad_norm": 63.25, |
| "learning_rate": 4.58e-06, |
| "loss": 15.2457, |
| "mean_token_accuracy": 0.761219710484147, |
| "num_tokens": 1631535.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.9411580273881555, |
| "epoch": 0.096, |
| "grad_norm": 51.0, |
| "learning_rate": 4.78e-06, |
| "loss": 14.938, |
| "mean_token_accuracy": 0.7646324578672647, |
| "num_tokens": 1701193.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.9254088591784239, |
| "epoch": 0.1, |
| "grad_norm": 56.25, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 14.6029, |
| "mean_token_accuracy": 0.7678503680974245, |
| "num_tokens": 1773596.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.9214777417480946, |
| "epoch": 0.104, |
| "grad_norm": 54.0, |
| "learning_rate": 5.18e-06, |
| "loss": 15.043, |
| "mean_token_accuracy": 0.7618352189660073, |
| "num_tokens": 1840165.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.9363527970388532, |
| "epoch": 0.108, |
| "grad_norm": 111.5, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 15.6052, |
| "mean_token_accuracy": 0.7564919672906398, |
| "num_tokens": 1908149.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.9113753678277134, |
| "epoch": 0.112, |
| "grad_norm": 53.0, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 14.4645, |
| "mean_token_accuracy": 0.7736385434865951, |
| "num_tokens": 1979232.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.916975767724216, |
| "epoch": 0.116, |
| "grad_norm": 54.5, |
| "learning_rate": 5.78e-06, |
| "loss": 15.0318, |
| "mean_token_accuracy": 0.7656373519450426, |
| "num_tokens": 2052039.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.915022067911923, |
| "epoch": 0.12, |
| "grad_norm": 46.25, |
| "learning_rate": 5.98e-06, |
| "loss": 14.7236, |
| "mean_token_accuracy": 0.7691222932189703, |
| "num_tokens": 2125815.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_biology_entropy": 1.2286294117569923, |
| "eval_biology_loss": 1.268429160118103, |
| "eval_biology_mean_token_accuracy": 0.6861296361088752, |
| "eval_biology_num_tokens": 2125815.0, |
| "eval_biology_runtime": 40.4544, |
| "eval_biology_samples_per_second": 12.36, |
| "eval_biology_steps_per_second": 12.36, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_math_entropy": 0.9302992832064628, |
| "eval_math_loss": 0.9273415803909302, |
| "eval_math_mean_token_accuracy": 0.7664831226468086, |
| "eval_math_num_tokens": 2125815.0, |
| "eval_math_runtime": 48.047, |
| "eval_math_samples_per_second": 10.406, |
| "eval_math_steps_per_second": 10.406, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9336849415674806, |
| "epoch": 0.124, |
| "grad_norm": 55.25, |
| "learning_rate": 6.18e-06, |
| "loss": 14.9674, |
| "mean_token_accuracy": 0.7661335166543722, |
| "num_tokens": 2201991.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.8756070390343667, |
| "epoch": 0.128, |
| "grad_norm": 52.25, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 14.0, |
| "mean_token_accuracy": 0.7786776568740607, |
| "num_tokens": 2274177.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.892756593041122, |
| "epoch": 0.132, |
| "grad_norm": 45.75, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 14.1896, |
| "mean_token_accuracy": 0.7731969062238931, |
| "num_tokens": 2342173.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.8899674592539668, |
| "epoch": 0.136, |
| "grad_norm": 56.5, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 14.2167, |
| "mean_token_accuracy": 0.7683363553136587, |
| "num_tokens": 2412435.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.9237651886418462, |
| "epoch": 0.14, |
| "grad_norm": 49.5, |
| "learning_rate": 6.98e-06, |
| "loss": 14.7203, |
| "mean_token_accuracy": 0.7684664275497198, |
| "num_tokens": 2481105.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.8494639024138451, |
| "epoch": 0.144, |
| "grad_norm": 49.0, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 13.6956, |
| "mean_token_accuracy": 0.7817885402590037, |
| "num_tokens": 2548445.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8341110894456506, |
| "epoch": 0.148, |
| "grad_norm": 44.0, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 13.2223, |
| "mean_token_accuracy": 0.789648124948144, |
| "num_tokens": 2621364.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.8608193326741457, |
| "epoch": 0.152, |
| "grad_norm": 52.25, |
| "learning_rate": 7.58e-06, |
| "loss": 13.8861, |
| "mean_token_accuracy": 0.7754677496850491, |
| "num_tokens": 2686462.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.8447565982118249, |
| "epoch": 0.156, |
| "grad_norm": 50.0, |
| "learning_rate": 7.78e-06, |
| "loss": 13.5199, |
| "mean_token_accuracy": 0.7812942665070295, |
| "num_tokens": 2754241.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.8711243279278278, |
| "epoch": 0.16, |
| "grad_norm": 79.0, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 14.1276, |
| "mean_token_accuracy": 0.7742310840636492, |
| "num_tokens": 2824418.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_biology_entropy": 1.22758097666502, |
| "eval_biology_loss": 1.2762199640274048, |
| "eval_biology_mean_token_accuracy": 0.6836535354852676, |
| "eval_biology_num_tokens": 2824418.0, |
| "eval_biology_runtime": 40.7281, |
| "eval_biology_samples_per_second": 12.277, |
| "eval_biology_steps_per_second": 12.277, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_math_entropy": 0.9047691844701767, |
| "eval_math_loss": 0.8841754198074341, |
| "eval_math_mean_token_accuracy": 0.775034920334816, |
| "eval_math_num_tokens": 2824418.0, |
| "eval_math_runtime": 47.9197, |
| "eval_math_samples_per_second": 10.434, |
| "eval_math_steps_per_second": 10.434, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8859184484928846, |
| "epoch": 0.164, |
| "grad_norm": 127.0, |
| "learning_rate": 8.18e-06, |
| "loss": 14.142, |
| "mean_token_accuracy": 0.7765266574919224, |
| "num_tokens": 2895667.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.8501209424808621, |
| "epoch": 0.168, |
| "grad_norm": 40.0, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 13.8435, |
| "mean_token_accuracy": 0.7774271417409182, |
| "num_tokens": 2966031.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.8510567891411484, |
| "epoch": 0.172, |
| "grad_norm": 45.75, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 13.3673, |
| "mean_token_accuracy": 0.7858892437070608, |
| "num_tokens": 3039780.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.8355857279151678, |
| "epoch": 0.176, |
| "grad_norm": 44.5, |
| "learning_rate": 8.78e-06, |
| "loss": 13.7383, |
| "mean_token_accuracy": 0.7784004233777523, |
| "num_tokens": 3110313.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.8662491450086236, |
| "epoch": 0.18, |
| "grad_norm": 43.75, |
| "learning_rate": 8.98e-06, |
| "loss": 13.6485, |
| "mean_token_accuracy": 0.7806661173701286, |
| "num_tokens": 3179963.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.8596813324838877, |
| "epoch": 0.184, |
| "grad_norm": 46.0, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 13.7717, |
| "mean_token_accuracy": 0.7796813258901238, |
| "num_tokens": 3251139.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.8454847941175103, |
| "epoch": 0.188, |
| "grad_norm": 56.25, |
| "learning_rate": 9.38e-06, |
| "loss": 13.4682, |
| "mean_token_accuracy": 0.7803339377045632, |
| "num_tokens": 3323519.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.8228314617648721, |
| "epoch": 0.192, |
| "grad_norm": 49.75, |
| "learning_rate": 9.58e-06, |
| "loss": 13.3842, |
| "mean_token_accuracy": 0.7827629748731851, |
| "num_tokens": 3394170.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.8164506537839771, |
| "epoch": 0.196, |
| "grad_norm": 54.25, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 13.1965, |
| "mean_token_accuracy": 0.7878578577190638, |
| "num_tokens": 3465341.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8838530456647277, |
| "epoch": 0.2, |
| "grad_norm": 51.0, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 13.9771, |
| "mean_token_accuracy": 0.7734297584742308, |
| "num_tokens": 3533571.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_biology_entropy": 1.2344867270588875, |
| "eval_biology_loss": 1.2817195653915405, |
| "eval_biology_mean_token_accuracy": 0.6832386125922203, |
| "eval_biology_num_tokens": 3533571.0, |
| "eval_biology_runtime": 40.4092, |
| "eval_biology_samples_per_second": 12.373, |
| "eval_biology_steps_per_second": 12.373, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_math_entropy": 0.8546719763875008, |
| "eval_math_loss": 0.8547734618186951, |
| "eval_math_mean_token_accuracy": 0.7804626071453095, |
| "eval_math_num_tokens": 3533571.0, |
| "eval_math_runtime": 48.0961, |
| "eval_math_samples_per_second": 10.396, |
| "eval_math_steps_per_second": 10.396, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.8578155474737287, |
| "epoch": 0.204, |
| "grad_norm": 45.75, |
| "learning_rate": 1.018e-05, |
| "loss": 13.8097, |
| "mean_token_accuracy": 0.776990968361497, |
| "num_tokens": 3602983.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.8306718476116657, |
| "epoch": 0.208, |
| "grad_norm": 50.25, |
| "learning_rate": 1.038e-05, |
| "loss": 13.4186, |
| "mean_token_accuracy": 0.7863588180392981, |
| "num_tokens": 3673600.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.8772016573697329, |
| "epoch": 0.212, |
| "grad_norm": 45.75, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 13.9353, |
| "mean_token_accuracy": 0.7783012766391039, |
| "num_tokens": 3743641.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.8009312467649579, |
| "epoch": 0.216, |
| "grad_norm": 55.25, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 12.9356, |
| "mean_token_accuracy": 0.7893661856651306, |
| "num_tokens": 3817397.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.8427618410438299, |
| "epoch": 0.22, |
| "grad_norm": 51.0, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 13.3831, |
| "mean_token_accuracy": 0.7832117900252342, |
| "num_tokens": 3886866.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.8758323887363076, |
| "epoch": 0.224, |
| "grad_norm": 41.5, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 13.8448, |
| "mean_token_accuracy": 0.7745687130838632, |
| "num_tokens": 3953732.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.8352954359725118, |
| "epoch": 0.228, |
| "grad_norm": 41.75, |
| "learning_rate": 1.138e-05, |
| "loss": 13.4861, |
| "mean_token_accuracy": 0.7828061260282994, |
| "num_tokens": 4025795.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.7947553901001811, |
| "epoch": 0.232, |
| "grad_norm": 43.25, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 12.8366, |
| "mean_token_accuracy": 0.7880385994911194, |
| "num_tokens": 4098989.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.8285046070814133, |
| "epoch": 0.236, |
| "grad_norm": 39.25, |
| "learning_rate": 1.178e-05, |
| "loss": 13.0789, |
| "mean_token_accuracy": 0.7872951343655586, |
| "num_tokens": 4170878.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.7863880429416895, |
| "epoch": 0.24, |
| "grad_norm": 45.0, |
| "learning_rate": 1.198e-05, |
| "loss": 12.7479, |
| "mean_token_accuracy": 0.7928395148366689, |
| "num_tokens": 4243655.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_biology_entropy": 1.2514844969511032, |
| "eval_biology_loss": 1.2855682373046875, |
| "eval_biology_mean_token_accuracy": 0.6817903207540512, |
| "eval_biology_num_tokens": 4243655.0, |
| "eval_biology_runtime": 40.7639, |
| "eval_biology_samples_per_second": 12.266, |
| "eval_biology_steps_per_second": 12.266, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_math_entropy": 0.843876126229763, |
| "eval_math_loss": 0.8332411646842957, |
| "eval_math_mean_token_accuracy": 0.783966186761856, |
| "eval_math_num_tokens": 4243655.0, |
| "eval_math_runtime": 48.1965, |
| "eval_math_samples_per_second": 10.374, |
| "eval_math_steps_per_second": 10.374, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.7838549023494125, |
| "epoch": 0.244, |
| "grad_norm": 41.75, |
| "learning_rate": 1.218e-05, |
| "loss": 12.4042, |
| "mean_token_accuracy": 0.7971927735954523, |
| "num_tokens": 4317321.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.7797296568751335, |
| "epoch": 0.248, |
| "grad_norm": 51.75, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 12.5895, |
| "mean_token_accuracy": 0.7916548330336809, |
| "num_tokens": 4387651.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.8094759196043014, |
| "epoch": 0.252, |
| "grad_norm": 84.5, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 12.9446, |
| "mean_token_accuracy": 0.7892590902745724, |
| "num_tokens": 4461909.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.7797345662489533, |
| "epoch": 0.256, |
| "grad_norm": 62.0, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 12.462, |
| "mean_token_accuracy": 0.7935512393712998, |
| "num_tokens": 4531471.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.7911159221082926, |
| "epoch": 0.26, |
| "grad_norm": 40.0, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 12.8515, |
| "mean_token_accuracy": 0.7928617469966411, |
| "num_tokens": 4600757.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.8169831104576588, |
| "epoch": 0.264, |
| "grad_norm": 49.5, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 13.117, |
| "mean_token_accuracy": 0.7893466159701348, |
| "num_tokens": 4669389.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.8287884982302784, |
| "epoch": 0.268, |
| "grad_norm": 55.25, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 13.3618, |
| "mean_token_accuracy": 0.7853043630719185, |
| "num_tokens": 4740492.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.7938818972557783, |
| "epoch": 0.272, |
| "grad_norm": 42.5, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 12.6092, |
| "mean_token_accuracy": 0.7941738195717335, |
| "num_tokens": 4810284.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.7964617364108563, |
| "epoch": 0.276, |
| "grad_norm": 37.25, |
| "learning_rate": 1.378e-05, |
| "loss": 12.6495, |
| "mean_token_accuracy": 0.7921258114278317, |
| "num_tokens": 4885701.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.7932526333257556, |
| "epoch": 0.28, |
| "grad_norm": 48.5, |
| "learning_rate": 1.398e-05, |
| "loss": 12.8488, |
| "mean_token_accuracy": 0.7890512771904469, |
| "num_tokens": 4955375.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.28, |
| "eval_biology_entropy": 1.2677810794115068, |
| "eval_biology_loss": 1.2936662435531616, |
| "eval_biology_mean_token_accuracy": 0.6814599596261978, |
| "eval_biology_num_tokens": 4955375.0, |
| "eval_biology_runtime": 40.9028, |
| "eval_biology_samples_per_second": 12.224, |
| "eval_biology_steps_per_second": 12.224, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.28, |
| "eval_math_entropy": 0.8274374225139618, |
| "eval_math_loss": 0.8181362748146057, |
| "eval_math_mean_token_accuracy": 0.7868153600692749, |
| "eval_math_num_tokens": 4955375.0, |
| "eval_math_runtime": 48.2074, |
| "eval_math_samples_per_second": 10.372, |
| "eval_math_steps_per_second": 10.372, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.7877025598660111, |
| "epoch": 0.284, |
| "grad_norm": 39.25, |
| "learning_rate": 1.418e-05, |
| "loss": 12.6293, |
| "mean_token_accuracy": 0.7944098394364119, |
| "num_tokens": 5025638.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.7564179511740804, |
| "epoch": 0.288, |
| "grad_norm": 38.75, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 12.0294, |
| "mean_token_accuracy": 0.8003660906106234, |
| "num_tokens": 5095104.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.7647898152470589, |
| "epoch": 0.292, |
| "grad_norm": 56.0, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 12.4713, |
| "mean_token_accuracy": 0.7947716873139143, |
| "num_tokens": 5167049.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.8119000235572458, |
| "epoch": 0.296, |
| "grad_norm": 36.75, |
| "learning_rate": 1.478e-05, |
| "loss": 13.0034, |
| "mean_token_accuracy": 0.7876484204083681, |
| "num_tokens": 5240988.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.7802535065449774, |
| "epoch": 0.3, |
| "grad_norm": 40.5, |
| "learning_rate": 1.498e-05, |
| "loss": 12.4371, |
| "mean_token_accuracy": 0.795463678240776, |
| "num_tokens": 5312258.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.8087782407179475, |
| "epoch": 0.304, |
| "grad_norm": 44.75, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 12.9661, |
| "mean_token_accuracy": 0.7857579983770847, |
| "num_tokens": 5383732.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.8243673056364059, |
| "epoch": 0.308, |
| "grad_norm": 39.5, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 13.1334, |
| "mean_token_accuracy": 0.7867084331810474, |
| "num_tokens": 5456650.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.7901528097689152, |
| "epoch": 0.312, |
| "grad_norm": 40.0, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 12.6608, |
| "mean_token_accuracy": 0.7905277730897069, |
| "num_tokens": 5530666.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.7515985878184438, |
| "epoch": 0.316, |
| "grad_norm": 44.75, |
| "learning_rate": 1.578e-05, |
| "loss": 11.9655, |
| "mean_token_accuracy": 0.8011779848486185, |
| "num_tokens": 5605269.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.7314818985760212, |
| "epoch": 0.32, |
| "grad_norm": 42.5, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 11.7244, |
| "mean_token_accuracy": 0.8059452049434185, |
| "num_tokens": 5676334.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_biology_entropy": 1.2663928886651994, |
| "eval_biology_loss": 1.2994052171707153, |
| "eval_biology_mean_token_accuracy": 0.6803199350833893, |
| "eval_biology_num_tokens": 5676334.0, |
| "eval_biology_runtime": 40.7632, |
| "eval_biology_samples_per_second": 12.266, |
| "eval_biology_steps_per_second": 12.266, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_math_entropy": 0.7805578476190567, |
| "eval_math_loss": 0.8004927635192871, |
| "eval_math_mean_token_accuracy": 0.7903167794942856, |
| "eval_math_num_tokens": 5676334.0, |
| "eval_math_runtime": 48.1783, |
| "eval_math_samples_per_second": 10.378, |
| "eval_math_steps_per_second": 10.378, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.7661854576319456, |
| "epoch": 0.324, |
| "grad_norm": 40.5, |
| "learning_rate": 1.618e-05, |
| "loss": 12.5767, |
| "mean_token_accuracy": 0.7944763712584972, |
| "num_tokens": 5747779.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.803387301787734, |
| "epoch": 0.328, |
| "grad_norm": 42.0, |
| "learning_rate": 1.638e-05, |
| "loss": 12.825, |
| "mean_token_accuracy": 0.7907922245562077, |
| "num_tokens": 5816488.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.8049221463501454, |
| "epoch": 0.332, |
| "grad_norm": 44.0, |
| "learning_rate": 1.658e-05, |
| "loss": 12.9669, |
| "mean_token_accuracy": 0.7854703675955534, |
| "num_tokens": 5887180.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.8102014064788818, |
| "epoch": 0.336, |
| "grad_norm": 38.5, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 12.9814, |
| "mean_token_accuracy": 0.7886964596807957, |
| "num_tokens": 5958480.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.7344874950125814, |
| "epoch": 0.34, |
| "grad_norm": 50.25, |
| "learning_rate": 1.698e-05, |
| "loss": 11.7609, |
| "mean_token_accuracy": 0.8062219381332397, |
| "num_tokens": 6031020.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.802255941182375, |
| "epoch": 0.344, |
| "grad_norm": 39.0, |
| "learning_rate": 1.718e-05, |
| "loss": 12.8439, |
| "mean_token_accuracy": 0.7877463910728693, |
| "num_tokens": 6102081.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.7980229431763292, |
| "epoch": 0.348, |
| "grad_norm": 36.25, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 12.7774, |
| "mean_token_accuracy": 0.7900870595127344, |
| "num_tokens": 6172871.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.7401213280856609, |
| "epoch": 0.352, |
| "grad_norm": 40.75, |
| "learning_rate": 1.758e-05, |
| "loss": 11.9409, |
| "mean_token_accuracy": 0.801040057092905, |
| "num_tokens": 6242161.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.8095649138092995, |
| "epoch": 0.356, |
| "grad_norm": 46.5, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 12.9134, |
| "mean_token_accuracy": 0.7872339155524969, |
| "num_tokens": 6312187.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.7853369496762752, |
| "epoch": 0.36, |
| "grad_norm": 37.75, |
| "learning_rate": 1.798e-05, |
| "loss": 12.4062, |
| "mean_token_accuracy": 0.7960268270224333, |
| "num_tokens": 6377327.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.36, |
| "eval_biology_entropy": 1.250610340833664, |
| "eval_biology_loss": 1.3047173023223877, |
| "eval_biology_mean_token_accuracy": 0.6794998321533203, |
| "eval_biology_num_tokens": 6377327.0, |
| "eval_biology_runtime": 40.5105, |
| "eval_biology_samples_per_second": 12.342, |
| "eval_biology_steps_per_second": 12.342, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.36, |
| "eval_math_entropy": 0.773322255730629, |
| "eval_math_loss": 0.789590060710907, |
| "eval_math_mean_token_accuracy": 0.7914780566692352, |
| "eval_math_num_tokens": 6377327.0, |
| "eval_math_runtime": 48.04, |
| "eval_math_samples_per_second": 10.408, |
| "eval_math_steps_per_second": 10.408, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.763472201116383, |
| "epoch": 0.364, |
| "grad_norm": 41.5, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 12.3288, |
| "mean_token_accuracy": 0.7959023587405681, |
| "num_tokens": 6453254.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.7712945869192481, |
| "epoch": 0.368, |
| "grad_norm": 40.25, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 12.2854, |
| "mean_token_accuracy": 0.795868669450283, |
| "num_tokens": 6523679.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.7641221083700657, |
| "epoch": 0.372, |
| "grad_norm": 37.25, |
| "learning_rate": 1.858e-05, |
| "loss": 12.2867, |
| "mean_token_accuracy": 0.7949323236942292, |
| "num_tokens": 6594651.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.7518373979255557, |
| "epoch": 0.376, |
| "grad_norm": 42.0, |
| "learning_rate": 1.878e-05, |
| "loss": 12.1179, |
| "mean_token_accuracy": 0.798585458099842, |
| "num_tokens": 6669278.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.7543134400621057, |
| "epoch": 0.38, |
| "grad_norm": 33.75, |
| "learning_rate": 1.898e-05, |
| "loss": 12.023, |
| "mean_token_accuracy": 0.7991646058857441, |
| "num_tokens": 6740491.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.7203706834465265, |
| "epoch": 0.384, |
| "grad_norm": 35.5, |
| "learning_rate": 1.918e-05, |
| "loss": 11.6634, |
| "mean_token_accuracy": 0.8036353968083858, |
| "num_tokens": 6810978.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.7761434537358582, |
| "epoch": 0.388, |
| "grad_norm": 39.5, |
| "learning_rate": 1.938e-05, |
| "loss": 12.4387, |
| "mean_token_accuracy": 0.7944561909884215, |
| "num_tokens": 6882077.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.7785217678174376, |
| "epoch": 0.392, |
| "grad_norm": 33.75, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 12.607, |
| "mean_token_accuracy": 0.7944069147109986, |
| "num_tokens": 6953929.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.8195465696975589, |
| "epoch": 0.396, |
| "grad_norm": 65.5, |
| "learning_rate": 1.978e-05, |
| "loss": 13.2202, |
| "mean_token_accuracy": 0.7838506288826466, |
| "num_tokens": 7023835.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.8217529645189643, |
| "epoch": 0.4, |
| "grad_norm": 39.75, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 13.1698, |
| "mean_token_accuracy": 0.7848756689578295, |
| "num_tokens": 7096903.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_biology_entropy": 1.266352553486824, |
| "eval_biology_loss": 1.312301516532898, |
| "eval_biology_mean_token_accuracy": 0.6775026069879532, |
| "eval_biology_num_tokens": 7096903.0, |
| "eval_biology_runtime": 40.7343, |
| "eval_biology_samples_per_second": 12.275, |
| "eval_biology_steps_per_second": 12.275, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_math_entropy": 0.7719556384682655, |
| "eval_math_loss": 0.7821868062019348, |
| "eval_math_mean_token_accuracy": 0.7932658619880676, |
| "eval_math_num_tokens": 7096903.0, |
| "eval_math_runtime": 47.8946, |
| "eval_math_samples_per_second": 10.44, |
| "eval_math_steps_per_second": 10.44, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.7681734091602266, |
| "epoch": 0.404, |
| "grad_norm": 50.25, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 12.3128, |
| "mean_token_accuracy": 0.7956426542252302, |
| "num_tokens": 7165970.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.7375578137114644, |
| "epoch": 0.408, |
| "grad_norm": 39.5, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 12.0172, |
| "mean_token_accuracy": 0.7997171815484763, |
| "num_tokens": 7239071.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.7803538812324404, |
| "epoch": 0.412, |
| "grad_norm": 35.75, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 12.3805, |
| "mean_token_accuracy": 0.7939646884799003, |
| "num_tokens": 7308430.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.7741465013474226, |
| "epoch": 0.416, |
| "grad_norm": 35.5, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 12.3671, |
| "mean_token_accuracy": 0.7934583045542241, |
| "num_tokens": 7377181.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.7638113724067808, |
| "epoch": 0.42, |
| "grad_norm": 90.5, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 12.3505, |
| "mean_token_accuracy": 0.7913536671549082, |
| "num_tokens": 7443045.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.763846804574132, |
| "epoch": 0.424, |
| "grad_norm": 30.5, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 12.2045, |
| "mean_token_accuracy": 0.7992881424725056, |
| "num_tokens": 7513035.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.7383330492302775, |
| "epoch": 0.428, |
| "grad_norm": 34.0, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 11.9274, |
| "mean_token_accuracy": 0.80244723521173, |
| "num_tokens": 7580676.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.7183827603235841, |
| "epoch": 0.432, |
| "grad_norm": 37.25, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 11.5217, |
| "mean_token_accuracy": 0.8065667234361171, |
| "num_tokens": 7650523.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.784936741553247, |
| "epoch": 0.436, |
| "grad_norm": 60.5, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 12.7959, |
| "mean_token_accuracy": 0.7918864544481039, |
| "num_tokens": 7725087.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.7581901378929615, |
| "epoch": 0.44, |
| "grad_norm": 35.75, |
| "learning_rate": 1.978e-05, |
| "loss": 12.0708, |
| "mean_token_accuracy": 0.7991419725120068, |
| "num_tokens": 7794006.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.44, |
| "eval_biology_entropy": 1.3096306604146957, |
| "eval_biology_loss": 1.3199082612991333, |
| "eval_biology_mean_token_accuracy": 0.6769449328184127, |
| "eval_biology_num_tokens": 7794006.0, |
| "eval_biology_runtime": 41.8088, |
| "eval_biology_samples_per_second": 11.959, |
| "eval_biology_steps_per_second": 11.959, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.44, |
| "eval_math_entropy": 0.7593404217660427, |
| "eval_math_loss": 0.7711546421051025, |
| "eval_math_mean_token_accuracy": 0.7947946254014969, |
| "eval_math_num_tokens": 7794006.0, |
| "eval_math_runtime": 48.5633, |
| "eval_math_samples_per_second": 10.296, |
| "eval_math_steps_per_second": 10.296, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.7571122448891401, |
| "epoch": 0.444, |
| "grad_norm": 35.0, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 12.1206, |
| "mean_token_accuracy": 0.7960348557680845, |
| "num_tokens": 7869588.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.7115281330421567, |
| "epoch": 0.448, |
| "grad_norm": 39.75, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 11.603, |
| "mean_token_accuracy": 0.8045209631323814, |
| "num_tokens": 7936788.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.7751916225999593, |
| "epoch": 0.452, |
| "grad_norm": 44.5, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 12.2553, |
| "mean_token_accuracy": 0.7963129185140133, |
| "num_tokens": 8007062.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.7358326924964785, |
| "epoch": 0.456, |
| "grad_norm": 35.75, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 11.8682, |
| "mean_token_accuracy": 0.8016852129250764, |
| "num_tokens": 8081035.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.748681915551424, |
| "epoch": 0.46, |
| "grad_norm": 38.75, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 12.0865, |
| "mean_token_accuracy": 0.8016125675290823, |
| "num_tokens": 8152005.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.7732457166537643, |
| "epoch": 0.464, |
| "grad_norm": 30.625, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 12.3834, |
| "mean_token_accuracy": 0.7960082966834306, |
| "num_tokens": 8223147.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.7652292996644974, |
| "epoch": 0.468, |
| "grad_norm": 47.0, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 12.5645, |
| "mean_token_accuracy": 0.7940644320100546, |
| "num_tokens": 8296261.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.7421515263617039, |
| "epoch": 0.472, |
| "grad_norm": 45.75, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 11.76, |
| "mean_token_accuracy": 0.8024631194770336, |
| "num_tokens": 8367380.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.74504639133811, |
| "epoch": 0.476, |
| "grad_norm": 40.25, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 11.8789, |
| "mean_token_accuracy": 0.798084694892168, |
| "num_tokens": 8438675.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.7464414104819298, |
| "epoch": 0.48, |
| "grad_norm": 34.75, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 12.21, |
| "mean_token_accuracy": 0.7978132009506226, |
| "num_tokens": 8506030.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_biology_entropy": 1.3110252878069877, |
| "eval_biology_loss": 1.324716567993164, |
| "eval_biology_mean_token_accuracy": 0.6763759871721268, |
| "eval_biology_num_tokens": 8506030.0, |
| "eval_biology_runtime": 67.8224, |
| "eval_biology_samples_per_second": 7.372, |
| "eval_biology_steps_per_second": 7.372, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_math_entropy": 0.7597530280947685, |
| "eval_math_loss": 0.7603512406349182, |
| "eval_math_mean_token_accuracy": 0.7970666073560715, |
| "eval_math_num_tokens": 8506030.0, |
| "eval_math_runtime": 49.7375, |
| "eval_math_samples_per_second": 10.053, |
| "eval_math_steps_per_second": 10.053, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.7492084631696343, |
| "epoch": 0.484, |
| "grad_norm": 35.25, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 11.91, |
| "mean_token_accuracy": 0.8018037732690573, |
| "num_tokens": 8576915.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.7045143865048885, |
| "epoch": 0.488, |
| "grad_norm": 33.5, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 11.3318, |
| "mean_token_accuracy": 0.8072403989732265, |
| "num_tokens": 8646384.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.7107147205621004, |
| "epoch": 0.492, |
| "grad_norm": 34.5, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 11.4898, |
| "mean_token_accuracy": 0.8078423041850329, |
| "num_tokens": 8718547.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.7382461177185178, |
| "epoch": 0.496, |
| "grad_norm": 34.0, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 11.8211, |
| "mean_token_accuracy": 0.8013716965913773, |
| "num_tokens": 8788726.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.7180610377341509, |
| "epoch": 0.5, |
| "grad_norm": 32.0, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 11.6359, |
| "mean_token_accuracy": 0.8073719006031752, |
| "num_tokens": 8861341.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.7594908589497209, |
| "epoch": 0.504, |
| "grad_norm": 33.25, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 12.1494, |
| "mean_token_accuracy": 0.7983963489532471, |
| "num_tokens": 8934959.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.6865442413836718, |
| "epoch": 0.508, |
| "grad_norm": 47.0, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 11.0336, |
| "mean_token_accuracy": 0.8123888235539198, |
| "num_tokens": 9008794.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.736488389596343, |
| "epoch": 0.512, |
| "grad_norm": 36.0, |
| "learning_rate": 1.938e-05, |
| "loss": 11.8236, |
| "mean_token_accuracy": 0.80280033480376, |
| "num_tokens": 9078039.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.7221866660751403, |
| "epoch": 0.516, |
| "grad_norm": 38.5, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 11.5549, |
| "mean_token_accuracy": 0.8055313553661108, |
| "num_tokens": 9148271.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.7004529038444162, |
| "epoch": 0.52, |
| "grad_norm": 35.5, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 11.4288, |
| "mean_token_accuracy": 0.8069275438785553, |
| "num_tokens": 9220298.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.52, |
| "eval_biology_entropy": 1.2943904809951783, |
| "eval_biology_loss": 1.3324984312057495, |
| "eval_biology_mean_token_accuracy": 0.67458989828825, |
| "eval_biology_num_tokens": 9220298.0, |
| "eval_biology_runtime": 40.8416, |
| "eval_biology_samples_per_second": 12.242, |
| "eval_biology_steps_per_second": 12.242, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.52, |
| "eval_math_entropy": 0.7494015108048916, |
| "eval_math_loss": 0.7516281604766846, |
| "eval_math_mean_token_accuracy": 0.799598550081253, |
| "eval_math_num_tokens": 9220298.0, |
| "eval_math_runtime": 48.3706, |
| "eval_math_samples_per_second": 10.337, |
| "eval_math_steps_per_second": 10.337, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.7174371790140868, |
| "epoch": 0.524, |
| "grad_norm": 28.375, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 11.5, |
| "mean_token_accuracy": 0.8083903137594461, |
| "num_tokens": 9299112.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.7080350153148174, |
| "epoch": 0.528, |
| "grad_norm": 30.75, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 11.26, |
| "mean_token_accuracy": 0.8113009247928857, |
| "num_tokens": 9373860.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.7306012082844973, |
| "epoch": 0.532, |
| "grad_norm": 44.5, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 11.8475, |
| "mean_token_accuracy": 0.8006449308246374, |
| "num_tokens": 9446235.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.7229157039895654, |
| "epoch": 0.536, |
| "grad_norm": 37.25, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 11.6284, |
| "mean_token_accuracy": 0.8080755949020386, |
| "num_tokens": 9515962.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.7307992668822407, |
| "epoch": 0.54, |
| "grad_norm": 41.25, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 11.6765, |
| "mean_token_accuracy": 0.8042573977261782, |
| "num_tokens": 9587715.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.7034009215421975, |
| "epoch": 0.544, |
| "grad_norm": 35.75, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 11.7562, |
| "mean_token_accuracy": 0.8090484531596303, |
| "num_tokens": 9660940.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.7848182378336788, |
| "epoch": 0.548, |
| "grad_norm": 36.25, |
| "learning_rate": 1.918e-05, |
| "loss": 12.6939, |
| "mean_token_accuracy": 0.7890455313026905, |
| "num_tokens": 9727806.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.7604333622381091, |
| "epoch": 0.552, |
| "grad_norm": 39.75, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 12.152, |
| "mean_token_accuracy": 0.7959740109741688, |
| "num_tokens": 9793170.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.7220884849317372, |
| "epoch": 0.556, |
| "grad_norm": 37.0, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 11.6033, |
| "mean_token_accuracy": 0.8036061096936464, |
| "num_tokens": 9860484.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.704200503975153, |
| "epoch": 0.56, |
| "grad_norm": 29.375, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 11.1518, |
| "mean_token_accuracy": 0.8113750666379929, |
| "num_tokens": 9932302.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_biology_entropy": 1.2828473488092422, |
| "eval_biology_loss": 1.3331190347671509, |
| "eval_biology_mean_token_accuracy": 0.6747415412068367, |
| "eval_biology_num_tokens": 9932302.0, |
| "eval_biology_runtime": 45.9485, |
| "eval_biology_samples_per_second": 10.882, |
| "eval_biology_steps_per_second": 10.882, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_math_entropy": 0.7451034058630467, |
| "eval_math_loss": 0.7451240420341492, |
| "eval_math_mean_token_accuracy": 0.8007836003303528, |
| "eval_math_num_tokens": 9932302.0, |
| "eval_math_runtime": 56.801, |
| "eval_math_samples_per_second": 8.803, |
| "eval_math_steps_per_second": 8.803, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.7115965198725462, |
| "epoch": 0.564, |
| "grad_norm": 34.25, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 11.5991, |
| "mean_token_accuracy": 0.8047812730073929, |
| "num_tokens": 10001923.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.7081295600160956, |
| "epoch": 0.568, |
| "grad_norm": 32.75, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 11.2206, |
| "mean_token_accuracy": 0.8095470562577247, |
| "num_tokens": 10072362.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.7141612969338894, |
| "epoch": 0.572, |
| "grad_norm": 30.375, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 11.6816, |
| "mean_token_accuracy": 0.8019938107579947, |
| "num_tokens": 10140236.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.7410996645689011, |
| "epoch": 0.576, |
| "grad_norm": 32.5, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 11.7841, |
| "mean_token_accuracy": 0.8034846507012844, |
| "num_tokens": 10215462.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.7485123924911022, |
| "epoch": 0.58, |
| "grad_norm": 43.25, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 11.9787, |
| "mean_token_accuracy": 0.798300775885582, |
| "num_tokens": 10286386.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.7154224492609501, |
| "epoch": 0.584, |
| "grad_norm": 32.5, |
| "learning_rate": 1.898e-05, |
| "loss": 11.5298, |
| "mean_token_accuracy": 0.8057492181658745, |
| "num_tokens": 10354952.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.7272289376705885, |
| "epoch": 0.588, |
| "grad_norm": 40.25, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 11.652, |
| "mean_token_accuracy": 0.8025094602257014, |
| "num_tokens": 10425215.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.7166844455525279, |
| "epoch": 0.592, |
| "grad_norm": 30.625, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 11.4116, |
| "mean_token_accuracy": 0.8082372698932886, |
| "num_tokens": 10504396.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.7413169213570654, |
| "epoch": 0.596, |
| "grad_norm": 32.0, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 11.959, |
| "mean_token_accuracy": 0.7982309013605118, |
| "num_tokens": 10576150.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.7089840611442924, |
| "epoch": 0.6, |
| "grad_norm": 35.0, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 11.6029, |
| "mean_token_accuracy": 0.8050658291205763, |
| "num_tokens": 10644641.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.6, |
| "eval_biology_entropy": 1.301250998735428, |
| "eval_biology_loss": 1.3366436958312988, |
| "eval_biology_mean_token_accuracy": 0.6739367877840996, |
| "eval_biology_num_tokens": 10644641.0, |
| "eval_biology_runtime": 40.559, |
| "eval_biology_samples_per_second": 12.328, |
| "eval_biology_steps_per_second": 12.328, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.6, |
| "eval_math_entropy": 0.7238052343726158, |
| "eval_math_loss": 0.7381152510643005, |
| "eval_math_mean_token_accuracy": 0.8017449014186859, |
| "eval_math_num_tokens": 10644641.0, |
| "eval_math_runtime": 48.6353, |
| "eval_math_samples_per_second": 10.281, |
| "eval_math_steps_per_second": 10.281, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.7384224113076925, |
| "epoch": 0.604, |
| "grad_norm": 31.375, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 11.6601, |
| "mean_token_accuracy": 0.8054828774183989, |
| "num_tokens": 10718879.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.7157115396112204, |
| "epoch": 0.608, |
| "grad_norm": 38.0, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 11.7253, |
| "mean_token_accuracy": 0.805310570448637, |
| "num_tokens": 10793126.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.7220246642827988, |
| "epoch": 0.612, |
| "grad_norm": 31.875, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 11.4871, |
| "mean_token_accuracy": 0.8036613553762436, |
| "num_tokens": 10868501.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.7019952710717916, |
| "epoch": 0.616, |
| "grad_norm": 37.75, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 11.3501, |
| "mean_token_accuracy": 0.8096903506666422, |
| "num_tokens": 10941399.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.7124224284663796, |
| "epoch": 0.62, |
| "grad_norm": 28.0, |
| "learning_rate": 1.878e-05, |
| "loss": 11.2568, |
| "mean_token_accuracy": 0.8094121634960174, |
| "num_tokens": 11013227.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.6985575210303068, |
| "epoch": 0.624, |
| "grad_norm": 41.0, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 11.3067, |
| "mean_token_accuracy": 0.8099827248603105, |
| "num_tokens": 11081768.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.7057704029604792, |
| "epoch": 0.628, |
| "grad_norm": 35.5, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 11.2109, |
| "mean_token_accuracy": 0.8099541310220957, |
| "num_tokens": 11155362.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.6665219226852059, |
| "epoch": 0.632, |
| "grad_norm": 33.5, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 10.7041, |
| "mean_token_accuracy": 0.8133308228105307, |
| "num_tokens": 11226419.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.6918639421463013, |
| "epoch": 0.636, |
| "grad_norm": 28.0, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 11.1789, |
| "mean_token_accuracy": 0.810427363216877, |
| "num_tokens": 11298800.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.7074929376132786, |
| "epoch": 0.64, |
| "grad_norm": 32.0, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 11.4364, |
| "mean_token_accuracy": 0.8062872476875782, |
| "num_tokens": 11370320.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_biology_entropy": 1.2752835777401925, |
| "eval_biology_loss": 1.3371055126190186, |
| "eval_biology_mean_token_accuracy": 0.6743674901127815, |
| "eval_biology_num_tokens": 11370320.0, |
| "eval_biology_runtime": 41.1859, |
| "eval_biology_samples_per_second": 12.14, |
| "eval_biology_steps_per_second": 12.14, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_math_entropy": 0.7216429207324981, |
| "eval_math_loss": 0.7321795225143433, |
| "eval_math_mean_token_accuracy": 0.8027968648672104, |
| "eval_math_num_tokens": 11370320.0, |
| "eval_math_runtime": 50.051, |
| "eval_math_samples_per_second": 9.99, |
| "eval_math_steps_per_second": 9.99, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.7110869940370321, |
| "epoch": 0.644, |
| "grad_norm": 33.5, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 11.4826, |
| "mean_token_accuracy": 0.8066194530576467, |
| "num_tokens": 11445577.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.6680682392790913, |
| "epoch": 0.648, |
| "grad_norm": 41.75, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 10.6001, |
| "mean_token_accuracy": 0.8180687021464109, |
| "num_tokens": 11515332.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.6972724428400398, |
| "epoch": 0.652, |
| "grad_norm": 28.375, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 11.2625, |
| "mean_token_accuracy": 0.8118615668267012, |
| "num_tokens": 11589344.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.7117041515186429, |
| "epoch": 0.656, |
| "grad_norm": 30.125, |
| "learning_rate": 1.858e-05, |
| "loss": 11.2332, |
| "mean_token_accuracy": 0.8057843204587698, |
| "num_tokens": 11657835.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.7337123662233352, |
| "epoch": 0.66, |
| "grad_norm": 35.0, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 11.9959, |
| "mean_token_accuracy": 0.7986002914607525, |
| "num_tokens": 11728761.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.6996571996249259, |
| "epoch": 0.664, |
| "grad_norm": 36.0, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 11.225, |
| "mean_token_accuracy": 0.807379187270999, |
| "num_tokens": 11799325.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.7201200375333429, |
| "epoch": 0.668, |
| "grad_norm": 26.625, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 11.4655, |
| "mean_token_accuracy": 0.8065472077578306, |
| "num_tokens": 11875761.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.6706034051254391, |
| "epoch": 0.672, |
| "grad_norm": 29.5, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 11.0248, |
| "mean_token_accuracy": 0.8108973000198603, |
| "num_tokens": 11949262.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.7067649567499756, |
| "epoch": 0.676, |
| "grad_norm": 39.0, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 11.2029, |
| "mean_token_accuracy": 0.8082552354782819, |
| "num_tokens": 12017021.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.7115105454809963, |
| "epoch": 0.68, |
| "grad_norm": 28.375, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 11.5299, |
| "mean_token_accuracy": 0.8070931307971477, |
| "num_tokens": 12090029.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.68, |
| "eval_biology_entropy": 1.3190836807489394, |
| "eval_biology_loss": 1.3399405479431152, |
| "eval_biology_mean_token_accuracy": 0.6731415511965751, |
| "eval_biology_num_tokens": 12090029.0, |
| "eval_biology_runtime": 41.5684, |
| "eval_biology_samples_per_second": 12.028, |
| "eval_biology_steps_per_second": 12.028, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.68, |
| "eval_math_entropy": 0.7365337611436844, |
| "eval_math_loss": 0.7248461842536926, |
| "eval_math_mean_token_accuracy": 0.8049273964166641, |
| "eval_math_num_tokens": 12090029.0, |
| "eval_math_runtime": 48.658, |
| "eval_math_samples_per_second": 10.276, |
| "eval_math_steps_per_second": 10.276, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.7276539742946625, |
| "epoch": 0.684, |
| "grad_norm": 45.75, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 11.6271, |
| "mean_token_accuracy": 0.8031581114977598, |
| "num_tokens": 12157148.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.692380516603589, |
| "epoch": 0.688, |
| "grad_norm": 32.75, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 11.0596, |
| "mean_token_accuracy": 0.8091447809711099, |
| "num_tokens": 12227640.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.7150112164206803, |
| "epoch": 0.692, |
| "grad_norm": 31.5, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 11.4079, |
| "mean_token_accuracy": 0.8083063390105962, |
| "num_tokens": 12300485.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.707249708659947, |
| "epoch": 0.696, |
| "grad_norm": 44.5, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 11.4663, |
| "mean_token_accuracy": 0.8062439143657685, |
| "num_tokens": 12371560.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.6466306034475565, |
| "epoch": 0.7, |
| "grad_norm": 31.5, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 10.4753, |
| "mean_token_accuracy": 0.8210283190011978, |
| "num_tokens": 12444485.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.7072455288842321, |
| "epoch": 0.704, |
| "grad_norm": 28.625, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 11.2585, |
| "mean_token_accuracy": 0.8080747354775667, |
| "num_tokens": 12516641.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.6972917837090791, |
| "epoch": 0.708, |
| "grad_norm": 31.875, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 11.1016, |
| "mean_token_accuracy": 0.8106624346226454, |
| "num_tokens": 12586969.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.733701079338789, |
| "epoch": 0.712, |
| "grad_norm": 33.0, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 11.9791, |
| "mean_token_accuracy": 0.8003675427287817, |
| "num_tokens": 12656623.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.7007612373679877, |
| "epoch": 0.716, |
| "grad_norm": 32.5, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 11.3258, |
| "mean_token_accuracy": 0.8079633958637714, |
| "num_tokens": 12724402.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.7202634891495109, |
| "epoch": 0.72, |
| "grad_norm": 45.25, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 11.4906, |
| "mean_token_accuracy": 0.8054792225360871, |
| "num_tokens": 12793343.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_biology_entropy": 1.2490641915798186, |
| "eval_biology_loss": 1.3441540002822876, |
| "eval_biology_mean_token_accuracy": 0.6730483353734017, |
| "eval_biology_num_tokens": 12793343.0, |
| "eval_biology_runtime": 41.3736, |
| "eval_biology_samples_per_second": 12.085, |
| "eval_biology_steps_per_second": 12.085, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_math_entropy": 0.6915989356935024, |
| "eval_math_loss": 0.7225330471992493, |
| "eval_math_mean_token_accuracy": 0.8046257610321045, |
| "eval_math_num_tokens": 12793343.0, |
| "eval_math_runtime": 48.3235, |
| "eval_math_samples_per_second": 10.347, |
| "eval_math_steps_per_second": 10.347, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.689653181284666, |
| "epoch": 0.724, |
| "grad_norm": 30.0, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 11.095, |
| "mean_token_accuracy": 0.8116659399121999, |
| "num_tokens": 12862261.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.6818234785459936, |
| "epoch": 0.728, |
| "grad_norm": 33.25, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 11.0911, |
| "mean_token_accuracy": 0.812104906514287, |
| "num_tokens": 12932479.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.7217532116919756, |
| "epoch": 0.732, |
| "grad_norm": 39.0, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 11.5997, |
| "mean_token_accuracy": 0.80367177054286, |
| "num_tokens": 13005640.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.678720659762621, |
| "epoch": 0.736, |
| "grad_norm": 30.25, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 10.8064, |
| "mean_token_accuracy": 0.817896718531847, |
| "num_tokens": 13077981.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.6725755118764937, |
| "epoch": 0.74, |
| "grad_norm": 32.25, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 10.8668, |
| "mean_token_accuracy": 0.8124937634915114, |
| "num_tokens": 13147662.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.6931444002315402, |
| "epoch": 0.744, |
| "grad_norm": 29.75, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 10.9939, |
| "mean_token_accuracy": 0.8126753181219101, |
| "num_tokens": 13217888.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.6799735228531063, |
| "epoch": 0.748, |
| "grad_norm": 30.625, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 11.1209, |
| "mean_token_accuracy": 0.8113108247518539, |
| "num_tokens": 13290215.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.6741954877972602, |
| "epoch": 0.752, |
| "grad_norm": 37.25, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 10.68, |
| "mean_token_accuracy": 0.8178396210074425, |
| "num_tokens": 13358957.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.6795736599713564, |
| "epoch": 0.756, |
| "grad_norm": 27.5, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 10.7669, |
| "mean_token_accuracy": 0.8167448993772268, |
| "num_tokens": 13435490.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.6350338911637664, |
| "epoch": 0.76, |
| "grad_norm": 33.75, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 10.293, |
| "mean_token_accuracy": 0.8226836711168289, |
| "num_tokens": 13510850.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.76, |
| "eval_biology_entropy": 1.2716083939671516, |
| "eval_biology_loss": 1.343294620513916, |
| "eval_biology_mean_token_accuracy": 0.6733359169363976, |
| "eval_biology_num_tokens": 13510850.0, |
| "eval_biology_runtime": 40.8464, |
| "eval_biology_samples_per_second": 12.241, |
| "eval_biology_steps_per_second": 12.241, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.76, |
| "eval_math_entropy": 0.7108331805765629, |
| "eval_math_loss": 0.7166345715522766, |
| "eval_math_mean_token_accuracy": 0.8057290604114532, |
| "eval_math_num_tokens": 13510850.0, |
| "eval_math_runtime": 47.8127, |
| "eval_math_samples_per_second": 10.457, |
| "eval_math_steps_per_second": 10.457, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.6852852246724069, |
| "epoch": 0.764, |
| "grad_norm": 26.0, |
| "learning_rate": 1.798e-05, |
| "loss": 10.9532, |
| "mean_token_accuracy": 0.8128071770071983, |
| "num_tokens": 13581848.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.6663387446664274, |
| "epoch": 0.768, |
| "grad_norm": 29.125, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 10.8527, |
| "mean_token_accuracy": 0.8174721624702215, |
| "num_tokens": 13653412.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.7165529853664339, |
| "epoch": 0.772, |
| "grad_norm": 31.125, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 11.2507, |
| "mean_token_accuracy": 0.8098738107830286, |
| "num_tokens": 13721740.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.6845185202546418, |
| "epoch": 0.776, |
| "grad_norm": 27.25, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 11.1313, |
| "mean_token_accuracy": 0.810427762567997, |
| "num_tokens": 13794252.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.693409715499729, |
| "epoch": 0.78, |
| "grad_norm": 37.25, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 11.1187, |
| "mean_token_accuracy": 0.8099165614694357, |
| "num_tokens": 13865839.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.707060054782778, |
| "epoch": 0.784, |
| "grad_norm": 81.5, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 11.428, |
| "mean_token_accuracy": 0.8070406079292297, |
| "num_tokens": 13940856.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.7144587634131312, |
| "epoch": 0.788, |
| "grad_norm": 34.0, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 11.4499, |
| "mean_token_accuracy": 0.8067739550024271, |
| "num_tokens": 14014568.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.6812865046784282, |
| "epoch": 0.792, |
| "grad_norm": 29.75, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 10.9827, |
| "mean_token_accuracy": 0.8135220106691122, |
| "num_tokens": 14086117.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.7018590277060867, |
| "epoch": 0.796, |
| "grad_norm": 28.875, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 11.4353, |
| "mean_token_accuracy": 0.8069973647594452, |
| "num_tokens": 14160050.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.7179917026311159, |
| "epoch": 0.8, |
| "grad_norm": 33.0, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 11.3243, |
| "mean_token_accuracy": 0.8104559704661369, |
| "num_tokens": 14230754.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_biology_entropy": 1.2935789892077445, |
| "eval_biology_loss": 1.3476186990737915, |
| "eval_biology_mean_token_accuracy": 0.672012362420559, |
| "eval_biology_num_tokens": 14230754.0, |
| "eval_biology_runtime": 44.053, |
| "eval_biology_samples_per_second": 11.35, |
| "eval_biology_steps_per_second": 11.35, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_math_entropy": 0.6968656893670558, |
| "eval_math_loss": 0.7122905850410461, |
| "eval_math_mean_token_accuracy": 0.8068100419044495, |
| "eval_math_num_tokens": 14230754.0, |
| "eval_math_runtime": 48.0745, |
| "eval_math_samples_per_second": 10.401, |
| "eval_math_steps_per_second": 10.401, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.542341924088814e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|