| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 100, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7581225659698247, |
| "epoch": 0.004, |
| "grad_norm": 101.5, |
| "learning_rate": 1.8e-07, |
| "loss": 18.1141, |
| "mean_token_accuracy": 0.7393306585028767, |
| "num_tokens": 72542.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.7520309612154961, |
| "epoch": 0.008, |
| "grad_norm": 162.0, |
| "learning_rate": 3.8e-07, |
| "loss": 18.2441, |
| "mean_token_accuracy": 0.7392390985041857, |
| "num_tokens": 138146.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.741816989146173, |
| "epoch": 0.012, |
| "grad_norm": 92.5, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 17.8539, |
| "mean_token_accuracy": 0.7405363619327545, |
| "num_tokens": 208409.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.7636421315371991, |
| "epoch": 0.016, |
| "grad_norm": 88.0, |
| "learning_rate": 7.8e-07, |
| "loss": 18.142, |
| "mean_token_accuracy": 0.7400334309786558, |
| "num_tokens": 280941.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.7905357209965587, |
| "epoch": 0.02, |
| "grad_norm": 108.0, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 18.1041, |
| "mean_token_accuracy": 0.7354144401848316, |
| "num_tokens": 349731.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.8223027169704438, |
| "epoch": 0.024, |
| "grad_norm": 112.5, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 18.4599, |
| "mean_token_accuracy": 0.731158659607172, |
| "num_tokens": 419127.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.7908559931442142, |
| "epoch": 0.028, |
| "grad_norm": 106.0, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 17.5053, |
| "mean_token_accuracy": 0.7440328601747751, |
| "num_tokens": 489016.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8229548260569572, |
| "epoch": 0.032, |
| "grad_norm": 86.0, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 17.3714, |
| "mean_token_accuracy": 0.7438380591571331, |
| "num_tokens": 558056.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.84512098710984, |
| "epoch": 0.036, |
| "grad_norm": 100.5, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 17.2456, |
| "mean_token_accuracy": 0.7478047309443354, |
| "num_tokens": 627804.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8738276792690158, |
| "epoch": 0.04, |
| "grad_norm": 67.0, |
| "learning_rate": 1.98e-06, |
| "loss": 16.6896, |
| "mean_token_accuracy": 0.748936639726162, |
| "num_tokens": 697556.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_biology_entropy": 1.1379984815716744, |
| "eval_biology_loss": 1.2692387104034424, |
| "eval_biology_mean_token_accuracy": 0.6880569371581078, |
| "eval_biology_num_tokens": 697556.0, |
| "eval_biology_runtime": 40.5573, |
| "eval_biology_samples_per_second": 12.328, |
| "eval_biology_steps_per_second": 12.328, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_math_entropy": 0.9095822716951371, |
| "eval_math_loss": 1.0899367332458496, |
| "eval_math_mean_token_accuracy": 0.7395683622956276, |
| "eval_math_num_tokens": 697556.0, |
| "eval_math_runtime": 48.0541, |
| "eval_math_samples_per_second": 10.405, |
| "eval_math_steps_per_second": 10.405, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.9193563124164938, |
| "epoch": 0.044, |
| "grad_norm": 88.5, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 17.3763, |
| "mean_token_accuracy": 0.7420263335108757, |
| "num_tokens": 765262.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8906641118228436, |
| "epoch": 0.048, |
| "grad_norm": 78.0, |
| "learning_rate": 2.38e-06, |
| "loss": 16.2142, |
| "mean_token_accuracy": 0.7552934620529413, |
| "num_tokens": 836753.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9178624864667654, |
| "epoch": 0.052, |
| "grad_norm": 70.0, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 16.3594, |
| "mean_token_accuracy": 0.7507667759433388, |
| "num_tokens": 909947.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.9556385111063719, |
| "epoch": 0.056, |
| "grad_norm": 70.0, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 16.6438, |
| "mean_token_accuracy": 0.748611630871892, |
| "num_tokens": 981959.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.9541467605158687, |
| "epoch": 0.06, |
| "grad_norm": 86.0, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 16.5304, |
| "mean_token_accuracy": 0.7486465960741043, |
| "num_tokens": 1053624.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9960285406559706, |
| "epoch": 0.064, |
| "grad_norm": 78.0, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 16.9435, |
| "mean_token_accuracy": 0.7448480188846588, |
| "num_tokens": 1126446.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.9677659103646874, |
| "epoch": 0.068, |
| "grad_norm": 106.5, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 16.1013, |
| "mean_token_accuracy": 0.7561902595683933, |
| "num_tokens": 1196286.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9270429328083992, |
| "epoch": 0.072, |
| "grad_norm": 62.0, |
| "learning_rate": 3.58e-06, |
| "loss": 15.3741, |
| "mean_token_accuracy": 0.763100459985435, |
| "num_tokens": 1273905.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.9678281666710973, |
| "epoch": 0.076, |
| "grad_norm": 51.0, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 15.8618, |
| "mean_token_accuracy": 0.7534618094563484, |
| "num_tokens": 1346454.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.9695806819945574, |
| "epoch": 0.08, |
| "grad_norm": 61.25, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 15.7874, |
| "mean_token_accuracy": 0.7560878146439791, |
| "num_tokens": 1413596.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_biology_entropy": 1.214613477051258, |
| "eval_biology_loss": 1.2630934715270996, |
| "eval_biology_mean_token_accuracy": 0.6885634814500808, |
| "eval_biology_num_tokens": 1413596.0, |
| "eval_biology_runtime": 40.4294, |
| "eval_biology_samples_per_second": 12.367, |
| "eval_biology_steps_per_second": 12.367, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_math_entropy": 0.9574190213680267, |
| "eval_math_loss": 0.9871246218681335, |
| "eval_math_mean_token_accuracy": 0.7557044946551323, |
| "eval_math_num_tokens": 1413596.0, |
| "eval_math_runtime": 48.0895, |
| "eval_math_samples_per_second": 10.397, |
| "eval_math_steps_per_second": 10.397, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.8888338362798095, |
| "epoch": 0.084, |
| "grad_norm": 61.25, |
| "learning_rate": 4.18e-06, |
| "loss": 14.623, |
| "mean_token_accuracy": 0.7706882227212191, |
| "num_tokens": 1485147.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.906145141646266, |
| "epoch": 0.088, |
| "grad_norm": 53.5, |
| "learning_rate": 4.38e-06, |
| "loss": 14.8128, |
| "mean_token_accuracy": 0.7661576565355063, |
| "num_tokens": 1559450.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.95110107883811, |
| "epoch": 0.092, |
| "grad_norm": 63.25, |
| "learning_rate": 4.58e-06, |
| "loss": 15.2457, |
| "mean_token_accuracy": 0.761219710484147, |
| "num_tokens": 1631535.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.9411580273881555, |
| "epoch": 0.096, |
| "grad_norm": 51.0, |
| "learning_rate": 4.78e-06, |
| "loss": 14.938, |
| "mean_token_accuracy": 0.7646324578672647, |
| "num_tokens": 1701193.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.9254088591784239, |
| "epoch": 0.1, |
| "grad_norm": 56.25, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 14.6029, |
| "mean_token_accuracy": 0.7678503680974245, |
| "num_tokens": 1773596.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.9214777417480946, |
| "epoch": 0.104, |
| "grad_norm": 54.0, |
| "learning_rate": 5.18e-06, |
| "loss": 15.043, |
| "mean_token_accuracy": 0.7618352189660073, |
| "num_tokens": 1840165.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.9363527970388532, |
| "epoch": 0.108, |
| "grad_norm": 111.5, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 15.6052, |
| "mean_token_accuracy": 0.7564919672906398, |
| "num_tokens": 1908149.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.9113753678277134, |
| "epoch": 0.112, |
| "grad_norm": 53.0, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 14.4645, |
| "mean_token_accuracy": 0.7736385434865951, |
| "num_tokens": 1979232.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.916975767724216, |
| "epoch": 0.116, |
| "grad_norm": 54.5, |
| "learning_rate": 5.78e-06, |
| "loss": 15.0318, |
| "mean_token_accuracy": 0.7656373519450426, |
| "num_tokens": 2052039.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.915022067911923, |
| "epoch": 0.12, |
| "grad_norm": 46.25, |
| "learning_rate": 5.98e-06, |
| "loss": 14.7236, |
| "mean_token_accuracy": 0.7691222932189703, |
| "num_tokens": 2125815.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_biology_entropy": 1.2286294117569923, |
| "eval_biology_loss": 1.268429160118103, |
| "eval_biology_mean_token_accuracy": 0.6861296361088752, |
| "eval_biology_num_tokens": 2125815.0, |
| "eval_biology_runtime": 40.4544, |
| "eval_biology_samples_per_second": 12.36, |
| "eval_biology_steps_per_second": 12.36, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_math_entropy": 0.9302992832064628, |
| "eval_math_loss": 0.9273415803909302, |
| "eval_math_mean_token_accuracy": 0.7664831226468086, |
| "eval_math_num_tokens": 2125815.0, |
| "eval_math_runtime": 48.047, |
| "eval_math_samples_per_second": 10.406, |
| "eval_math_steps_per_second": 10.406, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9336849415674806, |
| "epoch": 0.124, |
| "grad_norm": 55.25, |
| "learning_rate": 6.18e-06, |
| "loss": 14.9674, |
| "mean_token_accuracy": 0.7661335166543722, |
| "num_tokens": 2201991.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.8756070390343667, |
| "epoch": 0.128, |
| "grad_norm": 52.25, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 14.0, |
| "mean_token_accuracy": 0.7786776568740607, |
| "num_tokens": 2274177.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.892756593041122, |
| "epoch": 0.132, |
| "grad_norm": 45.75, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 14.1896, |
| "mean_token_accuracy": 0.7731969062238931, |
| "num_tokens": 2342173.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.8899674592539668, |
| "epoch": 0.136, |
| "grad_norm": 56.5, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 14.2167, |
| "mean_token_accuracy": 0.7683363553136587, |
| "num_tokens": 2412435.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.9237651886418462, |
| "epoch": 0.14, |
| "grad_norm": 49.5, |
| "learning_rate": 6.98e-06, |
| "loss": 14.7203, |
| "mean_token_accuracy": 0.7684664275497198, |
| "num_tokens": 2481105.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.8494639024138451, |
| "epoch": 0.144, |
| "grad_norm": 49.0, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 13.6956, |
| "mean_token_accuracy": 0.7817885402590037, |
| "num_tokens": 2548445.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8341110894456506, |
| "epoch": 0.148, |
| "grad_norm": 44.0, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 13.2223, |
| "mean_token_accuracy": 0.789648124948144, |
| "num_tokens": 2621364.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.8608193326741457, |
| "epoch": 0.152, |
| "grad_norm": 52.25, |
| "learning_rate": 7.58e-06, |
| "loss": 13.8861, |
| "mean_token_accuracy": 0.7754677496850491, |
| "num_tokens": 2686462.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.8447565982118249, |
| "epoch": 0.156, |
| "grad_norm": 50.0, |
| "learning_rate": 7.78e-06, |
| "loss": 13.5199, |
| "mean_token_accuracy": 0.7812942665070295, |
| "num_tokens": 2754241.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.8711243279278278, |
| "epoch": 0.16, |
| "grad_norm": 79.0, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 14.1276, |
| "mean_token_accuracy": 0.7742310840636492, |
| "num_tokens": 2824418.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_biology_entropy": 1.22758097666502, |
| "eval_biology_loss": 1.2762199640274048, |
| "eval_biology_mean_token_accuracy": 0.6836535354852676, |
| "eval_biology_num_tokens": 2824418.0, |
| "eval_biology_runtime": 40.7281, |
| "eval_biology_samples_per_second": 12.277, |
| "eval_biology_steps_per_second": 12.277, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_math_entropy": 0.9047691844701767, |
| "eval_math_loss": 0.8841754198074341, |
| "eval_math_mean_token_accuracy": 0.775034920334816, |
| "eval_math_num_tokens": 2824418.0, |
| "eval_math_runtime": 47.9197, |
| "eval_math_samples_per_second": 10.434, |
| "eval_math_steps_per_second": 10.434, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8859184484928846, |
| "epoch": 0.164, |
| "grad_norm": 127.0, |
| "learning_rate": 8.18e-06, |
| "loss": 14.142, |
| "mean_token_accuracy": 0.7765266574919224, |
| "num_tokens": 2895667.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.8501209424808621, |
| "epoch": 0.168, |
| "grad_norm": 40.0, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 13.8435, |
| "mean_token_accuracy": 0.7774271417409182, |
| "num_tokens": 2966031.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.8510567891411484, |
| "epoch": 0.172, |
| "grad_norm": 45.75, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 13.3673, |
| "mean_token_accuracy": 0.7858892437070608, |
| "num_tokens": 3039780.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.8355857279151678, |
| "epoch": 0.176, |
| "grad_norm": 44.5, |
| "learning_rate": 8.78e-06, |
| "loss": 13.7383, |
| "mean_token_accuracy": 0.7784004233777523, |
| "num_tokens": 3110313.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.8662491450086236, |
| "epoch": 0.18, |
| "grad_norm": 43.75, |
| "learning_rate": 8.98e-06, |
| "loss": 13.6485, |
| "mean_token_accuracy": 0.7806661173701286, |
| "num_tokens": 3179963.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.8596813324838877, |
| "epoch": 0.184, |
| "grad_norm": 46.0, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 13.7717, |
| "mean_token_accuracy": 0.7796813258901238, |
| "num_tokens": 3251139.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.8454847941175103, |
| "epoch": 0.188, |
| "grad_norm": 56.25, |
| "learning_rate": 9.38e-06, |
| "loss": 13.4682, |
| "mean_token_accuracy": 0.7803339377045632, |
| "num_tokens": 3323519.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.8228314617648721, |
| "epoch": 0.192, |
| "grad_norm": 49.75, |
| "learning_rate": 9.58e-06, |
| "loss": 13.3842, |
| "mean_token_accuracy": 0.7827629748731851, |
| "num_tokens": 3394170.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.8164506537839771, |
| "epoch": 0.196, |
| "grad_norm": 54.25, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 13.1965, |
| "mean_token_accuracy": 0.7878578577190638, |
| "num_tokens": 3465341.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8838530456647277, |
| "epoch": 0.2, |
| "grad_norm": 51.0, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 13.9771, |
| "mean_token_accuracy": 0.7734297584742308, |
| "num_tokens": 3533571.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_biology_entropy": 1.2344867270588875, |
| "eval_biology_loss": 1.2817195653915405, |
| "eval_biology_mean_token_accuracy": 0.6832386125922203, |
| "eval_biology_num_tokens": 3533571.0, |
| "eval_biology_runtime": 40.4092, |
| "eval_biology_samples_per_second": 12.373, |
| "eval_biology_steps_per_second": 12.373, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_math_entropy": 0.8546719763875008, |
| "eval_math_loss": 0.8547734618186951, |
| "eval_math_mean_token_accuracy": 0.7804626071453095, |
| "eval_math_num_tokens": 3533571.0, |
| "eval_math_runtime": 48.0961, |
| "eval_math_samples_per_second": 10.396, |
| "eval_math_steps_per_second": 10.396, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.3694136442134013e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|