| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.22255605630668224, |
| "eval_steps": 100, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.8066153490915895, |
| "epoch": 0.004451121126133645, |
| "grad_norm": 107.5, |
| "learning_rate": 1.8e-07, |
| "loss": 16.3591, |
| "mean_token_accuracy": 0.7447933696210385, |
| "num_tokens": 63133.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.8401238698512316, |
| "epoch": 0.00890224225226729, |
| "grad_norm": 103.5, |
| "learning_rate": 3.8e-07, |
| "loss": 17.2936, |
| "mean_token_accuracy": 0.7330440735444427, |
| "num_tokens": 128968.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.828661117143929, |
| "epoch": 0.013353363378400934, |
| "grad_norm": 96.5, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 16.5809, |
| "mean_token_accuracy": 0.7381731692701579, |
| "num_tokens": 193314.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.8600894263014197, |
| "epoch": 0.01780448450453458, |
| "grad_norm": 96.0, |
| "learning_rate": 7.8e-07, |
| "loss": 17.7444, |
| "mean_token_accuracy": 0.7324823886156082, |
| "num_tokens": 257189.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8017022017389536, |
| "epoch": 0.022255605630668224, |
| "grad_norm": 84.5, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 15.9063, |
| "mean_token_accuracy": 0.7455223135650157, |
| "num_tokens": 324224.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.846268966794014, |
| "epoch": 0.026706726756801868, |
| "grad_norm": 78.5, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 16.6006, |
| "mean_token_accuracy": 0.7411292420700193, |
| "num_tokens": 394654.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8181243563070894, |
| "epoch": 0.031157847882935515, |
| "grad_norm": 76.5, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 15.9846, |
| "mean_token_accuracy": 0.746401545777917, |
| "num_tokens": 461658.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8314600124955177, |
| "epoch": 0.03560896900906916, |
| "grad_norm": 75.0, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 15.8227, |
| "mean_token_accuracy": 0.7472479769960045, |
| "num_tokens": 528285.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8807666478678584, |
| "epoch": 0.04006009013520281, |
| "grad_norm": 72.0, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 16.6834, |
| "mean_token_accuracy": 0.7396679904311896, |
| "num_tokens": 595336.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8991396678611636, |
| "epoch": 0.04451121126133645, |
| "grad_norm": 79.5, |
| "learning_rate": 1.98e-06, |
| "loss": 16.3968, |
| "mean_token_accuracy": 0.7396075185388327, |
| "num_tokens": 663126.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04451121126133645, |
| "eval_biology_entropy": 1.1188154411315918, |
| "eval_biology_loss": 1.2691835165023804, |
| "eval_biology_mean_token_accuracy": 0.6881385813951493, |
| "eval_biology_num_tokens": 663126.0, |
| "eval_biology_runtime": 40.4565, |
| "eval_biology_samples_per_second": 12.359, |
| "eval_biology_steps_per_second": 12.359, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04451121126133645, |
| "eval_chemistry_entropy": 0.8804921235442161, |
| "eval_chemistry_loss": 1.0042469501495361, |
| "eval_chemistry_mean_token_accuracy": 0.7444319971203804, |
| "eval_chemistry_num_tokens": 663126.0, |
| "eval_chemistry_runtime": 46.3585, |
| "eval_chemistry_samples_per_second": 10.786, |
| "eval_chemistry_steps_per_second": 10.786, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8988691195845604, |
| "epoch": 0.048962332387470095, |
| "grad_norm": 100.5, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 16.3063, |
| "mean_token_accuracy": 0.7414613764733076, |
| "num_tokens": 731060.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8918870648369193, |
| "epoch": 0.053413453513603736, |
| "grad_norm": 77.5, |
| "learning_rate": 2.38e-06, |
| "loss": 15.8311, |
| "mean_token_accuracy": 0.7483173958957196, |
| "num_tokens": 801635.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9162682231515646, |
| "epoch": 0.05786457463973738, |
| "grad_norm": 66.0, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 16.1464, |
| "mean_token_accuracy": 0.7448406910523773, |
| "num_tokens": 867260.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.9749668512493372, |
| "epoch": 0.06231569576587103, |
| "grad_norm": 79.5, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 16.9562, |
| "mean_token_accuracy": 0.7369577366858721, |
| "num_tokens": 931344.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.939477625861764, |
| "epoch": 0.06676681689200467, |
| "grad_norm": 56.75, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 15.7327, |
| "mean_token_accuracy": 0.7471803797408938, |
| "num_tokens": 993586.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9718907386064529, |
| "epoch": 0.07121793801813832, |
| "grad_norm": 53.75, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 16.2036, |
| "mean_token_accuracy": 0.7392314806580543, |
| "num_tokens": 1060812.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.9165311623364687, |
| "epoch": 0.07566905914427197, |
| "grad_norm": 63.25, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 15.2657, |
| "mean_token_accuracy": 0.7518215283751488, |
| "num_tokens": 1131832.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9465073021128774, |
| "epoch": 0.08012018027040561, |
| "grad_norm": 50.25, |
| "learning_rate": 3.58e-06, |
| "loss": 15.5603, |
| "mean_token_accuracy": 0.7473610159009695, |
| "num_tokens": 1200650.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.9450380651280283, |
| "epoch": 0.08457130139653925, |
| "grad_norm": 54.0, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 15.56, |
| "mean_token_accuracy": 0.748077143356204, |
| "num_tokens": 1265107.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.9630168141797185, |
| "epoch": 0.0890224225226729, |
| "grad_norm": 58.75, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 15.5029, |
| "mean_token_accuracy": 0.7486971555277705, |
| "num_tokens": 1327380.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0890224225226729, |
| "eval_biology_entropy": 1.19531831908226, |
| "eval_biology_loss": 1.2584387063980103, |
| "eval_biology_mean_token_accuracy": 0.6882349443435669, |
| "eval_biology_num_tokens": 1327380.0, |
| "eval_biology_runtime": 40.8783, |
| "eval_biology_samples_per_second": 12.231, |
| "eval_biology_steps_per_second": 12.231, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0890224225226729, |
| "eval_chemistry_entropy": 0.9437598274946213, |
| "eval_chemistry_loss": 0.9655953645706177, |
| "eval_chemistry_mean_token_accuracy": 0.7501006088852883, |
| "eval_chemistry_num_tokens": 1327380.0, |
| "eval_chemistry_runtime": 46.6023, |
| "eval_chemistry_samples_per_second": 10.729, |
| "eval_chemistry_steps_per_second": 10.729, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.9782480053603649, |
| "epoch": 0.09347354364880654, |
| "grad_norm": 64.0, |
| "learning_rate": 4.18e-06, |
| "loss": 15.9821, |
| "mean_token_accuracy": 0.7436690799891948, |
| "num_tokens": 1393379.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.9402019061148167, |
| "epoch": 0.09792466477494019, |
| "grad_norm": 54.0, |
| "learning_rate": 4.38e-06, |
| "loss": 15.3729, |
| "mean_token_accuracy": 0.7516820874065161, |
| "num_tokens": 1460130.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.9247835712507367, |
| "epoch": 0.10237578590107384, |
| "grad_norm": 54.5, |
| "learning_rate": 4.58e-06, |
| "loss": 15.0731, |
| "mean_token_accuracy": 0.7559274602681398, |
| "num_tokens": 1529183.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.9673028320074082, |
| "epoch": 0.10682690702720747, |
| "grad_norm": 71.5, |
| "learning_rate": 4.78e-06, |
| "loss": 15.731, |
| "mean_token_accuracy": 0.7473661951720715, |
| "num_tokens": 1597405.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.9974556604400278, |
| "epoch": 0.11127802815334112, |
| "grad_norm": 55.0, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 15.8962, |
| "mean_token_accuracy": 0.7434635870158672, |
| "num_tokens": 1661767.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.985609365440905, |
| "epoch": 0.11572914927947477, |
| "grad_norm": 50.75, |
| "learning_rate": 5.18e-06, |
| "loss": 16.0267, |
| "mean_token_accuracy": 0.7446854375302792, |
| "num_tokens": 1728207.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.9456103699281811, |
| "epoch": 0.12018027040560841, |
| "grad_norm": 58.0, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 15.3377, |
| "mean_token_accuracy": 0.750850186496973, |
| "num_tokens": 1796055.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.9541573049500585, |
| "epoch": 0.12463139153174206, |
| "grad_norm": 56.5, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 15.2532, |
| "mean_token_accuracy": 0.7530262626707553, |
| "num_tokens": 1859684.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.9941559780389071, |
| "epoch": 0.1290825126578757, |
| "grad_norm": 59.25, |
| "learning_rate": 5.78e-06, |
| "loss": 15.94, |
| "mean_token_accuracy": 0.745009395852685, |
| "num_tokens": 1921704.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.981252990104258, |
| "epoch": 0.13353363378400934, |
| "grad_norm": 56.75, |
| "learning_rate": 5.98e-06, |
| "loss": 15.8943, |
| "mean_token_accuracy": 0.7449573867022992, |
| "num_tokens": 1985766.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13353363378400934, |
| "eval_biology_entropy": 1.2063790675401687, |
| "eval_biology_loss": 1.2592713832855225, |
| "eval_biology_mean_token_accuracy": 0.6874204781055451, |
| "eval_biology_num_tokens": 1985766.0, |
| "eval_biology_runtime": 40.5061, |
| "eval_biology_samples_per_second": 12.344, |
| "eval_biology_steps_per_second": 12.344, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13353363378400934, |
| "eval_chemistry_entropy": 0.9332761432528496, |
| "eval_chemistry_loss": 0.9383891820907593, |
| "eval_chemistry_mean_token_accuracy": 0.7543213546276093, |
| "eval_chemistry_num_tokens": 1985766.0, |
| "eval_chemistry_runtime": 46.2485, |
| "eval_chemistry_samples_per_second": 10.811, |
| "eval_chemistry_steps_per_second": 10.811, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9657653540372848, |
| "epoch": 0.137984754910143, |
| "grad_norm": 66.5, |
| "learning_rate": 6.18e-06, |
| "loss": 15.5291, |
| "mean_token_accuracy": 0.7452911786735058, |
| "num_tokens": 2049041.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.922235100530088, |
| "epoch": 0.14243587603627664, |
| "grad_norm": 49.75, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 14.7109, |
| "mean_token_accuracy": 0.75825478695333, |
| "num_tokens": 2118068.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.9044711474329233, |
| "epoch": 0.14688699716241027, |
| "grad_norm": 60.25, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 14.5687, |
| "mean_token_accuracy": 0.7618395145982504, |
| "num_tokens": 2186387.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.946225673891604, |
| "epoch": 0.15133811828854393, |
| "grad_norm": 52.5, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 15.1959, |
| "mean_token_accuracy": 0.7535316452383996, |
| "num_tokens": 2252650.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.9036338411271572, |
| "epoch": 0.15578923941467757, |
| "grad_norm": 57.75, |
| "learning_rate": 6.98e-06, |
| "loss": 14.5854, |
| "mean_token_accuracy": 0.7611672822386026, |
| "num_tokens": 2320358.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.9015818448737264, |
| "epoch": 0.16024036054081123, |
| "grad_norm": 49.5, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 14.5381, |
| "mean_token_accuracy": 0.7606555309146643, |
| "num_tokens": 2388824.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8864203749224544, |
| "epoch": 0.16469148166694486, |
| "grad_norm": 49.25, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 14.1936, |
| "mean_token_accuracy": 0.7665066320449114, |
| "num_tokens": 2456144.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.9866490814834833, |
| "epoch": 0.1691426027930785, |
| "grad_norm": 49.5, |
| "learning_rate": 7.58e-06, |
| "loss": 15.8412, |
| "mean_token_accuracy": 0.7478409979492426, |
| "num_tokens": 2515325.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.9080646676942706, |
| "epoch": 0.17359372391921216, |
| "grad_norm": 48.25, |
| "learning_rate": 7.78e-06, |
| "loss": 14.5343, |
| "mean_token_accuracy": 0.7594566397368908, |
| "num_tokens": 2580490.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.9095059128478169, |
| "epoch": 0.1780448450453458, |
| "grad_norm": 43.5, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 14.6647, |
| "mean_token_accuracy": 0.7607249341905117, |
| "num_tokens": 2644330.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1780448450453458, |
| "eval_biology_entropy": 1.1971934199333192, |
| "eval_biology_loss": 1.2638347148895264, |
| "eval_biology_mean_token_accuracy": 0.6877701328396797, |
| "eval_biology_num_tokens": 2644330.0, |
| "eval_biology_runtime": 39.8022, |
| "eval_biology_samples_per_second": 12.562, |
| "eval_biology_steps_per_second": 12.562, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1780448450453458, |
| "eval_chemistry_entropy": 0.8957328157424926, |
| "eval_chemistry_loss": 0.9148933291435242, |
| "eval_chemistry_mean_token_accuracy": 0.7592848987579346, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 46.2334, |
| "eval_chemistry_samples_per_second": 10.815, |
| "eval_chemistry_steps_per_second": 10.815, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8526191784068942, |
| "epoch": 0.18249596617147945, |
| "grad_norm": 50.0, |
| "learning_rate": 8.18e-06, |
| "loss": 13.7182, |
| "mean_token_accuracy": 0.7745671790093184, |
| "num_tokens": 2713234.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.932603782787919, |
| "epoch": 0.18694708729761308, |
| "grad_norm": 47.5, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 15.028, |
| "mean_token_accuracy": 0.7569812458008528, |
| "num_tokens": 2783261.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.9143912255764007, |
| "epoch": 0.19139820842374672, |
| "grad_norm": 43.0, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 14.7724, |
| "mean_token_accuracy": 0.7596961252391339, |
| "num_tokens": 2850170.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.9113649705424904, |
| "epoch": 0.19584932954988038, |
| "grad_norm": 58.5, |
| "learning_rate": 8.78e-06, |
| "loss": 14.6432, |
| "mean_token_accuracy": 0.760087676718831, |
| "num_tokens": 2913700.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.9072460785508156, |
| "epoch": 0.200300450676014, |
| "grad_norm": 46.5, |
| "learning_rate": 8.98e-06, |
| "loss": 14.609, |
| "mean_token_accuracy": 0.7596194025129079, |
| "num_tokens": 2981212.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.8693920068442822, |
| "epoch": 0.20475157180214767, |
| "grad_norm": 53.25, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 13.9505, |
| "mean_token_accuracy": 0.771946213953197, |
| "num_tokens": 3048973.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.9109398307278752, |
| "epoch": 0.2092026929282813, |
| "grad_norm": 47.25, |
| "learning_rate": 9.38e-06, |
| "loss": 14.614, |
| "mean_token_accuracy": 0.7599313069134951, |
| "num_tokens": 3117033.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.8936059167608619, |
| "epoch": 0.21365381405441494, |
| "grad_norm": 50.25, |
| "learning_rate": 9.58e-06, |
| "loss": 14.4662, |
| "mean_token_accuracy": 0.7608913701027632, |
| "num_tokens": 3185255.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.9031545946374535, |
| "epoch": 0.2181049351805486, |
| "grad_norm": 51.5, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 14.5197, |
| "mean_token_accuracy": 0.7608289115130902, |
| "num_tokens": 3253121.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8450184227898717, |
| "epoch": 0.22255605630668224, |
| "grad_norm": 49.5, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 13.4565, |
| "mean_token_accuracy": 0.7740382503718137, |
| "num_tokens": 3322823.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22255605630668224, |
| "eval_biology_entropy": 1.2025449865460396, |
| "eval_biology_loss": 1.267388939857483, |
| "eval_biology_mean_token_accuracy": 0.6874357106685638, |
| "eval_biology_num_tokens": 3322823.0, |
| "eval_biology_runtime": 40.0827, |
| "eval_biology_samples_per_second": 12.474, |
| "eval_biology_steps_per_second": 12.474, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22255605630668224, |
| "eval_chemistry_entropy": 0.8795891938209534, |
| "eval_chemistry_loss": 0.8957814574241638, |
| "eval_chemistry_mean_token_accuracy": 0.7629851229190826, |
| "eval_chemistry_num_tokens": 3322823.0, |
| "eval_chemistry_runtime": 46.3478, |
| "eval_chemistry_samples_per_second": 10.788, |
| "eval_chemistry_steps_per_second": 10.788, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2280979081801686e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|