| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.4451121126133645, |
| "eval_steps": 100, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.8066153490915895, |
| "epoch": 0.004451121126133645, |
| "grad_norm": 107.5, |
| "learning_rate": 1.8e-07, |
| "loss": 16.3591, |
| "mean_token_accuracy": 0.7447933696210385, |
| "num_tokens": 63133.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.8401238698512316, |
| "epoch": 0.00890224225226729, |
| "grad_norm": 103.5, |
| "learning_rate": 3.8e-07, |
| "loss": 17.2936, |
| "mean_token_accuracy": 0.7330440735444427, |
| "num_tokens": 128968.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.828661117143929, |
| "epoch": 0.013353363378400934, |
| "grad_norm": 96.5, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 16.5809, |
| "mean_token_accuracy": 0.7381731692701579, |
| "num_tokens": 193314.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.8600894263014197, |
| "epoch": 0.01780448450453458, |
| "grad_norm": 96.0, |
| "learning_rate": 7.8e-07, |
| "loss": 17.7444, |
| "mean_token_accuracy": 0.7324823886156082, |
| "num_tokens": 257189.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8017022017389536, |
| "epoch": 0.022255605630668224, |
| "grad_norm": 84.5, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 15.9063, |
| "mean_token_accuracy": 0.7455223135650157, |
| "num_tokens": 324224.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.846268966794014, |
| "epoch": 0.026706726756801868, |
| "grad_norm": 78.5, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 16.6006, |
| "mean_token_accuracy": 0.7411292420700193, |
| "num_tokens": 394654.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8181243563070894, |
| "epoch": 0.031157847882935515, |
| "grad_norm": 76.5, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 15.9846, |
| "mean_token_accuracy": 0.746401545777917, |
| "num_tokens": 461658.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8314600124955177, |
| "epoch": 0.03560896900906916, |
| "grad_norm": 75.0, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 15.8227, |
| "mean_token_accuracy": 0.7472479769960045, |
| "num_tokens": 528285.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8807666478678584, |
| "epoch": 0.04006009013520281, |
| "grad_norm": 72.0, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 16.6834, |
| "mean_token_accuracy": 0.7396679904311896, |
| "num_tokens": 595336.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8991396678611636, |
| "epoch": 0.04451121126133645, |
| "grad_norm": 79.5, |
| "learning_rate": 1.98e-06, |
| "loss": 16.3968, |
| "mean_token_accuracy": 0.7396075185388327, |
| "num_tokens": 663126.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04451121126133645, |
| "eval_biology_entropy": 1.1188154411315918, |
| "eval_biology_loss": 1.2691835165023804, |
| "eval_biology_mean_token_accuracy": 0.6881385813951493, |
| "eval_biology_num_tokens": 663126.0, |
| "eval_biology_runtime": 40.4565, |
| "eval_biology_samples_per_second": 12.359, |
| "eval_biology_steps_per_second": 12.359, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04451121126133645, |
| "eval_chemistry_entropy": 0.8804921235442161, |
| "eval_chemistry_loss": 1.0042469501495361, |
| "eval_chemistry_mean_token_accuracy": 0.7444319971203804, |
| "eval_chemistry_num_tokens": 663126.0, |
| "eval_chemistry_runtime": 46.3585, |
| "eval_chemistry_samples_per_second": 10.786, |
| "eval_chemistry_steps_per_second": 10.786, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8988691195845604, |
| "epoch": 0.048962332387470095, |
| "grad_norm": 100.5, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 16.3063, |
| "mean_token_accuracy": 0.7414613764733076, |
| "num_tokens": 731060.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8918870648369193, |
| "epoch": 0.053413453513603736, |
| "grad_norm": 77.5, |
| "learning_rate": 2.38e-06, |
| "loss": 15.8311, |
| "mean_token_accuracy": 0.7483173958957196, |
| "num_tokens": 801635.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9162682231515646, |
| "epoch": 0.05786457463973738, |
| "grad_norm": 66.0, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 16.1464, |
| "mean_token_accuracy": 0.7448406910523773, |
| "num_tokens": 867260.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.9749668512493372, |
| "epoch": 0.06231569576587103, |
| "grad_norm": 79.5, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 16.9562, |
| "mean_token_accuracy": 0.7369577366858721, |
| "num_tokens": 931344.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.939477625861764, |
| "epoch": 0.06676681689200467, |
| "grad_norm": 56.75, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 15.7327, |
| "mean_token_accuracy": 0.7471803797408938, |
| "num_tokens": 993586.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9718907386064529, |
| "epoch": 0.07121793801813832, |
| "grad_norm": 53.75, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 16.2036, |
| "mean_token_accuracy": 0.7392314806580543, |
| "num_tokens": 1060812.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.9165311623364687, |
| "epoch": 0.07566905914427197, |
| "grad_norm": 63.25, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 15.2657, |
| "mean_token_accuracy": 0.7518215283751488, |
| "num_tokens": 1131832.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9465073021128774, |
| "epoch": 0.08012018027040561, |
| "grad_norm": 50.25, |
| "learning_rate": 3.58e-06, |
| "loss": 15.5603, |
| "mean_token_accuracy": 0.7473610159009695, |
| "num_tokens": 1200650.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.9450380651280283, |
| "epoch": 0.08457130139653925, |
| "grad_norm": 54.0, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 15.56, |
| "mean_token_accuracy": 0.748077143356204, |
| "num_tokens": 1265107.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.9630168141797185, |
| "epoch": 0.0890224225226729, |
| "grad_norm": 58.75, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 15.5029, |
| "mean_token_accuracy": 0.7486971555277705, |
| "num_tokens": 1327380.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0890224225226729, |
| "eval_biology_entropy": 1.19531831908226, |
| "eval_biology_loss": 1.2584387063980103, |
| "eval_biology_mean_token_accuracy": 0.6882349443435669, |
| "eval_biology_num_tokens": 1327380.0, |
| "eval_biology_runtime": 40.8783, |
| "eval_biology_samples_per_second": 12.231, |
| "eval_biology_steps_per_second": 12.231, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0890224225226729, |
| "eval_chemistry_entropy": 0.9437598274946213, |
| "eval_chemistry_loss": 0.9655953645706177, |
| "eval_chemistry_mean_token_accuracy": 0.7501006088852883, |
| "eval_chemistry_num_tokens": 1327380.0, |
| "eval_chemistry_runtime": 46.6023, |
| "eval_chemistry_samples_per_second": 10.729, |
| "eval_chemistry_steps_per_second": 10.729, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.9782480053603649, |
| "epoch": 0.09347354364880654, |
| "grad_norm": 64.0, |
| "learning_rate": 4.18e-06, |
| "loss": 15.9821, |
| "mean_token_accuracy": 0.7436690799891948, |
| "num_tokens": 1393379.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.9402019061148167, |
| "epoch": 0.09792466477494019, |
| "grad_norm": 54.0, |
| "learning_rate": 4.38e-06, |
| "loss": 15.3729, |
| "mean_token_accuracy": 0.7516820874065161, |
| "num_tokens": 1460130.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.9247835712507367, |
| "epoch": 0.10237578590107384, |
| "grad_norm": 54.5, |
| "learning_rate": 4.58e-06, |
| "loss": 15.0731, |
| "mean_token_accuracy": 0.7559274602681398, |
| "num_tokens": 1529183.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.9673028320074082, |
| "epoch": 0.10682690702720747, |
| "grad_norm": 71.5, |
| "learning_rate": 4.78e-06, |
| "loss": 15.731, |
| "mean_token_accuracy": 0.7473661951720715, |
| "num_tokens": 1597405.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.9974556604400278, |
| "epoch": 0.11127802815334112, |
| "grad_norm": 55.0, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 15.8962, |
| "mean_token_accuracy": 0.7434635870158672, |
| "num_tokens": 1661767.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.985609365440905, |
| "epoch": 0.11572914927947477, |
| "grad_norm": 50.75, |
| "learning_rate": 5.18e-06, |
| "loss": 16.0267, |
| "mean_token_accuracy": 0.7446854375302792, |
| "num_tokens": 1728207.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.9456103699281811, |
| "epoch": 0.12018027040560841, |
| "grad_norm": 58.0, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 15.3377, |
| "mean_token_accuracy": 0.750850186496973, |
| "num_tokens": 1796055.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.9541573049500585, |
| "epoch": 0.12463139153174206, |
| "grad_norm": 56.5, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 15.2532, |
| "mean_token_accuracy": 0.7530262626707553, |
| "num_tokens": 1859684.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.9941559780389071, |
| "epoch": 0.1290825126578757, |
| "grad_norm": 59.25, |
| "learning_rate": 5.78e-06, |
| "loss": 15.94, |
| "mean_token_accuracy": 0.745009395852685, |
| "num_tokens": 1921704.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.981252990104258, |
| "epoch": 0.13353363378400934, |
| "grad_norm": 56.75, |
| "learning_rate": 5.98e-06, |
| "loss": 15.8943, |
| "mean_token_accuracy": 0.7449573867022992, |
| "num_tokens": 1985766.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13353363378400934, |
| "eval_biology_entropy": 1.2063790675401687, |
| "eval_biology_loss": 1.2592713832855225, |
| "eval_biology_mean_token_accuracy": 0.6874204781055451, |
| "eval_biology_num_tokens": 1985766.0, |
| "eval_biology_runtime": 40.5061, |
| "eval_biology_samples_per_second": 12.344, |
| "eval_biology_steps_per_second": 12.344, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13353363378400934, |
| "eval_chemistry_entropy": 0.9332761432528496, |
| "eval_chemistry_loss": 0.9383891820907593, |
| "eval_chemistry_mean_token_accuracy": 0.7543213546276093, |
| "eval_chemistry_num_tokens": 1985766.0, |
| "eval_chemistry_runtime": 46.2485, |
| "eval_chemistry_samples_per_second": 10.811, |
| "eval_chemistry_steps_per_second": 10.811, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9657653540372848, |
| "epoch": 0.137984754910143, |
| "grad_norm": 66.5, |
| "learning_rate": 6.18e-06, |
| "loss": 15.5291, |
| "mean_token_accuracy": 0.7452911786735058, |
| "num_tokens": 2049041.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.922235100530088, |
| "epoch": 0.14243587603627664, |
| "grad_norm": 49.75, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 14.7109, |
| "mean_token_accuracy": 0.75825478695333, |
| "num_tokens": 2118068.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.9044711474329233, |
| "epoch": 0.14688699716241027, |
| "grad_norm": 60.25, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 14.5687, |
| "mean_token_accuracy": 0.7618395145982504, |
| "num_tokens": 2186387.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.946225673891604, |
| "epoch": 0.15133811828854393, |
| "grad_norm": 52.5, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 15.1959, |
| "mean_token_accuracy": 0.7535316452383996, |
| "num_tokens": 2252650.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.9036338411271572, |
| "epoch": 0.15578923941467757, |
| "grad_norm": 57.75, |
| "learning_rate": 6.98e-06, |
| "loss": 14.5854, |
| "mean_token_accuracy": 0.7611672822386026, |
| "num_tokens": 2320358.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.9015818448737264, |
| "epoch": 0.16024036054081123, |
| "grad_norm": 49.5, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 14.5381, |
| "mean_token_accuracy": 0.7606555309146643, |
| "num_tokens": 2388824.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8864203749224544, |
| "epoch": 0.16469148166694486, |
| "grad_norm": 49.25, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 14.1936, |
| "mean_token_accuracy": 0.7665066320449114, |
| "num_tokens": 2456144.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.9866490814834833, |
| "epoch": 0.1691426027930785, |
| "grad_norm": 49.5, |
| "learning_rate": 7.58e-06, |
| "loss": 15.8412, |
| "mean_token_accuracy": 0.7478409979492426, |
| "num_tokens": 2515325.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.9080646676942706, |
| "epoch": 0.17359372391921216, |
| "grad_norm": 48.25, |
| "learning_rate": 7.78e-06, |
| "loss": 14.5343, |
| "mean_token_accuracy": 0.7594566397368908, |
| "num_tokens": 2580490.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.9095059128478169, |
| "epoch": 0.1780448450453458, |
| "grad_norm": 43.5, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 14.6647, |
| "mean_token_accuracy": 0.7607249341905117, |
| "num_tokens": 2644330.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1780448450453458, |
| "eval_biology_entropy": 1.1971934199333192, |
| "eval_biology_loss": 1.2638347148895264, |
| "eval_biology_mean_token_accuracy": 0.6877701328396797, |
| "eval_biology_num_tokens": 2644330.0, |
| "eval_biology_runtime": 39.8022, |
| "eval_biology_samples_per_second": 12.562, |
| "eval_biology_steps_per_second": 12.562, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1780448450453458, |
| "eval_chemistry_entropy": 0.8957328157424926, |
| "eval_chemistry_loss": 0.9148933291435242, |
| "eval_chemistry_mean_token_accuracy": 0.7592848987579346, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 46.2334, |
| "eval_chemistry_samples_per_second": 10.815, |
| "eval_chemistry_steps_per_second": 10.815, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8526191784068942, |
| "epoch": 0.18249596617147945, |
| "grad_norm": 50.0, |
| "learning_rate": 8.18e-06, |
| "loss": 13.7182, |
| "mean_token_accuracy": 0.7745671790093184, |
| "num_tokens": 2713234.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.932603782787919, |
| "epoch": 0.18694708729761308, |
| "grad_norm": 47.5, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 15.028, |
| "mean_token_accuracy": 0.7569812458008528, |
| "num_tokens": 2783261.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.9143912255764007, |
| "epoch": 0.19139820842374672, |
| "grad_norm": 43.0, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 14.7724, |
| "mean_token_accuracy": 0.7596961252391339, |
| "num_tokens": 2850170.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.9113649705424904, |
| "epoch": 0.19584932954988038, |
| "grad_norm": 58.5, |
| "learning_rate": 8.78e-06, |
| "loss": 14.6432, |
| "mean_token_accuracy": 0.760087676718831, |
| "num_tokens": 2913700.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.9072460785508156, |
| "epoch": 0.200300450676014, |
| "grad_norm": 46.5, |
| "learning_rate": 8.98e-06, |
| "loss": 14.609, |
| "mean_token_accuracy": 0.7596194025129079, |
| "num_tokens": 2981212.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.8693920068442822, |
| "epoch": 0.20475157180214767, |
| "grad_norm": 53.25, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 13.9505, |
| "mean_token_accuracy": 0.771946213953197, |
| "num_tokens": 3048973.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.9109398307278752, |
| "epoch": 0.2092026929282813, |
| "grad_norm": 47.25, |
| "learning_rate": 9.38e-06, |
| "loss": 14.614, |
| "mean_token_accuracy": 0.7599313069134951, |
| "num_tokens": 3117033.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.8936059167608619, |
| "epoch": 0.21365381405441494, |
| "grad_norm": 50.25, |
| "learning_rate": 9.58e-06, |
| "loss": 14.4662, |
| "mean_token_accuracy": 0.7608913701027632, |
| "num_tokens": 3185255.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.9031545946374535, |
| "epoch": 0.2181049351805486, |
| "grad_norm": 51.5, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 14.5197, |
| "mean_token_accuracy": 0.7608289115130902, |
| "num_tokens": 3253121.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8450184227898717, |
| "epoch": 0.22255605630668224, |
| "grad_norm": 49.5, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 13.4565, |
| "mean_token_accuracy": 0.7740382503718137, |
| "num_tokens": 3322823.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22255605630668224, |
| "eval_biology_entropy": 1.2025449865460396, |
| "eval_biology_loss": 1.267388939857483, |
| "eval_biology_mean_token_accuracy": 0.6874357106685638, |
| "eval_biology_num_tokens": 3322823.0, |
| "eval_biology_runtime": 40.0827, |
| "eval_biology_samples_per_second": 12.474, |
| "eval_biology_steps_per_second": 12.474, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22255605630668224, |
| "eval_chemistry_entropy": 0.8795891938209534, |
| "eval_chemistry_loss": 0.8957814574241638, |
| "eval_chemistry_mean_token_accuracy": 0.7629851229190826, |
| "eval_chemistry_num_tokens": 3322823.0, |
| "eval_chemistry_runtime": 46.3478, |
| "eval_chemistry_samples_per_second": 10.788, |
| "eval_chemistry_steps_per_second": 10.788, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.8574318964034319, |
| "epoch": 0.2270071774328159, |
| "grad_norm": 45.25, |
| "learning_rate": 1.018e-05, |
| "loss": 13.9042, |
| "mean_token_accuracy": 0.7678481444716454, |
| "num_tokens": 3393393.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.9288356432691216, |
| "epoch": 0.23145829855894953, |
| "grad_norm": 57.25, |
| "learning_rate": 1.038e-05, |
| "loss": 15.193, |
| "mean_token_accuracy": 0.7506348451599478, |
| "num_tokens": 3454750.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.8564269673079252, |
| "epoch": 0.2359094196850832, |
| "grad_norm": 50.75, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 13.7644, |
| "mean_token_accuracy": 0.7751363463699817, |
| "num_tokens": 3526914.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.93802858479321, |
| "epoch": 0.24036054081121683, |
| "grad_norm": 45.0, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 15.0494, |
| "mean_token_accuracy": 0.7559149663895368, |
| "num_tokens": 3589449.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.9012869004160166, |
| "epoch": 0.24481166193735046, |
| "grad_norm": 50.25, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 14.2344, |
| "mean_token_accuracy": 0.7642816316336394, |
| "num_tokens": 3655092.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.8946880368515849, |
| "epoch": 0.24926278306348412, |
| "grad_norm": 57.25, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 14.5832, |
| "mean_token_accuracy": 0.7632339514791966, |
| "num_tokens": 3719113.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.8879899585619568, |
| "epoch": 0.25371390418961776, |
| "grad_norm": 44.5, |
| "learning_rate": 1.138e-05, |
| "loss": 14.3692, |
| "mean_token_accuracy": 0.7596106130629778, |
| "num_tokens": 3785282.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.9068781601265072, |
| "epoch": 0.2581650253157514, |
| "grad_norm": 42.25, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 14.6096, |
| "mean_token_accuracy": 0.761234056390822, |
| "num_tokens": 3852276.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.8276193620637059, |
| "epoch": 0.262616146441885, |
| "grad_norm": 43.25, |
| "learning_rate": 1.178e-05, |
| "loss": 13.3892, |
| "mean_token_accuracy": 0.7755540499463678, |
| "num_tokens": 3925649.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.8920110030099749, |
| "epoch": 0.2670672675680187, |
| "grad_norm": 52.25, |
| "learning_rate": 1.198e-05, |
| "loss": 14.0197, |
| "mean_token_accuracy": 0.766492671892047, |
| "num_tokens": 3990505.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.2670672675680187, |
| "eval_biology_entropy": 1.2123423200249672, |
| "eval_biology_loss": 1.2729928493499756, |
| "eval_biology_mean_token_accuracy": 0.6870606996417046, |
| "eval_biology_num_tokens": 3990505.0, |
| "eval_biology_runtime": 39.7983, |
| "eval_biology_samples_per_second": 12.563, |
| "eval_biology_steps_per_second": 12.563, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.2670672675680187, |
| "eval_chemistry_entropy": 0.8659857953190804, |
| "eval_chemistry_loss": 0.8781383037567139, |
| "eval_chemistry_mean_token_accuracy": 0.7664026654362679, |
| "eval_chemistry_num_tokens": 3990505.0, |
| "eval_chemistry_runtime": 46.1775, |
| "eval_chemistry_samples_per_second": 10.828, |
| "eval_chemistry_steps_per_second": 10.828, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.8550863016396761, |
| "epoch": 0.27151838869415235, |
| "grad_norm": 51.25, |
| "learning_rate": 1.218e-05, |
| "loss": 13.8181, |
| "mean_token_accuracy": 0.7682087656110526, |
| "num_tokens": 4060657.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.8537623688578606, |
| "epoch": 0.275969509820286, |
| "grad_norm": 42.25, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 13.6601, |
| "mean_token_accuracy": 0.772542554140091, |
| "num_tokens": 4133119.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.8621461872011423, |
| "epoch": 0.2804206309464196, |
| "grad_norm": 49.25, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 13.9865, |
| "mean_token_accuracy": 0.7680005200207234, |
| "num_tokens": 4200051.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.9106066713109613, |
| "epoch": 0.2848717520725533, |
| "grad_norm": 49.25, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 14.717, |
| "mean_token_accuracy": 0.7637291874736547, |
| "num_tokens": 4267403.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.868153141438961, |
| "epoch": 0.28932287319868694, |
| "grad_norm": 52.25, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 13.8988, |
| "mean_token_accuracy": 0.7680647127330303, |
| "num_tokens": 4333268.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.8451825473457575, |
| "epoch": 0.29377399432482054, |
| "grad_norm": 53.5, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 13.5394, |
| "mean_token_accuracy": 0.7747031616047024, |
| "num_tokens": 4400242.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.8826779069378972, |
| "epoch": 0.2982251154509542, |
| "grad_norm": 49.75, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 14.0787, |
| "mean_token_accuracy": 0.7660945057868958, |
| "num_tokens": 4464714.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.8183048281818628, |
| "epoch": 0.30267623657708786, |
| "grad_norm": 47.75, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 13.4564, |
| "mean_token_accuracy": 0.7773742496967315, |
| "num_tokens": 4535458.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.8536316430196166, |
| "epoch": 0.3071273577032215, |
| "grad_norm": 48.25, |
| "learning_rate": 1.378e-05, |
| "loss": 13.552, |
| "mean_token_accuracy": 0.7754087567329406, |
| "num_tokens": 4599099.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.816833440028131, |
| "epoch": 0.31157847882935513, |
| "grad_norm": 39.0, |
| "learning_rate": 1.398e-05, |
| "loss": 13.1791, |
| "mean_token_accuracy": 0.777547013387084, |
| "num_tokens": 4662903.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.31157847882935513, |
| "eval_biology_entropy": 1.2325178788900375, |
| "eval_biology_loss": 1.278289556503296, |
| "eval_biology_mean_token_accuracy": 0.686549211382866, |
| "eval_biology_num_tokens": 4662903.0, |
| "eval_biology_runtime": 39.6837, |
| "eval_biology_samples_per_second": 12.6, |
| "eval_biology_steps_per_second": 12.6, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.31157847882935513, |
| "eval_chemistry_entropy": 0.870070047557354, |
| "eval_chemistry_loss": 0.8674882650375366, |
| "eval_chemistry_mean_token_accuracy": 0.7685190732479096, |
| "eval_chemistry_num_tokens": 4662903.0, |
| "eval_chemistry_runtime": 46.0896, |
| "eval_chemistry_samples_per_second": 10.848, |
| "eval_chemistry_steps_per_second": 10.848, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.8818355791270733, |
| "epoch": 0.3160295999554888, |
| "grad_norm": 54.0, |
| "learning_rate": 1.418e-05, |
| "loss": 14.2266, |
| "mean_token_accuracy": 0.7657015427947045, |
| "num_tokens": 4729166.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.910587764903903, |
| "epoch": 0.32048072108162245, |
| "grad_norm": 54.5, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 14.7617, |
| "mean_token_accuracy": 0.7591140177100897, |
| "num_tokens": 4796815.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.8099086729809641, |
| "epoch": 0.32493184220775606, |
| "grad_norm": 51.25, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 12.9674, |
| "mean_token_accuracy": 0.7837777521461249, |
| "num_tokens": 4865172.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.8765029039233923, |
| "epoch": 0.3293829633338897, |
| "grad_norm": 44.25, |
| "learning_rate": 1.478e-05, |
| "loss": 14.0705, |
| "mean_token_accuracy": 0.7659166298806668, |
| "num_tokens": 4932671.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.8784225210547447, |
| "epoch": 0.3338340844600234, |
| "grad_norm": 44.0, |
| "learning_rate": 1.498e-05, |
| "loss": 14.1774, |
| "mean_token_accuracy": 0.7669123791158199, |
| "num_tokens": 4998710.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.8316720003262162, |
| "epoch": 0.338285205586157, |
| "grad_norm": 42.75, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 13.3239, |
| "mean_token_accuracy": 0.7748636573553085, |
| "num_tokens": 5066948.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.8594924572855234, |
| "epoch": 0.34273632671229065, |
| "grad_norm": 49.5, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 13.8153, |
| "mean_token_accuracy": 0.7711022242903709, |
| "num_tokens": 5129950.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.895938608981669, |
| "epoch": 0.3471874478384243, |
| "grad_norm": 48.25, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 14.4493, |
| "mean_token_accuracy": 0.7591051306575537, |
| "num_tokens": 5193519.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.8509046232327819, |
| "epoch": 0.351638568964558, |
| "grad_norm": 56.5, |
| "learning_rate": 1.578e-05, |
| "loss": 13.6439, |
| "mean_token_accuracy": 0.7696408761665225, |
| "num_tokens": 5256503.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.9008656185120344, |
| "epoch": 0.3560896900906916, |
| "grad_norm": 48.75, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 14.7305, |
| "mean_token_accuracy": 0.7615581404417753, |
| "num_tokens": 5324751.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.3560896900906916, |
| "eval_biology_entropy": 1.267573108136654, |
| "eval_biology_loss": 1.2815055847167969, |
| "eval_biology_mean_token_accuracy": 0.6854563910365105, |
| "eval_biology_num_tokens": 5324751.0, |
| "eval_biology_runtime": 39.7889, |
| "eval_biology_samples_per_second": 12.566, |
| "eval_biology_steps_per_second": 12.566, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.3560896900906916, |
| "eval_chemistry_entropy": 0.8897559930682182, |
| "eval_chemistry_loss": 0.8585976362228394, |
| "eval_chemistry_mean_token_accuracy": 0.7700544927716255, |
| "eval_chemistry_num_tokens": 5324751.0, |
| "eval_chemistry_runtime": 45.9968, |
| "eval_chemistry_samples_per_second": 10.87, |
| "eval_chemistry_steps_per_second": 10.87, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.7946027474477887, |
| "epoch": 0.36054081121682524, |
| "grad_norm": 50.0, |
| "learning_rate": 1.618e-05, |
| "loss": 12.8437, |
| "mean_token_accuracy": 0.7842005740851163, |
| "num_tokens": 5391664.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.8090571435168386, |
| "epoch": 0.3649919323429589, |
| "grad_norm": 45.0, |
| "learning_rate": 1.638e-05, |
| "loss": 12.7173, |
| "mean_token_accuracy": 0.7849415507167578, |
| "num_tokens": 5458519.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.8408348582684994, |
| "epoch": 0.3694430534690925, |
| "grad_norm": 46.0, |
| "learning_rate": 1.658e-05, |
| "loss": 13.9973, |
| "mean_token_accuracy": 0.7693964328616858, |
| "num_tokens": 5523391.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.8412857724353671, |
| "epoch": 0.37389417459522617, |
| "grad_norm": 44.0, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 13.3081, |
| "mean_token_accuracy": 0.7760034879669547, |
| "num_tokens": 5585508.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.836153868213296, |
| "epoch": 0.37834529572135983, |
| "grad_norm": 48.5, |
| "learning_rate": 1.698e-05, |
| "loss": 13.6747, |
| "mean_token_accuracy": 0.775155283510685, |
| "num_tokens": 5650755.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.8424218002706766, |
| "epoch": 0.38279641684749344, |
| "grad_norm": 45.0, |
| "learning_rate": 1.718e-05, |
| "loss": 13.4803, |
| "mean_token_accuracy": 0.7742194497957826, |
| "num_tokens": 5716321.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.883666661940515, |
| "epoch": 0.3872475379736271, |
| "grad_norm": 50.75, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 14.1247, |
| "mean_token_accuracy": 0.7642681807279587, |
| "num_tokens": 5782698.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.8131563207134604, |
| "epoch": 0.39169865909976076, |
| "grad_norm": 39.0, |
| "learning_rate": 1.758e-05, |
| "loss": 13.2211, |
| "mean_token_accuracy": 0.7782290887087584, |
| "num_tokens": 5848889.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.8608601313084364, |
| "epoch": 0.3961497802258944, |
| "grad_norm": 44.0, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 13.9079, |
| "mean_token_accuracy": 0.7717852048575878, |
| "num_tokens": 5912919.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.8315491866320371, |
| "epoch": 0.400600901352028, |
| "grad_norm": 42.25, |
| "learning_rate": 1.798e-05, |
| "loss": 13.4828, |
| "mean_token_accuracy": 0.7756550934165716, |
| "num_tokens": 5980229.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.400600901352028, |
| "eval_biology_entropy": 1.2335028433203696, |
| "eval_biology_loss": 1.2846676111221313, |
| "eval_biology_mean_token_accuracy": 0.6853172712922097, |
| "eval_biology_num_tokens": 5980229.0, |
| "eval_biology_runtime": 39.7136, |
| "eval_biology_samples_per_second": 12.59, |
| "eval_biology_steps_per_second": 12.59, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.400600901352028, |
| "eval_chemistry_entropy": 0.8548561576008796, |
| "eval_chemistry_loss": 0.8478842973709106, |
| "eval_chemistry_mean_token_accuracy": 0.773112800002098, |
| "eval_chemistry_num_tokens": 5980229.0, |
| "eval_chemistry_runtime": 45.9488, |
| "eval_chemistry_samples_per_second": 10.882, |
| "eval_chemistry_steps_per_second": 10.882, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.7898269753903151, |
| "epoch": 0.4050520224781617, |
| "grad_norm": 40.25, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 12.7209, |
| "mean_token_accuracy": 0.784660654142499, |
| "num_tokens": 6048508.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.8143722828477621, |
| "epoch": 0.40950314360429535, |
| "grad_norm": 44.5, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 13.2591, |
| "mean_token_accuracy": 0.7791141759604215, |
| "num_tokens": 6114855.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.8494924793019891, |
| "epoch": 0.41395426473042896, |
| "grad_norm": 40.0, |
| "learning_rate": 1.858e-05, |
| "loss": 13.506, |
| "mean_token_accuracy": 0.776913444697857, |
| "num_tokens": 6179050.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.8335931519046426, |
| "epoch": 0.4184053858565626, |
| "grad_norm": 45.25, |
| "learning_rate": 1.878e-05, |
| "loss": 13.6217, |
| "mean_token_accuracy": 0.7724651444703341, |
| "num_tokens": 6244987.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.8118610519915819, |
| "epoch": 0.4228565069826963, |
| "grad_norm": 49.75, |
| "learning_rate": 1.898e-05, |
| "loss": 13.0423, |
| "mean_token_accuracy": 0.7783929593861103, |
| "num_tokens": 6310799.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.8028015844523907, |
| "epoch": 0.4273076281088299, |
| "grad_norm": 48.25, |
| "learning_rate": 1.918e-05, |
| "loss": 13.0412, |
| "mean_token_accuracy": 0.778607621230185, |
| "num_tokens": 6378152.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.8370830919593573, |
| "epoch": 0.43175874923496355, |
| "grad_norm": 44.75, |
| "learning_rate": 1.938e-05, |
| "loss": 13.2798, |
| "mean_token_accuracy": 0.7766136281192303, |
| "num_tokens": 6442491.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.862270618416369, |
| "epoch": 0.4362098703610972, |
| "grad_norm": 39.5, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 14.0144, |
| "mean_token_accuracy": 0.7675615277141332, |
| "num_tokens": 6510231.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.8360511595383286, |
| "epoch": 0.44066099148723087, |
| "grad_norm": 48.0, |
| "learning_rate": 1.978e-05, |
| "loss": 13.4815, |
| "mean_token_accuracy": 0.7760949255898595, |
| "num_tokens": 6572858.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.8388594528660178, |
| "epoch": 0.4451121126133645, |
| "grad_norm": 43.25, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 13.6609, |
| "mean_token_accuracy": 0.7700441874563694, |
| "num_tokens": 6637273.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.4451121126133645, |
| "eval_biology_entropy": 1.187324990749359, |
| "eval_biology_loss": 1.2962696552276611, |
| "eval_biology_mean_token_accuracy": 0.6844774860739707, |
| "eval_biology_num_tokens": 6637273.0, |
| "eval_biology_runtime": 39.733, |
| "eval_biology_samples_per_second": 12.584, |
| "eval_biology_steps_per_second": 12.584, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.4451121126133645, |
| "eval_chemistry_entropy": 0.8146472455263137, |
| "eval_chemistry_loss": 0.8452854156494141, |
| "eval_chemistry_mean_token_accuracy": 0.7733621709942817, |
| "eval_chemistry_num_tokens": 6637273.0, |
| "eval_chemistry_runtime": 46.1309, |
| "eval_chemistry_samples_per_second": 10.839, |
| "eval_chemistry_steps_per_second": 10.839, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.450581354264347e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|