| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.1126133644911813, |
| "eval_steps": 100, |
| "global_step": 2500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.8066153490915895, |
| "epoch": 0.004451121126133645, |
| "grad_norm": 107.5, |
| "learning_rate": 1.8e-07, |
| "loss": 16.3591, |
| "mean_token_accuracy": 0.7447933696210385, |
| "num_tokens": 63133.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.8401238698512316, |
| "epoch": 0.00890224225226729, |
| "grad_norm": 103.5, |
| "learning_rate": 3.8e-07, |
| "loss": 17.2936, |
| "mean_token_accuracy": 0.7330440735444427, |
| "num_tokens": 128968.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.828661117143929, |
| "epoch": 0.013353363378400934, |
| "grad_norm": 96.5, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 16.5809, |
| "mean_token_accuracy": 0.7381731692701579, |
| "num_tokens": 193314.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.8600894263014197, |
| "epoch": 0.01780448450453458, |
| "grad_norm": 96.0, |
| "learning_rate": 7.8e-07, |
| "loss": 17.7444, |
| "mean_token_accuracy": 0.7324823886156082, |
| "num_tokens": 257189.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8017022017389536, |
| "epoch": 0.022255605630668224, |
| "grad_norm": 84.5, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 15.9063, |
| "mean_token_accuracy": 0.7455223135650157, |
| "num_tokens": 324224.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.846268966794014, |
| "epoch": 0.026706726756801868, |
| "grad_norm": 78.5, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 16.6006, |
| "mean_token_accuracy": 0.7411292420700193, |
| "num_tokens": 394654.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8181243563070894, |
| "epoch": 0.031157847882935515, |
| "grad_norm": 76.5, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 15.9846, |
| "mean_token_accuracy": 0.746401545777917, |
| "num_tokens": 461658.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8314600124955177, |
| "epoch": 0.03560896900906916, |
| "grad_norm": 75.0, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 15.8227, |
| "mean_token_accuracy": 0.7472479769960045, |
| "num_tokens": 528285.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8807666478678584, |
| "epoch": 0.04006009013520281, |
| "grad_norm": 72.0, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 16.6834, |
| "mean_token_accuracy": 0.7396679904311896, |
| "num_tokens": 595336.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8991396678611636, |
| "epoch": 0.04451121126133645, |
| "grad_norm": 79.5, |
| "learning_rate": 1.98e-06, |
| "loss": 16.3968, |
| "mean_token_accuracy": 0.7396075185388327, |
| "num_tokens": 663126.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04451121126133645, |
| "eval_biology_entropy": 1.1188154411315918, |
| "eval_biology_loss": 1.2691835165023804, |
| "eval_biology_mean_token_accuracy": 0.6881385813951493, |
| "eval_biology_num_tokens": 663126.0, |
| "eval_biology_runtime": 40.4565, |
| "eval_biology_samples_per_second": 12.359, |
| "eval_biology_steps_per_second": 12.359, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04451121126133645, |
| "eval_chemistry_entropy": 0.8804921235442161, |
| "eval_chemistry_loss": 1.0042469501495361, |
| "eval_chemistry_mean_token_accuracy": 0.7444319971203804, |
| "eval_chemistry_num_tokens": 663126.0, |
| "eval_chemistry_runtime": 46.3585, |
| "eval_chemistry_samples_per_second": 10.786, |
| "eval_chemistry_steps_per_second": 10.786, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8988691195845604, |
| "epoch": 0.048962332387470095, |
| "grad_norm": 100.5, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 16.3063, |
| "mean_token_accuracy": 0.7414613764733076, |
| "num_tokens": 731060.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8918870648369193, |
| "epoch": 0.053413453513603736, |
| "grad_norm": 77.5, |
| "learning_rate": 2.38e-06, |
| "loss": 15.8311, |
| "mean_token_accuracy": 0.7483173958957196, |
| "num_tokens": 801635.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9162682231515646, |
| "epoch": 0.05786457463973738, |
| "grad_norm": 66.0, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 16.1464, |
| "mean_token_accuracy": 0.7448406910523773, |
| "num_tokens": 867260.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.9749668512493372, |
| "epoch": 0.06231569576587103, |
| "grad_norm": 79.5, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 16.9562, |
| "mean_token_accuracy": 0.7369577366858721, |
| "num_tokens": 931344.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.939477625861764, |
| "epoch": 0.06676681689200467, |
| "grad_norm": 56.75, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 15.7327, |
| "mean_token_accuracy": 0.7471803797408938, |
| "num_tokens": 993586.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9718907386064529, |
| "epoch": 0.07121793801813832, |
| "grad_norm": 53.75, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 16.2036, |
| "mean_token_accuracy": 0.7392314806580543, |
| "num_tokens": 1060812.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.9165311623364687, |
| "epoch": 0.07566905914427197, |
| "grad_norm": 63.25, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 15.2657, |
| "mean_token_accuracy": 0.7518215283751488, |
| "num_tokens": 1131832.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9465073021128774, |
| "epoch": 0.08012018027040561, |
| "grad_norm": 50.25, |
| "learning_rate": 3.58e-06, |
| "loss": 15.5603, |
| "mean_token_accuracy": 0.7473610159009695, |
| "num_tokens": 1200650.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.9450380651280283, |
| "epoch": 0.08457130139653925, |
| "grad_norm": 54.0, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 15.56, |
| "mean_token_accuracy": 0.748077143356204, |
| "num_tokens": 1265107.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.9630168141797185, |
| "epoch": 0.0890224225226729, |
| "grad_norm": 58.75, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 15.5029, |
| "mean_token_accuracy": 0.7486971555277705, |
| "num_tokens": 1327380.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0890224225226729, |
| "eval_biology_entropy": 1.19531831908226, |
| "eval_biology_loss": 1.2584387063980103, |
| "eval_biology_mean_token_accuracy": 0.6882349443435669, |
| "eval_biology_num_tokens": 1327380.0, |
| "eval_biology_runtime": 40.8783, |
| "eval_biology_samples_per_second": 12.231, |
| "eval_biology_steps_per_second": 12.231, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0890224225226729, |
| "eval_chemistry_entropy": 0.9437598274946213, |
| "eval_chemistry_loss": 0.9655953645706177, |
| "eval_chemistry_mean_token_accuracy": 0.7501006088852883, |
| "eval_chemistry_num_tokens": 1327380.0, |
| "eval_chemistry_runtime": 46.6023, |
| "eval_chemistry_samples_per_second": 10.729, |
| "eval_chemistry_steps_per_second": 10.729, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.9782480053603649, |
| "epoch": 0.09347354364880654, |
| "grad_norm": 64.0, |
| "learning_rate": 4.18e-06, |
| "loss": 15.9821, |
| "mean_token_accuracy": 0.7436690799891948, |
| "num_tokens": 1393379.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.9402019061148167, |
| "epoch": 0.09792466477494019, |
| "grad_norm": 54.0, |
| "learning_rate": 4.38e-06, |
| "loss": 15.3729, |
| "mean_token_accuracy": 0.7516820874065161, |
| "num_tokens": 1460130.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.9247835712507367, |
| "epoch": 0.10237578590107384, |
| "grad_norm": 54.5, |
| "learning_rate": 4.58e-06, |
| "loss": 15.0731, |
| "mean_token_accuracy": 0.7559274602681398, |
| "num_tokens": 1529183.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.9673028320074082, |
| "epoch": 0.10682690702720747, |
| "grad_norm": 71.5, |
| "learning_rate": 4.78e-06, |
| "loss": 15.731, |
| "mean_token_accuracy": 0.7473661951720715, |
| "num_tokens": 1597405.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.9974556604400278, |
| "epoch": 0.11127802815334112, |
| "grad_norm": 55.0, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 15.8962, |
| "mean_token_accuracy": 0.7434635870158672, |
| "num_tokens": 1661767.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.985609365440905, |
| "epoch": 0.11572914927947477, |
| "grad_norm": 50.75, |
| "learning_rate": 5.18e-06, |
| "loss": 16.0267, |
| "mean_token_accuracy": 0.7446854375302792, |
| "num_tokens": 1728207.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.9456103699281811, |
| "epoch": 0.12018027040560841, |
| "grad_norm": 58.0, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 15.3377, |
| "mean_token_accuracy": 0.750850186496973, |
| "num_tokens": 1796055.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.9541573049500585, |
| "epoch": 0.12463139153174206, |
| "grad_norm": 56.5, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 15.2532, |
| "mean_token_accuracy": 0.7530262626707553, |
| "num_tokens": 1859684.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.9941559780389071, |
| "epoch": 0.1290825126578757, |
| "grad_norm": 59.25, |
| "learning_rate": 5.78e-06, |
| "loss": 15.94, |
| "mean_token_accuracy": 0.745009395852685, |
| "num_tokens": 1921704.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.981252990104258, |
| "epoch": 0.13353363378400934, |
| "grad_norm": 56.75, |
| "learning_rate": 5.98e-06, |
| "loss": 15.8943, |
| "mean_token_accuracy": 0.7449573867022992, |
| "num_tokens": 1985766.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13353363378400934, |
| "eval_biology_entropy": 1.2063790675401687, |
| "eval_biology_loss": 1.2592713832855225, |
| "eval_biology_mean_token_accuracy": 0.6874204781055451, |
| "eval_biology_num_tokens": 1985766.0, |
| "eval_biology_runtime": 40.5061, |
| "eval_biology_samples_per_second": 12.344, |
| "eval_biology_steps_per_second": 12.344, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13353363378400934, |
| "eval_chemistry_entropy": 0.9332761432528496, |
| "eval_chemistry_loss": 0.9383891820907593, |
| "eval_chemistry_mean_token_accuracy": 0.7543213546276093, |
| "eval_chemistry_num_tokens": 1985766.0, |
| "eval_chemistry_runtime": 46.2485, |
| "eval_chemistry_samples_per_second": 10.811, |
| "eval_chemistry_steps_per_second": 10.811, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9657653540372848, |
| "epoch": 0.137984754910143, |
| "grad_norm": 66.5, |
| "learning_rate": 6.18e-06, |
| "loss": 15.5291, |
| "mean_token_accuracy": 0.7452911786735058, |
| "num_tokens": 2049041.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.922235100530088, |
| "epoch": 0.14243587603627664, |
| "grad_norm": 49.75, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 14.7109, |
| "mean_token_accuracy": 0.75825478695333, |
| "num_tokens": 2118068.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.9044711474329233, |
| "epoch": 0.14688699716241027, |
| "grad_norm": 60.25, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 14.5687, |
| "mean_token_accuracy": 0.7618395145982504, |
| "num_tokens": 2186387.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.946225673891604, |
| "epoch": 0.15133811828854393, |
| "grad_norm": 52.5, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 15.1959, |
| "mean_token_accuracy": 0.7535316452383996, |
| "num_tokens": 2252650.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.9036338411271572, |
| "epoch": 0.15578923941467757, |
| "grad_norm": 57.75, |
| "learning_rate": 6.98e-06, |
| "loss": 14.5854, |
| "mean_token_accuracy": 0.7611672822386026, |
| "num_tokens": 2320358.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.9015818448737264, |
| "epoch": 0.16024036054081123, |
| "grad_norm": 49.5, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 14.5381, |
| "mean_token_accuracy": 0.7606555309146643, |
| "num_tokens": 2388824.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8864203749224544, |
| "epoch": 0.16469148166694486, |
| "grad_norm": 49.25, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 14.1936, |
| "mean_token_accuracy": 0.7665066320449114, |
| "num_tokens": 2456144.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.9866490814834833, |
| "epoch": 0.1691426027930785, |
| "grad_norm": 49.5, |
| "learning_rate": 7.58e-06, |
| "loss": 15.8412, |
| "mean_token_accuracy": 0.7478409979492426, |
| "num_tokens": 2515325.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.9080646676942706, |
| "epoch": 0.17359372391921216, |
| "grad_norm": 48.25, |
| "learning_rate": 7.78e-06, |
| "loss": 14.5343, |
| "mean_token_accuracy": 0.7594566397368908, |
| "num_tokens": 2580490.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.9095059128478169, |
| "epoch": 0.1780448450453458, |
| "grad_norm": 43.5, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 14.6647, |
| "mean_token_accuracy": 0.7607249341905117, |
| "num_tokens": 2644330.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1780448450453458, |
| "eval_biology_entropy": 1.1971934199333192, |
| "eval_biology_loss": 1.2638347148895264, |
| "eval_biology_mean_token_accuracy": 0.6877701328396797, |
| "eval_biology_num_tokens": 2644330.0, |
| "eval_biology_runtime": 39.8022, |
| "eval_biology_samples_per_second": 12.562, |
| "eval_biology_steps_per_second": 12.562, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1780448450453458, |
| "eval_chemistry_entropy": 0.8957328157424926, |
| "eval_chemistry_loss": 0.9148933291435242, |
| "eval_chemistry_mean_token_accuracy": 0.7592848987579346, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 46.2334, |
| "eval_chemistry_samples_per_second": 10.815, |
| "eval_chemistry_steps_per_second": 10.815, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8526191784068942, |
| "epoch": 0.18249596617147945, |
| "grad_norm": 50.0, |
| "learning_rate": 8.18e-06, |
| "loss": 13.7182, |
| "mean_token_accuracy": 0.7745671790093184, |
| "num_tokens": 2713234.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.932603782787919, |
| "epoch": 0.18694708729761308, |
| "grad_norm": 47.5, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 15.028, |
| "mean_token_accuracy": 0.7569812458008528, |
| "num_tokens": 2783261.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.9143912255764007, |
| "epoch": 0.19139820842374672, |
| "grad_norm": 43.0, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 14.7724, |
| "mean_token_accuracy": 0.7596961252391339, |
| "num_tokens": 2850170.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.9113649705424904, |
| "epoch": 0.19584932954988038, |
| "grad_norm": 58.5, |
| "learning_rate": 8.78e-06, |
| "loss": 14.6432, |
| "mean_token_accuracy": 0.760087676718831, |
| "num_tokens": 2913700.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.9072460785508156, |
| "epoch": 0.200300450676014, |
| "grad_norm": 46.5, |
| "learning_rate": 8.98e-06, |
| "loss": 14.609, |
| "mean_token_accuracy": 0.7596194025129079, |
| "num_tokens": 2981212.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.8693920068442822, |
| "epoch": 0.20475157180214767, |
| "grad_norm": 53.25, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 13.9505, |
| "mean_token_accuracy": 0.771946213953197, |
| "num_tokens": 3048973.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.9109398307278752, |
| "epoch": 0.2092026929282813, |
| "grad_norm": 47.25, |
| "learning_rate": 9.38e-06, |
| "loss": 14.614, |
| "mean_token_accuracy": 0.7599313069134951, |
| "num_tokens": 3117033.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.8936059167608619, |
| "epoch": 0.21365381405441494, |
| "grad_norm": 50.25, |
| "learning_rate": 9.58e-06, |
| "loss": 14.4662, |
| "mean_token_accuracy": 0.7608913701027632, |
| "num_tokens": 3185255.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.9031545946374535, |
| "epoch": 0.2181049351805486, |
| "grad_norm": 51.5, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 14.5197, |
| "mean_token_accuracy": 0.7608289115130902, |
| "num_tokens": 3253121.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8450184227898717, |
| "epoch": 0.22255605630668224, |
| "grad_norm": 49.5, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 13.4565, |
| "mean_token_accuracy": 0.7740382503718137, |
| "num_tokens": 3322823.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22255605630668224, |
| "eval_biology_entropy": 1.2025449865460396, |
| "eval_biology_loss": 1.267388939857483, |
| "eval_biology_mean_token_accuracy": 0.6874357106685638, |
| "eval_biology_num_tokens": 3322823.0, |
| "eval_biology_runtime": 40.0827, |
| "eval_biology_samples_per_second": 12.474, |
| "eval_biology_steps_per_second": 12.474, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22255605630668224, |
| "eval_chemistry_entropy": 0.8795891938209534, |
| "eval_chemistry_loss": 0.8957814574241638, |
| "eval_chemistry_mean_token_accuracy": 0.7629851229190826, |
| "eval_chemistry_num_tokens": 3322823.0, |
| "eval_chemistry_runtime": 46.3478, |
| "eval_chemistry_samples_per_second": 10.788, |
| "eval_chemistry_steps_per_second": 10.788, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.8574318964034319, |
| "epoch": 0.2270071774328159, |
| "grad_norm": 45.25, |
| "learning_rate": 1.018e-05, |
| "loss": 13.9042, |
| "mean_token_accuracy": 0.7678481444716454, |
| "num_tokens": 3393393.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.9288356432691216, |
| "epoch": 0.23145829855894953, |
| "grad_norm": 57.25, |
| "learning_rate": 1.038e-05, |
| "loss": 15.193, |
| "mean_token_accuracy": 0.7506348451599478, |
| "num_tokens": 3454750.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.8564269673079252, |
| "epoch": 0.2359094196850832, |
| "grad_norm": 50.75, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 13.7644, |
| "mean_token_accuracy": 0.7751363463699817, |
| "num_tokens": 3526914.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.93802858479321, |
| "epoch": 0.24036054081121683, |
| "grad_norm": 45.0, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 15.0494, |
| "mean_token_accuracy": 0.7559149663895368, |
| "num_tokens": 3589449.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.9012869004160166, |
| "epoch": 0.24481166193735046, |
| "grad_norm": 50.25, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 14.2344, |
| "mean_token_accuracy": 0.7642816316336394, |
| "num_tokens": 3655092.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.8946880368515849, |
| "epoch": 0.24926278306348412, |
| "grad_norm": 57.25, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 14.5832, |
| "mean_token_accuracy": 0.7632339514791966, |
| "num_tokens": 3719113.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.8879899585619568, |
| "epoch": 0.25371390418961776, |
| "grad_norm": 44.5, |
| "learning_rate": 1.138e-05, |
| "loss": 14.3692, |
| "mean_token_accuracy": 0.7596106130629778, |
| "num_tokens": 3785282.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.9068781601265072, |
| "epoch": 0.2581650253157514, |
| "grad_norm": 42.25, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 14.6096, |
| "mean_token_accuracy": 0.761234056390822, |
| "num_tokens": 3852276.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.8276193620637059, |
| "epoch": 0.262616146441885, |
| "grad_norm": 43.25, |
| "learning_rate": 1.178e-05, |
| "loss": 13.3892, |
| "mean_token_accuracy": 0.7755540499463678, |
| "num_tokens": 3925649.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.8920110030099749, |
| "epoch": 0.2670672675680187, |
| "grad_norm": 52.25, |
| "learning_rate": 1.198e-05, |
| "loss": 14.0197, |
| "mean_token_accuracy": 0.766492671892047, |
| "num_tokens": 3990505.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.2670672675680187, |
| "eval_biology_entropy": 1.2123423200249672, |
| "eval_biology_loss": 1.2729928493499756, |
| "eval_biology_mean_token_accuracy": 0.6870606996417046, |
| "eval_biology_num_tokens": 3990505.0, |
| "eval_biology_runtime": 39.7983, |
| "eval_biology_samples_per_second": 12.563, |
| "eval_biology_steps_per_second": 12.563, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.2670672675680187, |
| "eval_chemistry_entropy": 0.8659857953190804, |
| "eval_chemistry_loss": 0.8781383037567139, |
| "eval_chemistry_mean_token_accuracy": 0.7664026654362679, |
| "eval_chemistry_num_tokens": 3990505.0, |
| "eval_chemistry_runtime": 46.1775, |
| "eval_chemistry_samples_per_second": 10.828, |
| "eval_chemistry_steps_per_second": 10.828, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.8550863016396761, |
| "epoch": 0.27151838869415235, |
| "grad_norm": 51.25, |
| "learning_rate": 1.218e-05, |
| "loss": 13.8181, |
| "mean_token_accuracy": 0.7682087656110526, |
| "num_tokens": 4060657.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.8537623688578606, |
| "epoch": 0.275969509820286, |
| "grad_norm": 42.25, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 13.6601, |
| "mean_token_accuracy": 0.772542554140091, |
| "num_tokens": 4133119.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.8621461872011423, |
| "epoch": 0.2804206309464196, |
| "grad_norm": 49.25, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 13.9865, |
| "mean_token_accuracy": 0.7680005200207234, |
| "num_tokens": 4200051.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.9106066713109613, |
| "epoch": 0.2848717520725533, |
| "grad_norm": 49.25, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 14.717, |
| "mean_token_accuracy": 0.7637291874736547, |
| "num_tokens": 4267403.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.868153141438961, |
| "epoch": 0.28932287319868694, |
| "grad_norm": 52.25, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 13.8988, |
| "mean_token_accuracy": 0.7680647127330303, |
| "num_tokens": 4333268.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.8451825473457575, |
| "epoch": 0.29377399432482054, |
| "grad_norm": 53.5, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 13.5394, |
| "mean_token_accuracy": 0.7747031616047024, |
| "num_tokens": 4400242.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.8826779069378972, |
| "epoch": 0.2982251154509542, |
| "grad_norm": 49.75, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 14.0787, |
| "mean_token_accuracy": 0.7660945057868958, |
| "num_tokens": 4464714.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.8183048281818628, |
| "epoch": 0.30267623657708786, |
| "grad_norm": 47.75, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 13.4564, |
| "mean_token_accuracy": 0.7773742496967315, |
| "num_tokens": 4535458.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.8536316430196166, |
| "epoch": 0.3071273577032215, |
| "grad_norm": 48.25, |
| "learning_rate": 1.378e-05, |
| "loss": 13.552, |
| "mean_token_accuracy": 0.7754087567329406, |
| "num_tokens": 4599099.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.816833440028131, |
| "epoch": 0.31157847882935513, |
| "grad_norm": 39.0, |
| "learning_rate": 1.398e-05, |
| "loss": 13.1791, |
| "mean_token_accuracy": 0.777547013387084, |
| "num_tokens": 4662903.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.31157847882935513, |
| "eval_biology_entropy": 1.2325178788900375, |
| "eval_biology_loss": 1.278289556503296, |
| "eval_biology_mean_token_accuracy": 0.686549211382866, |
| "eval_biology_num_tokens": 4662903.0, |
| "eval_biology_runtime": 39.6837, |
| "eval_biology_samples_per_second": 12.6, |
| "eval_biology_steps_per_second": 12.6, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.31157847882935513, |
| "eval_chemistry_entropy": 0.870070047557354, |
| "eval_chemistry_loss": 0.8674882650375366, |
| "eval_chemistry_mean_token_accuracy": 0.7685190732479096, |
| "eval_chemistry_num_tokens": 4662903.0, |
| "eval_chemistry_runtime": 46.0896, |
| "eval_chemistry_samples_per_second": 10.848, |
| "eval_chemistry_steps_per_second": 10.848, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.8818355791270733, |
| "epoch": 0.3160295999554888, |
| "grad_norm": 54.0, |
| "learning_rate": 1.418e-05, |
| "loss": 14.2266, |
| "mean_token_accuracy": 0.7657015427947045, |
| "num_tokens": 4729166.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.910587764903903, |
| "epoch": 0.32048072108162245, |
| "grad_norm": 54.5, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 14.7617, |
| "mean_token_accuracy": 0.7591140177100897, |
| "num_tokens": 4796815.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.8099086729809641, |
| "epoch": 0.32493184220775606, |
| "grad_norm": 51.25, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 12.9674, |
| "mean_token_accuracy": 0.7837777521461249, |
| "num_tokens": 4865172.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.8765029039233923, |
| "epoch": 0.3293829633338897, |
| "grad_norm": 44.25, |
| "learning_rate": 1.478e-05, |
| "loss": 14.0705, |
| "mean_token_accuracy": 0.7659166298806668, |
| "num_tokens": 4932671.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.8784225210547447, |
| "epoch": 0.3338340844600234, |
| "grad_norm": 44.0, |
| "learning_rate": 1.498e-05, |
| "loss": 14.1774, |
| "mean_token_accuracy": 0.7669123791158199, |
| "num_tokens": 4998710.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.8316720003262162, |
| "epoch": 0.338285205586157, |
| "grad_norm": 42.75, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 13.3239, |
| "mean_token_accuracy": 0.7748636573553085, |
| "num_tokens": 5066948.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.8594924572855234, |
| "epoch": 0.34273632671229065, |
| "grad_norm": 49.5, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 13.8153, |
| "mean_token_accuracy": 0.7711022242903709, |
| "num_tokens": 5129950.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.895938608981669, |
| "epoch": 0.3471874478384243, |
| "grad_norm": 48.25, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 14.4493, |
| "mean_token_accuracy": 0.7591051306575537, |
| "num_tokens": 5193519.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.8509046232327819, |
| "epoch": 0.351638568964558, |
| "grad_norm": 56.5, |
| "learning_rate": 1.578e-05, |
| "loss": 13.6439, |
| "mean_token_accuracy": 0.7696408761665225, |
| "num_tokens": 5256503.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.9008656185120344, |
| "epoch": 0.3560896900906916, |
| "grad_norm": 48.75, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 14.7305, |
| "mean_token_accuracy": 0.7615581404417753, |
| "num_tokens": 5324751.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.3560896900906916, |
| "eval_biology_entropy": 1.267573108136654, |
| "eval_biology_loss": 1.2815055847167969, |
| "eval_biology_mean_token_accuracy": 0.6854563910365105, |
| "eval_biology_num_tokens": 5324751.0, |
| "eval_biology_runtime": 39.7889, |
| "eval_biology_samples_per_second": 12.566, |
| "eval_biology_steps_per_second": 12.566, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.3560896900906916, |
| "eval_chemistry_entropy": 0.8897559930682182, |
| "eval_chemistry_loss": 0.8585976362228394, |
| "eval_chemistry_mean_token_accuracy": 0.7700544927716255, |
| "eval_chemistry_num_tokens": 5324751.0, |
| "eval_chemistry_runtime": 45.9968, |
| "eval_chemistry_samples_per_second": 10.87, |
| "eval_chemistry_steps_per_second": 10.87, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.7946027474477887, |
| "epoch": 0.36054081121682524, |
| "grad_norm": 50.0, |
| "learning_rate": 1.618e-05, |
| "loss": 12.8437, |
| "mean_token_accuracy": 0.7842005740851163, |
| "num_tokens": 5391664.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.8090571435168386, |
| "epoch": 0.3649919323429589, |
| "grad_norm": 45.0, |
| "learning_rate": 1.638e-05, |
| "loss": 12.7173, |
| "mean_token_accuracy": 0.7849415507167578, |
| "num_tokens": 5458519.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.8408348582684994, |
| "epoch": 0.3694430534690925, |
| "grad_norm": 46.0, |
| "learning_rate": 1.658e-05, |
| "loss": 13.9973, |
| "mean_token_accuracy": 0.7693964328616858, |
| "num_tokens": 5523391.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.8412857724353671, |
| "epoch": 0.37389417459522617, |
| "grad_norm": 44.0, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 13.3081, |
| "mean_token_accuracy": 0.7760034879669547, |
| "num_tokens": 5585508.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.836153868213296, |
| "epoch": 0.37834529572135983, |
| "grad_norm": 48.5, |
| "learning_rate": 1.698e-05, |
| "loss": 13.6747, |
| "mean_token_accuracy": 0.775155283510685, |
| "num_tokens": 5650755.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.8424218002706766, |
| "epoch": 0.38279641684749344, |
| "grad_norm": 45.0, |
| "learning_rate": 1.718e-05, |
| "loss": 13.4803, |
| "mean_token_accuracy": 0.7742194497957826, |
| "num_tokens": 5716321.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.883666661940515, |
| "epoch": 0.3872475379736271, |
| "grad_norm": 50.75, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 14.1247, |
| "mean_token_accuracy": 0.7642681807279587, |
| "num_tokens": 5782698.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.8131563207134604, |
| "epoch": 0.39169865909976076, |
| "grad_norm": 39.0, |
| "learning_rate": 1.758e-05, |
| "loss": 13.2211, |
| "mean_token_accuracy": 0.7782290887087584, |
| "num_tokens": 5848889.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.8608601313084364, |
| "epoch": 0.3961497802258944, |
| "grad_norm": 44.0, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 13.9079, |
| "mean_token_accuracy": 0.7717852048575878, |
| "num_tokens": 5912919.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.8315491866320371, |
| "epoch": 0.400600901352028, |
| "grad_norm": 42.25, |
| "learning_rate": 1.798e-05, |
| "loss": 13.4828, |
| "mean_token_accuracy": 0.7756550934165716, |
| "num_tokens": 5980229.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.400600901352028, |
| "eval_biology_entropy": 1.2335028433203696, |
| "eval_biology_loss": 1.2846676111221313, |
| "eval_biology_mean_token_accuracy": 0.6853172712922097, |
| "eval_biology_num_tokens": 5980229.0, |
| "eval_biology_runtime": 39.7136, |
| "eval_biology_samples_per_second": 12.59, |
| "eval_biology_steps_per_second": 12.59, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.400600901352028, |
| "eval_chemistry_entropy": 0.8548561576008796, |
| "eval_chemistry_loss": 0.8478842973709106, |
| "eval_chemistry_mean_token_accuracy": 0.773112800002098, |
| "eval_chemistry_num_tokens": 5980229.0, |
| "eval_chemistry_runtime": 45.9488, |
| "eval_chemistry_samples_per_second": 10.882, |
| "eval_chemistry_steps_per_second": 10.882, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.7898269753903151, |
| "epoch": 0.4050520224781617, |
| "grad_norm": 40.25, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 12.7209, |
| "mean_token_accuracy": 0.784660654142499, |
| "num_tokens": 6048508.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.8143722828477621, |
| "epoch": 0.40950314360429535, |
| "grad_norm": 44.5, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 13.2591, |
| "mean_token_accuracy": 0.7791141759604215, |
| "num_tokens": 6114855.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.8494924793019891, |
| "epoch": 0.41395426473042896, |
| "grad_norm": 40.0, |
| "learning_rate": 1.858e-05, |
| "loss": 13.506, |
| "mean_token_accuracy": 0.776913444697857, |
| "num_tokens": 6179050.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.8335931519046426, |
| "epoch": 0.4184053858565626, |
| "grad_norm": 45.25, |
| "learning_rate": 1.878e-05, |
| "loss": 13.6217, |
| "mean_token_accuracy": 0.7724651444703341, |
| "num_tokens": 6244987.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.8118610519915819, |
| "epoch": 0.4228565069826963, |
| "grad_norm": 49.75, |
| "learning_rate": 1.898e-05, |
| "loss": 13.0423, |
| "mean_token_accuracy": 0.7783929593861103, |
| "num_tokens": 6310799.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.8028015844523907, |
| "epoch": 0.4273076281088299, |
| "grad_norm": 48.25, |
| "learning_rate": 1.918e-05, |
| "loss": 13.0412, |
| "mean_token_accuracy": 0.778607621230185, |
| "num_tokens": 6378152.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.8370830919593573, |
| "epoch": 0.43175874923496355, |
| "grad_norm": 44.75, |
| "learning_rate": 1.938e-05, |
| "loss": 13.2798, |
| "mean_token_accuracy": 0.7766136281192303, |
| "num_tokens": 6442491.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.862270618416369, |
| "epoch": 0.4362098703610972, |
| "grad_norm": 39.5, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 14.0144, |
| "mean_token_accuracy": 0.7675615277141332, |
| "num_tokens": 6510231.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.8360511595383286, |
| "epoch": 0.44066099148723087, |
| "grad_norm": 48.0, |
| "learning_rate": 1.978e-05, |
| "loss": 13.4815, |
| "mean_token_accuracy": 0.7760949255898595, |
| "num_tokens": 6572858.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.8388594528660178, |
| "epoch": 0.4451121126133645, |
| "grad_norm": 43.25, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 13.6609, |
| "mean_token_accuracy": 0.7700441874563694, |
| "num_tokens": 6637273.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.4451121126133645, |
| "eval_biology_entropy": 1.187324990749359, |
| "eval_biology_loss": 1.2962696552276611, |
| "eval_biology_mean_token_accuracy": 0.6844774860739707, |
| "eval_biology_num_tokens": 6637273.0, |
| "eval_biology_runtime": 39.733, |
| "eval_biology_samples_per_second": 12.584, |
| "eval_biology_steps_per_second": 12.584, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.4451121126133645, |
| "eval_chemistry_entropy": 0.8146472455263137, |
| "eval_chemistry_loss": 0.8452854156494141, |
| "eval_chemistry_mean_token_accuracy": 0.7733621709942817, |
| "eval_chemistry_num_tokens": 6637273.0, |
| "eval_chemistry_runtime": 46.1309, |
| "eval_chemistry_samples_per_second": 10.839, |
| "eval_chemistry_steps_per_second": 10.839, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.8283287117257714, |
| "epoch": 0.44956323373949814, |
| "grad_norm": 45.25, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 13.2785, |
| "mean_token_accuracy": 0.7738786302506924, |
| "num_tokens": 6700799.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.8240121186710894, |
| "epoch": 0.4540143548656318, |
| "grad_norm": 43.0, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 13.5714, |
| "mean_token_accuracy": 0.7778675271198153, |
| "num_tokens": 6765798.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.8027254937216639, |
| "epoch": 0.4584654759917654, |
| "grad_norm": 45.0, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 12.958, |
| "mean_token_accuracy": 0.7836722049862146, |
| "num_tokens": 6832774.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.8506452234461903, |
| "epoch": 0.46291659711789906, |
| "grad_norm": 40.75, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 13.6169, |
| "mean_token_accuracy": 0.7709769554436207, |
| "num_tokens": 6896684.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.7970458004623652, |
| "epoch": 0.4673677182440327, |
| "grad_norm": 49.5, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 13.1299, |
| "mean_token_accuracy": 0.7849638484418392, |
| "num_tokens": 6964423.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.811302705295384, |
| "epoch": 0.4718188393701664, |
| "grad_norm": 37.5, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 13.1097, |
| "mean_token_accuracy": 0.7798706620931626, |
| "num_tokens": 7033417.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.8476884752511978, |
| "epoch": 0.4762699604963, |
| "grad_norm": 44.5, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 13.6585, |
| "mean_token_accuracy": 0.7750785838812589, |
| "num_tokens": 7101274.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.8404533293098211, |
| "epoch": 0.48072108162243365, |
| "grad_norm": 39.5, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 13.6161, |
| "mean_token_accuracy": 0.7751947242766619, |
| "num_tokens": 7166608.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.7973418578505516, |
| "epoch": 0.4851722027485673, |
| "grad_norm": 58.5, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 12.8893, |
| "mean_token_accuracy": 0.7859396133571863, |
| "num_tokens": 7240958.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.8283490337431431, |
| "epoch": 0.4896233238747009, |
| "grad_norm": 43.0, |
| "learning_rate": 1.978e-05, |
| "loss": 13.2474, |
| "mean_token_accuracy": 0.7790504809468984, |
| "num_tokens": 7311968.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.4896233238747009, |
| "eval_biology_entropy": 1.2187969796061515, |
| "eval_biology_loss": 1.297374963760376, |
| "eval_biology_mean_token_accuracy": 0.6832944719195366, |
| "eval_biology_num_tokens": 7311968.0, |
| "eval_biology_runtime": 40.0896, |
| "eval_biology_samples_per_second": 12.472, |
| "eval_biology_steps_per_second": 12.472, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.4896233238747009, |
| "eval_chemistry_entropy": 0.8075549347996712, |
| "eval_chemistry_loss": 0.8330864310264587, |
| "eval_chemistry_mean_token_accuracy": 0.7758210087418557, |
| "eval_chemistry_num_tokens": 7311968.0, |
| "eval_chemistry_runtime": 46.2026, |
| "eval_chemistry_samples_per_second": 10.822, |
| "eval_chemistry_steps_per_second": 10.822, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.8035251742228866, |
| "epoch": 0.4940744450008346, |
| "grad_norm": 42.25, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 13.0881, |
| "mean_token_accuracy": 0.7785759992897511, |
| "num_tokens": 7378961.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.7911803729832172, |
| "epoch": 0.49852556612696824, |
| "grad_norm": 47.0, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 12.8175, |
| "mean_token_accuracy": 0.7857681257650256, |
| "num_tokens": 7444923.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.8341826571151614, |
| "epoch": 0.5029766872531019, |
| "grad_norm": 42.0, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 13.2652, |
| "mean_token_accuracy": 0.7798049133270979, |
| "num_tokens": 7509885.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.8292949998751282, |
| "epoch": 0.5074278083792355, |
| "grad_norm": 42.25, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 13.6776, |
| "mean_token_accuracy": 0.7743924837559462, |
| "num_tokens": 7576456.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.867630060762167, |
| "epoch": 0.5118789295053692, |
| "grad_norm": 44.25, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 13.874, |
| "mean_token_accuracy": 0.7730787601321936, |
| "num_tokens": 7641821.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.800755001604557, |
| "epoch": 0.5163300506315028, |
| "grad_norm": 47.0, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 12.939, |
| "mean_token_accuracy": 0.7809992711991072, |
| "num_tokens": 7706502.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.8834998097270728, |
| "epoch": 0.5207811717576365, |
| "grad_norm": 48.0, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 14.3515, |
| "mean_token_accuracy": 0.7673096172511578, |
| "num_tokens": 7772353.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.8068421924486756, |
| "epoch": 0.52523229288377, |
| "grad_norm": 38.25, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 13.219, |
| "mean_token_accuracy": 0.7816231641918421, |
| "num_tokens": 7838516.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.8032166380435228, |
| "epoch": 0.5296834140099037, |
| "grad_norm": 52.25, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 12.8455, |
| "mean_token_accuracy": 0.7809245727956295, |
| "num_tokens": 7902568.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.8334995551034808, |
| "epoch": 0.5341345351360374, |
| "grad_norm": 42.25, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 13.3201, |
| "mean_token_accuracy": 0.7768194541335106, |
| "num_tokens": 7969704.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.5341345351360374, |
| "eval_biology_entropy": 1.2117234426736831, |
| "eval_biology_loss": 1.300293207168579, |
| "eval_biology_mean_token_accuracy": 0.6825520681738854, |
| "eval_biology_num_tokens": 7969704.0, |
| "eval_biology_runtime": 39.4328, |
| "eval_biology_samples_per_second": 12.68, |
| "eval_biology_steps_per_second": 12.68, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.5341345351360374, |
| "eval_chemistry_entropy": 0.7991210364103317, |
| "eval_chemistry_loss": 0.8259859681129456, |
| "eval_chemistry_mean_token_accuracy": 0.7772398797273636, |
| "eval_chemistry_num_tokens": 7969704.0, |
| "eval_chemistry_runtime": 45.8311, |
| "eval_chemistry_samples_per_second": 10.91, |
| "eval_chemistry_steps_per_second": 10.91, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.7765231873840094, |
| "epoch": 0.538585656262171, |
| "grad_norm": 41.25, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 12.6868, |
| "mean_token_accuracy": 0.7813247825950385, |
| "num_tokens": 8034765.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.8038391519337893, |
| "epoch": 0.5430367773883047, |
| "grad_norm": 35.5, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 13.2505, |
| "mean_token_accuracy": 0.7795963916927576, |
| "num_tokens": 8103869.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.7666595270857215, |
| "epoch": 0.5474878985144384, |
| "grad_norm": 41.25, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 12.3886, |
| "mean_token_accuracy": 0.7900463610887527, |
| "num_tokens": 8173386.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.8262818416580557, |
| "epoch": 0.551939019640572, |
| "grad_norm": 37.0, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 13.2345, |
| "mean_token_accuracy": 0.7755838381126523, |
| "num_tokens": 8242162.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.8054397076368331, |
| "epoch": 0.5563901407667056, |
| "grad_norm": 48.5, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 13.0502, |
| "mean_token_accuracy": 0.7804440699517727, |
| "num_tokens": 8308919.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.811793964356184, |
| "epoch": 0.5608412618928392, |
| "grad_norm": 49.25, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 13.3739, |
| "mean_token_accuracy": 0.7758540976792574, |
| "num_tokens": 8365617.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.8213885102421046, |
| "epoch": 0.5652923830189729, |
| "grad_norm": 37.5, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 13.2316, |
| "mean_token_accuracy": 0.7780250526964665, |
| "num_tokens": 8430728.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.7969466263428331, |
| "epoch": 0.5697435041451065, |
| "grad_norm": 34.25, |
| "learning_rate": 1.938e-05, |
| "loss": 12.6971, |
| "mean_token_accuracy": 0.7868836035951972, |
| "num_tokens": 8497852.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.8166826661676169, |
| "epoch": 0.5741946252712402, |
| "grad_norm": 43.75, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 13.4076, |
| "mean_token_accuracy": 0.7757051605731249, |
| "num_tokens": 8558952.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.8129263132810592, |
| "epoch": 0.5786457463973739, |
| "grad_norm": 44.5, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 13.1537, |
| "mean_token_accuracy": 0.7816494394093751, |
| "num_tokens": 8622868.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.5786457463973739, |
| "eval_biology_entropy": 1.2590930373072624, |
| "eval_biology_loss": 1.3004297018051147, |
| "eval_biology_mean_token_accuracy": 0.6824918667078018, |
| "eval_biology_num_tokens": 8622868.0, |
| "eval_biology_runtime": 39.5147, |
| "eval_biology_samples_per_second": 12.654, |
| "eval_biology_steps_per_second": 12.654, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.5786457463973739, |
| "eval_chemistry_entropy": 0.8134206305742264, |
| "eval_chemistry_loss": 0.8170297741889954, |
| "eval_chemistry_mean_token_accuracy": 0.7793359256386757, |
| "eval_chemistry_num_tokens": 8622868.0, |
| "eval_chemistry_runtime": 45.8445, |
| "eval_chemistry_samples_per_second": 10.906, |
| "eval_chemistry_steps_per_second": 10.906, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.7748439759016037, |
| "epoch": 0.5830968675235075, |
| "grad_norm": 40.25, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 12.585, |
| "mean_token_accuracy": 0.7868875458836555, |
| "num_tokens": 8694510.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.8616355959326029, |
| "epoch": 0.5875479886496411, |
| "grad_norm": 49.25, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 14.0344, |
| "mean_token_accuracy": 0.766650452464819, |
| "num_tokens": 8757753.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.8312898099422454, |
| "epoch": 0.5919991097757747, |
| "grad_norm": 44.0, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 13.3145, |
| "mean_token_accuracy": 0.7796449743211269, |
| "num_tokens": 8823153.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.8332564871758222, |
| "epoch": 0.5964502309019084, |
| "grad_norm": 41.5, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 13.599, |
| "mean_token_accuracy": 0.7727281775325536, |
| "num_tokens": 8886401.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.8084427203983069, |
| "epoch": 0.6009013520280421, |
| "grad_norm": 36.75, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 12.8022, |
| "mean_token_accuracy": 0.7834546566009521, |
| "num_tokens": 8958530.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.8060288658365607, |
| "epoch": 0.6053524731541757, |
| "grad_norm": 41.0, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 13.1094, |
| "mean_token_accuracy": 0.7809524293988943, |
| "num_tokens": 9024677.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.8401720520108938, |
| "epoch": 0.6098035942803094, |
| "grad_norm": 40.75, |
| "learning_rate": 1.918e-05, |
| "loss": 13.5138, |
| "mean_token_accuracy": 0.7706755470484495, |
| "num_tokens": 9090280.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.798151072487235, |
| "epoch": 0.614254715406443, |
| "grad_norm": 43.5, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 12.9648, |
| "mean_token_accuracy": 0.7831501496955753, |
| "num_tokens": 9153771.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.7643128799274563, |
| "epoch": 0.6187058365325766, |
| "grad_norm": 39.25, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 12.2452, |
| "mean_token_accuracy": 0.7925934199243784, |
| "num_tokens": 9227117.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.8085228271782399, |
| "epoch": 0.6231569576587103, |
| "grad_norm": 43.0, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 13.2475, |
| "mean_token_accuracy": 0.7788113884627819, |
| "num_tokens": 9291760.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.6231569576587103, |
| "eval_biology_entropy": 1.2567389221787453, |
| "eval_biology_loss": 1.3026176691055298, |
| "eval_biology_mean_token_accuracy": 0.6818688949346542, |
| "eval_biology_num_tokens": 9291760.0, |
| "eval_biology_runtime": 39.2644, |
| "eval_biology_samples_per_second": 12.734, |
| "eval_biology_steps_per_second": 12.734, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.6231569576587103, |
| "eval_chemistry_entropy": 0.8002244250178338, |
| "eval_chemistry_loss": 0.8070799708366394, |
| "eval_chemistry_mean_token_accuracy": 0.7815486862063408, |
| "eval_chemistry_num_tokens": 9291760.0, |
| "eval_chemistry_runtime": 55.9364, |
| "eval_chemistry_samples_per_second": 8.939, |
| "eval_chemistry_steps_per_second": 8.939, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.856862205825746, |
| "epoch": 0.6276080787848439, |
| "grad_norm": 43.75, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 13.4277, |
| "mean_token_accuracy": 0.7707107689231634, |
| "num_tokens": 9357561.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.7735245639458299, |
| "epoch": 0.6320591999109776, |
| "grad_norm": 48.0, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 12.943, |
| "mean_token_accuracy": 0.7830769792199135, |
| "num_tokens": 9425022.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.8000387817621231, |
| "epoch": 0.6365103210371112, |
| "grad_norm": 49.0, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 12.637, |
| "mean_token_accuracy": 0.7861681949347258, |
| "num_tokens": 9492028.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.7522478165104985, |
| "epoch": 0.6409614421632449, |
| "grad_norm": 42.0, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 12.2016, |
| "mean_token_accuracy": 0.7882154919207096, |
| "num_tokens": 9561091.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.7836794227361679, |
| "epoch": 0.6454125632893785, |
| "grad_norm": 51.75, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 12.744, |
| "mean_token_accuracy": 0.7859560146927833, |
| "num_tokens": 9628383.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.7407985650002956, |
| "epoch": 0.6498636844155121, |
| "grad_norm": 34.0, |
| "learning_rate": 1.898e-05, |
| "loss": 11.8696, |
| "mean_token_accuracy": 0.7983961008489132, |
| "num_tokens": 9695723.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.7636277657002211, |
| "epoch": 0.6543148055416458, |
| "grad_norm": 38.75, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 12.4201, |
| "mean_token_accuracy": 0.7915539544075727, |
| "num_tokens": 9763393.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.8254500133916736, |
| "epoch": 0.6587659266677794, |
| "grad_norm": 38.25, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 13.0763, |
| "mean_token_accuracy": 0.7825288005173207, |
| "num_tokens": 9827272.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.7628985194489359, |
| "epoch": 0.6632170477939131, |
| "grad_norm": 35.75, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 12.4827, |
| "mean_token_accuracy": 0.7848137805238367, |
| "num_tokens": 9892418.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.7648764431476593, |
| "epoch": 0.6676681689200468, |
| "grad_norm": 40.25, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 12.3863, |
| "mean_token_accuracy": 0.7885230954736471, |
| "num_tokens": 9955369.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.6676681689200468, |
| "eval_biology_entropy": 1.232845685839653, |
| "eval_biology_loss": 1.299937129020691, |
| "eval_biology_mean_token_accuracy": 0.6833411865234374, |
| "eval_biology_num_tokens": 9955369.0, |
| "eval_biology_runtime": 39.6042, |
| "eval_biology_samples_per_second": 12.625, |
| "eval_biology_steps_per_second": 12.625, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.6676681689200468, |
| "eval_chemistry_entropy": 0.7862835813760758, |
| "eval_chemistry_loss": 0.7996346354484558, |
| "eval_chemistry_mean_token_accuracy": 0.7832771391272545, |
| "eval_chemistry_num_tokens": 9955369.0, |
| "eval_chemistry_runtime": 46.138, |
| "eval_chemistry_samples_per_second": 10.837, |
| "eval_chemistry_steps_per_second": 10.837, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.7809047346934677, |
| "epoch": 0.6721192900461804, |
| "grad_norm": 41.25, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 12.5052, |
| "mean_token_accuracy": 0.7869493119418621, |
| "num_tokens": 10026298.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.7423510169610381, |
| "epoch": 0.676570411172314, |
| "grad_norm": 36.0, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 11.9187, |
| "mean_token_accuracy": 0.7974143566563725, |
| "num_tokens": 10096065.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.771459529362619, |
| "epoch": 0.6810215322984476, |
| "grad_norm": 40.5, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 12.7462, |
| "mean_token_accuracy": 0.7863322600722313, |
| "num_tokens": 10160745.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.7509715856052935, |
| "epoch": 0.6854726534245813, |
| "grad_norm": 35.75, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 11.927, |
| "mean_token_accuracy": 0.7947466436773538, |
| "num_tokens": 10233806.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.7742771266028285, |
| "epoch": 0.689923774550715, |
| "grad_norm": 42.5, |
| "learning_rate": 1.878e-05, |
| "loss": 12.5918, |
| "mean_token_accuracy": 0.7863378578796982, |
| "num_tokens": 10298845.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.7730063889175653, |
| "epoch": 0.6943748956768486, |
| "grad_norm": 38.25, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 12.4047, |
| "mean_token_accuracy": 0.7885195638984441, |
| "num_tokens": 10364601.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.7708892775699496, |
| "epoch": 0.6988260168029823, |
| "grad_norm": 40.75, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 12.6199, |
| "mean_token_accuracy": 0.7856390193104744, |
| "num_tokens": 10434433.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.7958233149722218, |
| "epoch": 0.703277137929116, |
| "grad_norm": 43.75, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 12.8595, |
| "mean_token_accuracy": 0.7833300601691008, |
| "num_tokens": 10500891.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.7902805691584944, |
| "epoch": 0.7077282590552495, |
| "grad_norm": 40.0, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 12.8921, |
| "mean_token_accuracy": 0.7834575500339269, |
| "num_tokens": 10567784.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.7975674813613296, |
| "epoch": 0.7121793801813832, |
| "grad_norm": 38.5, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 12.4954, |
| "mean_token_accuracy": 0.7869599737226963, |
| "num_tokens": 10633325.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.7121793801813832, |
| "eval_biology_entropy": 1.2419219986200332, |
| "eval_biology_loss": 1.3013286590576172, |
| "eval_biology_mean_token_accuracy": 0.6823853524923325, |
| "eval_biology_num_tokens": 10633325.0, |
| "eval_biology_runtime": 40.8294, |
| "eval_biology_samples_per_second": 12.246, |
| "eval_biology_steps_per_second": 12.246, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.7121793801813832, |
| "eval_chemistry_entropy": 0.7956894148588181, |
| "eval_chemistry_loss": 0.7937653660774231, |
| "eval_chemistry_mean_token_accuracy": 0.7838373360037804, |
| "eval_chemistry_num_tokens": 10633325.0, |
| "eval_chemistry_runtime": 46.4829, |
| "eval_chemistry_samples_per_second": 10.757, |
| "eval_chemistry_steps_per_second": 10.757, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.7231507489457727, |
| "epoch": 0.7166305013075168, |
| "grad_norm": 44.25, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 11.7926, |
| "mean_token_accuracy": 0.7977068889886141, |
| "num_tokens": 10699737.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.7011674824170768, |
| "epoch": 0.7210816224336505, |
| "grad_norm": 47.25, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 11.2514, |
| "mean_token_accuracy": 0.8032249186187983, |
| "num_tokens": 10764319.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.7829309536144138, |
| "epoch": 0.7255327435597841, |
| "grad_norm": 37.0, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 12.7194, |
| "mean_token_accuracy": 0.7839431796222925, |
| "num_tokens": 10831617.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.7741082075983285, |
| "epoch": 0.7299838646859178, |
| "grad_norm": 44.0, |
| "learning_rate": 1.858e-05, |
| "loss": 12.5457, |
| "mean_token_accuracy": 0.7877275109291076, |
| "num_tokens": 10897916.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.7683334495872259, |
| "epoch": 0.7344349858120514, |
| "grad_norm": 40.0, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 12.3848, |
| "mean_token_accuracy": 0.7887299537658692, |
| "num_tokens": 10965483.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.7886540442705154, |
| "epoch": 0.738886106938185, |
| "grad_norm": 37.5, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 12.7194, |
| "mean_token_accuracy": 0.7842936536297203, |
| "num_tokens": 11030485.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.7601794632151723, |
| "epoch": 0.7433372280643187, |
| "grad_norm": 42.5, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 12.345, |
| "mean_token_accuracy": 0.789710770919919, |
| "num_tokens": 11097108.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.7141751017421484, |
| "epoch": 0.7477883491904523, |
| "grad_norm": 40.5, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 11.629, |
| "mean_token_accuracy": 0.8011402323842048, |
| "num_tokens": 11165356.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.7918755512684583, |
| "epoch": 0.752239470316586, |
| "grad_norm": 36.0, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 12.6792, |
| "mean_token_accuracy": 0.7855523183941842, |
| "num_tokens": 11234429.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.7715185107663274, |
| "epoch": 0.7566905914427197, |
| "grad_norm": 40.75, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 12.3163, |
| "mean_token_accuracy": 0.7902058430016041, |
| "num_tokens": 11299223.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.7566905914427197, |
| "eval_biology_entropy": 1.2339779297113418, |
| "eval_biology_loss": 1.3050655126571655, |
| "eval_biology_mean_token_accuracy": 0.6822574281096458, |
| "eval_biology_num_tokens": 11299223.0, |
| "eval_biology_runtime": 39.9731, |
| "eval_biology_samples_per_second": 12.508, |
| "eval_biology_steps_per_second": 12.508, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.7566905914427197, |
| "eval_chemistry_entropy": 0.7556659379005433, |
| "eval_chemistry_loss": 0.7893115282058716, |
| "eval_chemistry_mean_token_accuracy": 0.7856479023098946, |
| "eval_chemistry_num_tokens": 11299223.0, |
| "eval_chemistry_runtime": 48.0592, |
| "eval_chemistry_samples_per_second": 10.404, |
| "eval_chemistry_steps_per_second": 10.404, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.7491960693150759, |
| "epoch": 0.7611417125688533, |
| "grad_norm": 37.5, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 12.221, |
| "mean_token_accuracy": 0.7910046689212322, |
| "num_tokens": 11371200.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.7779044238850474, |
| "epoch": 0.7655928336949869, |
| "grad_norm": 36.0, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 12.6537, |
| "mean_token_accuracy": 0.7857811234891414, |
| "num_tokens": 11436799.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.7611921314150095, |
| "epoch": 0.7700439548211205, |
| "grad_norm": 46.25, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 12.1793, |
| "mean_token_accuracy": 0.7942970298230648, |
| "num_tokens": 11502965.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.7916558500379324, |
| "epoch": 0.7744950759472542, |
| "grad_norm": 39.5, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 12.728, |
| "mean_token_accuracy": 0.7856421928852797, |
| "num_tokens": 11567422.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.7364842056296765, |
| "epoch": 0.7789461970733879, |
| "grad_norm": 30.625, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 12.1674, |
| "mean_token_accuracy": 0.791620584949851, |
| "num_tokens": 11637076.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.8320787468925118, |
| "epoch": 0.7833973181995215, |
| "grad_norm": 38.0, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 13.1941, |
| "mean_token_accuracy": 0.7748327614739537, |
| "num_tokens": 11703496.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.7116687665693462, |
| "epoch": 0.7878484393256552, |
| "grad_norm": 29.625, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 11.5027, |
| "mean_token_accuracy": 0.8015623264014721, |
| "num_tokens": 11772773.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.7382532864809036, |
| "epoch": 0.7922995604517888, |
| "grad_norm": 37.25, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 12.107, |
| "mean_token_accuracy": 0.7927792508155107, |
| "num_tokens": 11840317.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.8406919915229082, |
| "epoch": 0.7967506815779224, |
| "grad_norm": 40.75, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 13.4841, |
| "mean_token_accuracy": 0.7751987297087908, |
| "num_tokens": 11903376.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.7729794921353459, |
| "epoch": 0.801201802704056, |
| "grad_norm": 41.25, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 12.3404, |
| "mean_token_accuracy": 0.789885114133358, |
| "num_tokens": 11965530.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.801201802704056, |
| "eval_biology_entropy": 1.238894326388836, |
| "eval_biology_loss": 1.3023967742919922, |
| "eval_biology_mean_token_accuracy": 0.6827202830314636, |
| "eval_biology_num_tokens": 11965530.0, |
| "eval_biology_runtime": 39.8685, |
| "eval_biology_samples_per_second": 12.541, |
| "eval_biology_steps_per_second": 12.541, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.801201802704056, |
| "eval_chemistry_entropy": 0.7590603602528572, |
| "eval_chemistry_loss": 0.7831795811653137, |
| "eval_chemistry_mean_token_accuracy": 0.7866261592507362, |
| "eval_chemistry_num_tokens": 11965530.0, |
| "eval_chemistry_runtime": 48.0982, |
| "eval_chemistry_samples_per_second": 10.395, |
| "eval_chemistry_steps_per_second": 10.395, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.7708059819415212, |
| "epoch": 0.8056529238301897, |
| "grad_norm": 43.25, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 12.5294, |
| "mean_token_accuracy": 0.7862987028434872, |
| "num_tokens": 12031493.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.7691075187176466, |
| "epoch": 0.8101040449563234, |
| "grad_norm": 34.0, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 12.552, |
| "mean_token_accuracy": 0.7872704153880477, |
| "num_tokens": 12097917.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.7835509760305286, |
| "epoch": 0.814555166082457, |
| "grad_norm": 35.25, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 12.581, |
| "mean_token_accuracy": 0.7855123173445463, |
| "num_tokens": 12163029.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.7693919812329113, |
| "epoch": 0.8190062872085907, |
| "grad_norm": 40.25, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 12.5332, |
| "mean_token_accuracy": 0.7862321555614471, |
| "num_tokens": 12224427.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.8024251624941826, |
| "epoch": 0.8234574083347244, |
| "grad_norm": 36.5, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 12.7152, |
| "mean_token_accuracy": 0.7867181565612554, |
| "num_tokens": 12291917.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.7431695537641645, |
| "epoch": 0.8279085294608579, |
| "grad_norm": 37.5, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 12.067, |
| "mean_token_accuracy": 0.7929942118003964, |
| "num_tokens": 12365332.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.7609774840995669, |
| "epoch": 0.8323596505869916, |
| "grad_norm": 32.25, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 12.307, |
| "mean_token_accuracy": 0.7898187723010779, |
| "num_tokens": 12441423.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.7366092208772897, |
| "epoch": 0.8368107717131252, |
| "grad_norm": 43.0, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 11.6312, |
| "mean_token_accuracy": 0.8000296927988529, |
| "num_tokens": 12509124.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.7049085019156337, |
| "epoch": 0.8412618928392589, |
| "grad_norm": 34.5, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 11.6414, |
| "mean_token_accuracy": 0.7974906180053949, |
| "num_tokens": 12575355.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.7814744580537081, |
| "epoch": 0.8457130139653926, |
| "grad_norm": 44.25, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 12.6335, |
| "mean_token_accuracy": 0.7851035960018635, |
| "num_tokens": 12642651.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.8457130139653926, |
| "eval_biology_entropy": 1.2500024722218515, |
| "eval_biology_loss": 1.3012299537658691, |
| "eval_biology_mean_token_accuracy": 0.6823599907159805, |
| "eval_biology_num_tokens": 12642651.0, |
| "eval_biology_runtime": 39.4244, |
| "eval_biology_samples_per_second": 12.682, |
| "eval_biology_steps_per_second": 12.682, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.8457130139653926, |
| "eval_chemistry_entropy": 0.7712779935002327, |
| "eval_chemistry_loss": 0.7782201170921326, |
| "eval_chemistry_mean_token_accuracy": 0.787936452627182, |
| "eval_chemistry_num_tokens": 12642651.0, |
| "eval_chemistry_runtime": 45.8268, |
| "eval_chemistry_samples_per_second": 10.911, |
| "eval_chemistry_steps_per_second": 10.911, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.7286117426119745, |
| "epoch": 0.8501641350915262, |
| "grad_norm": 38.0, |
| "learning_rate": 1.798e-05, |
| "loss": 11.5153, |
| "mean_token_accuracy": 0.8004030931741, |
| "num_tokens": 12710065.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.7636802634224296, |
| "epoch": 0.8546152562176598, |
| "grad_norm": 41.75, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 12.3297, |
| "mean_token_accuracy": 0.7917378932237625, |
| "num_tokens": 12778408.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.7151092055253685, |
| "epoch": 0.8590663773437934, |
| "grad_norm": 44.75, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 11.6627, |
| "mean_token_accuracy": 0.7980649210512638, |
| "num_tokens": 12845949.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.7645737134851516, |
| "epoch": 0.8635174984699271, |
| "grad_norm": 41.25, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 12.1823, |
| "mean_token_accuracy": 0.7940845835953951, |
| "num_tokens": 12910630.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.7626768484711647, |
| "epoch": 0.8679686195960608, |
| "grad_norm": 33.5, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 12.4, |
| "mean_token_accuracy": 0.7878372304141521, |
| "num_tokens": 12975379.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.728480844758451, |
| "epoch": 0.8724197407221944, |
| "grad_norm": 34.25, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 11.683, |
| "mean_token_accuracy": 0.7970448518171906, |
| "num_tokens": 13046473.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.8084357729181647, |
| "epoch": 0.8768708618483281, |
| "grad_norm": 36.75, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 13.1812, |
| "mean_token_accuracy": 0.780071578361094, |
| "num_tokens": 13105019.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.8157536951825023, |
| "epoch": 0.8813219829744617, |
| "grad_norm": 39.0, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 12.8778, |
| "mean_token_accuracy": 0.7852013517171145, |
| "num_tokens": 13167768.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.747572572156787, |
| "epoch": 0.8857731041005953, |
| "grad_norm": 35.75, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 11.9019, |
| "mean_token_accuracy": 0.7958117298781872, |
| "num_tokens": 13235899.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.7473404568620026, |
| "epoch": 0.890224225226729, |
| "grad_norm": 33.25, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 12.0581, |
| "mean_token_accuracy": 0.7940714538097382, |
| "num_tokens": 13301659.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.890224225226729, |
| "eval_biology_entropy": 1.2476614614725112, |
| "eval_biology_loss": 1.3003644943237305, |
| "eval_biology_mean_token_accuracy": 0.682231693148613, |
| "eval_biology_num_tokens": 13301659.0, |
| "eval_biology_runtime": 39.7204, |
| "eval_biology_samples_per_second": 12.588, |
| "eval_biology_steps_per_second": 12.588, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.890224225226729, |
| "eval_chemistry_entropy": 0.7678308563828469, |
| "eval_chemistry_loss": 0.7745650410652161, |
| "eval_chemistry_mean_token_accuracy": 0.7882890626788139, |
| "eval_chemistry_num_tokens": 13301659.0, |
| "eval_chemistry_runtime": 46.1368, |
| "eval_chemistry_samples_per_second": 10.837, |
| "eval_chemistry_steps_per_second": 10.837, |
| "step": 2000 |
| }, |
| { |
| "entropy": 0.7491364620625973, |
| "epoch": 0.8946753463528626, |
| "grad_norm": 40.75, |
| "learning_rate": 1.7757777777777777e-05, |
| "loss": 12.0785, |
| "mean_token_accuracy": 0.7944943491369486, |
| "num_tokens": 13368880.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 0.7931207174435257, |
| "epoch": 0.8991264674789963, |
| "grad_norm": 43.25, |
| "learning_rate": 1.7735555555555558e-05, |
| "loss": 12.6523, |
| "mean_token_accuracy": 0.78772840090096, |
| "num_tokens": 13434364.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 0.7574795215390623, |
| "epoch": 0.9035775886051299, |
| "grad_norm": 37.75, |
| "learning_rate": 1.7713333333333335e-05, |
| "loss": 12.3514, |
| "mean_token_accuracy": 0.7918346397578716, |
| "num_tokens": 13499871.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 0.7294158147647977, |
| "epoch": 0.9080287097312636, |
| "grad_norm": 34.25, |
| "learning_rate": 1.7691111111111113e-05, |
| "loss": 11.6435, |
| "mean_token_accuracy": 0.8019150290638208, |
| "num_tokens": 13570986.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 0.6758658250793814, |
| "epoch": 0.9124798308573973, |
| "grad_norm": 36.0, |
| "learning_rate": 1.766888888888889e-05, |
| "loss": 10.8889, |
| "mean_token_accuracy": 0.8116297330707312, |
| "num_tokens": 13639400.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 0.7913496998138726, |
| "epoch": 0.9169309519835308, |
| "grad_norm": 38.25, |
| "learning_rate": 1.7646666666666668e-05, |
| "loss": 12.8282, |
| "mean_token_accuracy": 0.7836458418518305, |
| "num_tokens": 13706343.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 0.8106502434238791, |
| "epoch": 0.9213820731096645, |
| "grad_norm": 44.5, |
| "learning_rate": 1.7624444444444446e-05, |
| "loss": 13.0573, |
| "mean_token_accuracy": 0.7821768958121538, |
| "num_tokens": 13772145.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 0.7930893866345287, |
| "epoch": 0.9258331942357981, |
| "grad_norm": 30.0, |
| "learning_rate": 1.7602222222222223e-05, |
| "loss": 12.7334, |
| "mean_token_accuracy": 0.7857527777552604, |
| "num_tokens": 13833901.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 0.7608680401928722, |
| "epoch": 0.9302843153619318, |
| "grad_norm": 43.25, |
| "learning_rate": 1.758e-05, |
| "loss": 12.2845, |
| "mean_token_accuracy": 0.7906877096742392, |
| "num_tokens": 13890922.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 0.7530267771333456, |
| "epoch": 0.9347354364880655, |
| "grad_norm": 38.5, |
| "learning_rate": 1.755777777777778e-05, |
| "loss": 12.1729, |
| "mean_token_accuracy": 0.791902843117714, |
| "num_tokens": 13954258.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.9347354364880655, |
| "eval_biology_entropy": 1.2157604062557221, |
| "eval_biology_loss": 1.3008348941802979, |
| "eval_biology_mean_token_accuracy": 0.682707477748394, |
| "eval_biology_num_tokens": 13954258.0, |
| "eval_biology_runtime": 40.7733, |
| "eval_biology_samples_per_second": 12.263, |
| "eval_biology_steps_per_second": 12.263, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.9347354364880655, |
| "eval_chemistry_entropy": 0.7506245082020759, |
| "eval_chemistry_loss": 0.7717900276184082, |
| "eval_chemistry_mean_token_accuracy": 0.7892645244002342, |
| "eval_chemistry_num_tokens": 13954258.0, |
| "eval_chemistry_runtime": 46.2743, |
| "eval_chemistry_samples_per_second": 10.805, |
| "eval_chemistry_steps_per_second": 10.805, |
| "step": 2100 |
| }, |
| { |
| "entropy": 0.7269412121735513, |
| "epoch": 0.9391865576141991, |
| "grad_norm": 43.0, |
| "learning_rate": 1.7535555555555556e-05, |
| "loss": 11.6708, |
| "mean_token_accuracy": 0.8014364942908287, |
| "num_tokens": 14024422.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 0.7339699132367968, |
| "epoch": 0.9436376787403328, |
| "grad_norm": 45.25, |
| "learning_rate": 1.7513333333333334e-05, |
| "loss": 11.8606, |
| "mean_token_accuracy": 0.7975826554000378, |
| "num_tokens": 14093888.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 0.7686109783127903, |
| "epoch": 0.9480887998664663, |
| "grad_norm": 42.75, |
| "learning_rate": 1.749111111111111e-05, |
| "loss": 12.4624, |
| "mean_token_accuracy": 0.7877364981919527, |
| "num_tokens": 14160983.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 0.7372720196843148, |
| "epoch": 0.9525399209926, |
| "grad_norm": 38.5, |
| "learning_rate": 1.746888888888889e-05, |
| "loss": 12.0564, |
| "mean_token_accuracy": 0.7951818112283945, |
| "num_tokens": 14222626.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 0.7825360232032835, |
| "epoch": 0.9569910421187336, |
| "grad_norm": 39.5, |
| "learning_rate": 1.7446666666666667e-05, |
| "loss": 12.5673, |
| "mean_token_accuracy": 0.7859856501221657, |
| "num_tokens": 14287730.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 0.7773288476280868, |
| "epoch": 0.9614421632448673, |
| "grad_norm": 39.0, |
| "learning_rate": 1.7424444444444444e-05, |
| "loss": 12.4443, |
| "mean_token_accuracy": 0.7885882891714573, |
| "num_tokens": 14354918.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 0.7363769317045807, |
| "epoch": 0.965893284371001, |
| "grad_norm": 44.25, |
| "learning_rate": 1.7402222222222222e-05, |
| "loss": 11.9357, |
| "mean_token_accuracy": 0.7947400573641061, |
| "num_tokens": 14427338.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 0.7489111572504044, |
| "epoch": 0.9703444054971346, |
| "grad_norm": 37.0, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 12.1624, |
| "mean_token_accuracy": 0.7900368690490722, |
| "num_tokens": 14493349.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 0.7617103135213256, |
| "epoch": 0.9747955266232682, |
| "grad_norm": 37.5, |
| "learning_rate": 1.735777777777778e-05, |
| "loss": 12.1613, |
| "mean_token_accuracy": 0.7906056232750416, |
| "num_tokens": 14555091.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 0.725032649282366, |
| "epoch": 0.9792466477494018, |
| "grad_norm": 33.5, |
| "learning_rate": 1.7335555555555558e-05, |
| "loss": 11.5895, |
| "mean_token_accuracy": 0.8007797665894032, |
| "num_tokens": 14620244.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.9792466477494018, |
| "eval_biology_entropy": 1.2365998299121856, |
| "eval_biology_loss": 1.303519606590271, |
| "eval_biology_mean_token_accuracy": 0.6815042721629143, |
| "eval_biology_num_tokens": 14620244.0, |
| "eval_biology_runtime": 40.1978, |
| "eval_biology_samples_per_second": 12.438, |
| "eval_biology_steps_per_second": 12.438, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.9792466477494018, |
| "eval_chemistry_entropy": 0.7446092162430287, |
| "eval_chemistry_loss": 0.7655045390129089, |
| "eval_chemistry_mean_token_accuracy": 0.7904723164439201, |
| "eval_chemistry_num_tokens": 14620244.0, |
| "eval_chemistry_runtime": 46.0753, |
| "eval_chemistry_samples_per_second": 10.852, |
| "eval_chemistry_steps_per_second": 10.852, |
| "step": 2200 |
| }, |
| { |
| "entropy": 0.7139286000281573, |
| "epoch": 0.9836977688755355, |
| "grad_norm": 36.5, |
| "learning_rate": 1.7313333333333336e-05, |
| "loss": 11.5837, |
| "mean_token_accuracy": 0.8020144417881966, |
| "num_tokens": 14688111.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 0.7395884351804852, |
| "epoch": 0.9881488900016692, |
| "grad_norm": 31.25, |
| "learning_rate": 1.7291111111111113e-05, |
| "loss": 11.9123, |
| "mean_token_accuracy": 0.7961550422012806, |
| "num_tokens": 14760951.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 0.7878218747675418, |
| "epoch": 0.9926000111278028, |
| "grad_norm": 36.5, |
| "learning_rate": 1.726888888888889e-05, |
| "loss": 12.7326, |
| "mean_token_accuracy": 0.7862138673663139, |
| "num_tokens": 14823141.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 0.7608288100920617, |
| "epoch": 0.9970511322539365, |
| "grad_norm": 37.0, |
| "learning_rate": 1.724666666666667e-05, |
| "loss": 12.2585, |
| "mean_token_accuracy": 0.7904601756483316, |
| "num_tokens": 14886593.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 0.7384239917064642, |
| "epoch": 1.00133533633784, |
| "grad_norm": 46.25, |
| "learning_rate": 1.7224444444444446e-05, |
| "loss": 11.0289, |
| "mean_token_accuracy": 0.8005193413852097, |
| "num_tokens": 14951602.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 0.6310809502378106, |
| "epoch": 1.0057864574639737, |
| "grad_norm": 39.75, |
| "learning_rate": 1.7202222222222224e-05, |
| "loss": 10.1315, |
| "mean_token_accuracy": 0.8204233139753342, |
| "num_tokens": 15016941.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 0.6455286235548556, |
| "epoch": 1.0102375785901074, |
| "grad_norm": 38.25, |
| "learning_rate": 1.718e-05, |
| "loss": 10.3133, |
| "mean_token_accuracy": 0.8160859376192093, |
| "num_tokens": 15083858.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 0.5831745907664299, |
| "epoch": 1.014688699716241, |
| "grad_norm": 50.25, |
| "learning_rate": 1.715777777777778e-05, |
| "loss": 9.3342, |
| "mean_token_accuracy": 0.8300177838653326, |
| "num_tokens": 15152865.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 0.6471488554030657, |
| "epoch": 1.0191398208423748, |
| "grad_norm": 43.75, |
| "learning_rate": 1.7135555555555557e-05, |
| "loss": 10.2835, |
| "mean_token_accuracy": 0.8149789605289698, |
| "num_tokens": 15218564.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 0.6528089676983655, |
| "epoch": 1.0235909419685083, |
| "grad_norm": 44.25, |
| "learning_rate": 1.7113333333333334e-05, |
| "loss": 10.3278, |
| "mean_token_accuracy": 0.8157306212931872, |
| "num_tokens": 15281217.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.0235909419685083, |
| "eval_biology_entropy": 1.0225402714014054, |
| "eval_biology_loss": 1.3434255123138428, |
| "eval_biology_mean_token_accuracy": 0.6797452400922775, |
| "eval_biology_num_tokens": 15281217.0, |
| "eval_biology_runtime": 39.5295, |
| "eval_biology_samples_per_second": 12.649, |
| "eval_biology_steps_per_second": 12.649, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.0235909419685083, |
| "eval_chemistry_entropy": 0.6383010303974151, |
| "eval_chemistry_loss": 0.783981204032898, |
| "eval_chemistry_mean_token_accuracy": 0.7899075618982315, |
| "eval_chemistry_num_tokens": 15281217.0, |
| "eval_chemistry_runtime": 45.8468, |
| "eval_chemistry_samples_per_second": 10.906, |
| "eval_chemistry_steps_per_second": 10.906, |
| "step": 2300 |
| }, |
| { |
| "entropy": 0.6117025960236788, |
| "epoch": 1.0280420630946419, |
| "grad_norm": 41.75, |
| "learning_rate": 1.7091111111111112e-05, |
| "loss": 9.897, |
| "mean_token_accuracy": 0.8194273430854082, |
| "num_tokens": 15346128.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 0.6535170486196875, |
| "epoch": 1.0324931842207756, |
| "grad_norm": 33.5, |
| "learning_rate": 1.706888888888889e-05, |
| "loss": 10.4595, |
| "mean_token_accuracy": 0.8142324227839708, |
| "num_tokens": 15412606.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 0.6200952081009745, |
| "epoch": 1.0369443053469092, |
| "grad_norm": 41.75, |
| "learning_rate": 1.704666666666667e-05, |
| "loss": 10.0173, |
| "mean_token_accuracy": 0.8197212640196085, |
| "num_tokens": 15484636.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 0.5985450498759747, |
| "epoch": 1.041395426473043, |
| "grad_norm": 47.5, |
| "learning_rate": 1.7024444444444445e-05, |
| "loss": 9.4372, |
| "mean_token_accuracy": 0.8322823897004128, |
| "num_tokens": 15556398.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 0.6191821810789406, |
| "epoch": 1.0458465475991765, |
| "grad_norm": 50.25, |
| "learning_rate": 1.7002222222222226e-05, |
| "loss": 9.9681, |
| "mean_token_accuracy": 0.8229743007570505, |
| "num_tokens": 15624248.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 0.6431950107216835, |
| "epoch": 1.0502976687253103, |
| "grad_norm": 43.25, |
| "learning_rate": 1.698e-05, |
| "loss": 10.2769, |
| "mean_token_accuracy": 0.8181428145617247, |
| "num_tokens": 15689184.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 0.6467125362716615, |
| "epoch": 1.0547487898514438, |
| "grad_norm": 51.0, |
| "learning_rate": 1.695777777777778e-05, |
| "loss": 10.3969, |
| "mean_token_accuracy": 0.8166089791804552, |
| "num_tokens": 15754370.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 0.6156461857259273, |
| "epoch": 1.0591999109775774, |
| "grad_norm": 36.5, |
| "learning_rate": 1.6935555555555555e-05, |
| "loss": 10.0446, |
| "mean_token_accuracy": 0.8230110257863998, |
| "num_tokens": 15824975.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 0.6369591388851404, |
| "epoch": 1.0636510321037111, |
| "grad_norm": 45.75, |
| "learning_rate": 1.6913333333333336e-05, |
| "loss": 10.2334, |
| "mean_token_accuracy": 0.8198270745575428, |
| "num_tokens": 15886149.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 0.6227695440873504, |
| "epoch": 1.0681021532298447, |
| "grad_norm": 41.5, |
| "learning_rate": 1.689111111111111e-05, |
| "loss": 9.9398, |
| "mean_token_accuracy": 0.821424588188529, |
| "num_tokens": 15956560.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.0681021532298447, |
| "eval_biology_entropy": 1.005448550403118, |
| "eval_biology_loss": 1.3492766618728638, |
| "eval_biology_mean_token_accuracy": 0.6789008138775825, |
| "eval_biology_num_tokens": 15956560.0, |
| "eval_biology_runtime": 39.9693, |
| "eval_biology_samples_per_second": 12.51, |
| "eval_biology_steps_per_second": 12.51, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.0681021532298447, |
| "eval_chemistry_entropy": 0.624100355386734, |
| "eval_chemistry_loss": 0.7797554731369019, |
| "eval_chemistry_mean_token_accuracy": 0.7903617503643036, |
| "eval_chemistry_num_tokens": 15956560.0, |
| "eval_chemistry_runtime": 46.1352, |
| "eval_chemistry_samples_per_second": 10.838, |
| "eval_chemistry_steps_per_second": 10.838, |
| "step": 2400 |
| }, |
| { |
| "entropy": 0.6266451105475426, |
| "epoch": 1.0725532743559785, |
| "grad_norm": 39.25, |
| "learning_rate": 1.686888888888889e-05, |
| "loss": 10.219, |
| "mean_token_accuracy": 0.8190864365547895, |
| "num_tokens": 16025633.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 0.6389865916222334, |
| "epoch": 1.077004395482112, |
| "grad_norm": 55.5, |
| "learning_rate": 1.684666666666667e-05, |
| "loss": 10.0901, |
| "mean_token_accuracy": 0.8181588027626276, |
| "num_tokens": 16094535.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 0.6265366824343801, |
| "epoch": 1.0814555166082458, |
| "grad_norm": 44.5, |
| "learning_rate": 1.6824444444444447e-05, |
| "loss": 10.2274, |
| "mean_token_accuracy": 0.8162553690373897, |
| "num_tokens": 16159781.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 0.6317514531314373, |
| "epoch": 1.0859066377343793, |
| "grad_norm": 41.5, |
| "learning_rate": 1.6802222222222224e-05, |
| "loss": 10.0703, |
| "mean_token_accuracy": 0.8204928413033485, |
| "num_tokens": 16227695.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 0.6408463085070253, |
| "epoch": 1.090357758860513, |
| "grad_norm": 52.0, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 10.1971, |
| "mean_token_accuracy": 0.8168174952268601, |
| "num_tokens": 16293211.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 0.624163047131151, |
| "epoch": 1.0948088799866467, |
| "grad_norm": 43.5, |
| "learning_rate": 1.675777777777778e-05, |
| "loss": 10.1139, |
| "mean_token_accuracy": 0.8228958930820227, |
| "num_tokens": 16361111.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 0.6617415506392718, |
| "epoch": 1.0992600011127802, |
| "grad_norm": 43.5, |
| "learning_rate": 1.6735555555555557e-05, |
| "loss": 10.5201, |
| "mean_token_accuracy": 0.8147166967391968, |
| "num_tokens": 16424587.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 0.6115672853775322, |
| "epoch": 1.103711122238914, |
| "grad_norm": 41.75, |
| "learning_rate": 1.6713333333333335e-05, |
| "loss": 9.9204, |
| "mean_token_accuracy": 0.824297409504652, |
| "num_tokens": 16495082.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 0.6393632598221302, |
| "epoch": 1.1081622433650475, |
| "grad_norm": 47.0, |
| "learning_rate": 1.6691111111111112e-05, |
| "loss": 10.2437, |
| "mean_token_accuracy": 0.8169457126408816, |
| "num_tokens": 16557596.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 0.6181258006952703, |
| "epoch": 1.1126133644911813, |
| "grad_norm": 33.25, |
| "learning_rate": 1.666888888888889e-05, |
| "loss": 10.0825, |
| "mean_token_accuracy": 0.8184908539056778, |
| "num_tokens": 16625999.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.1126133644911813, |
| "eval_biology_entropy": 1.0345592698454857, |
| "eval_biology_loss": 1.3405461311340332, |
| "eval_biology_mean_token_accuracy": 0.6796377742290497, |
| "eval_biology_num_tokens": 16625999.0, |
| "eval_biology_runtime": 39.4133, |
| "eval_biology_samples_per_second": 12.686, |
| "eval_biology_steps_per_second": 12.686, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.1126133644911813, |
| "eval_chemistry_entropy": 0.6612432144582272, |
| "eval_chemistry_loss": 0.7703909277915955, |
| "eval_chemistry_mean_token_accuracy": 0.7913251945972443, |
| "eval_chemistry_num_tokens": 16625999.0, |
| "eval_chemistry_runtime": 45.9075, |
| "eval_chemistry_samples_per_second": 10.891, |
| "eval_chemistry_steps_per_second": 10.891, |
| "step": 2500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1148458281800018e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|