{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 25, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.6712856006622314, "epoch": 0.1, "grad_norm": 0.6304148435592651, "learning_rate": 0.0001999990663152786, "loss": 2.0491, "mean_token_accuracy": 0.6684889650344848, "num_tokens": 122580.0, "step": 25 }, { "epoch": 0.1, "eval_entropy": 0.9992010481655598, "eval_loss": 0.9745309948921204, "eval_mean_token_accuracy": 0.815976720303297, "eval_num_tokens": 122580.0, "eval_runtime": 66.8951, "eval_samples_per_second": 29.898, "eval_steps_per_second": 0.478, "step": 25 }, { "entropy": 0.9771250176429749, "epoch": 0.2, "grad_norm": 0.4267440736293793, "learning_rate": 0.0001993694918299864, "loss": 0.9676, "mean_token_accuracy": 0.8179380083084107, "num_tokens": 244723.0, "step": 50 }, { "epoch": 0.2, "eval_entropy": 0.9376886244863272, "eval_loss": 0.940742015838623, "eval_mean_token_accuracy": 0.8212726972997189, "eval_num_tokens": 244723.0, "eval_runtime": 66.3094, "eval_samples_per_second": 30.162, "eval_steps_per_second": 0.483, "step": 50 }, { "entropy": 0.9467579674720764, "epoch": 0.3, "grad_norm": 0.39123401045799255, "learning_rate": 0.0001975812958575343, "loss": 0.9473, "mean_token_accuracy": 0.820980327129364, "num_tokens": 366803.0, "step": 75 }, { "epoch": 0.3, "eval_entropy": 0.930314002558589, "eval_loss": 0.9318345785140991, "eval_mean_token_accuracy": 0.8218902982771397, "eval_num_tokens": 366803.0, "eval_runtime": 66.1507, "eval_samples_per_second": 30.234, "eval_steps_per_second": 0.484, "step": 75 }, { "entropy": 0.9574691414833069, "epoch": 0.4, "grad_norm": 0.3710257112979889, "learning_rate": 0.00019465532828090735, "loss": 0.9545, "mean_token_accuracy": 0.8186049485206603, "num_tokens": 489374.0, "step": 100 }, { "epoch": 0.4, "eval_entropy": 0.9250399190932512, "eval_loss": 0.927577555179596, "eval_mean_token_accuracy": 0.8226032145321369, "eval_num_tokens": 489374.0, "eval_runtime": 66.6912, "eval_samples_per_second": 29.989, "eval_steps_per_second": 0.48, "step": 100 }, { "entropy": 0.9415939354896545, "epoch": 0.5, "grad_norm": 0.3609308898448944, "learning_rate": 0.00019062570509327992, "loss": 0.938, "mean_token_accuracy": 0.8219008255004883, "num_tokens": 611509.0, "step": 125 }, { "epoch": 0.5, "eval_entropy": 0.9134028796106577, "eval_loss": 0.9236319661140442, "eval_mean_token_accuracy": 0.8233284279704094, "eval_num_tokens": 611509.0, "eval_runtime": 67.9585, "eval_samples_per_second": 29.43, "eval_steps_per_second": 0.471, "step": 125 }, { "entropy": 0.9313359212875366, "epoch": 0.6, "grad_norm": 0.3689591586589813, "learning_rate": 0.00018553941061473218, "loss": 0.9277, "mean_token_accuracy": 0.8244240188598633, "num_tokens": 733437.0, "step": 150 }, { "epoch": 0.6, "eval_entropy": 0.9121778700500727, "eval_loss": 0.9206869006156921, "eval_mean_token_accuracy": 0.8243093993514776, "eval_num_tokens": 733437.0, "eval_runtime": 68.185, "eval_samples_per_second": 29.332, "eval_steps_per_second": 0.469, "step": 150 }, { "entropy": 0.9311716175079345, "epoch": 0.7, "grad_norm": 0.3817085027694702, "learning_rate": 0.00017945574966774376, "loss": 0.9292, "mean_token_accuracy": 0.8232095694541931, "num_tokens": 855571.0, "step": 175 }, { "epoch": 0.7, "eval_entropy": 0.9240057598799467, "eval_loss": 0.9190523028373718, "eval_mean_token_accuracy": 0.8243635054677725, "eval_num_tokens": 855571.0, "eval_runtime": 68.1745, "eval_samples_per_second": 29.336, "eval_steps_per_second": 0.469, "step": 175 }, { "entropy": 0.9368235039710998, "epoch": 0.8, "grad_norm": 0.343488872051239, "learning_rate": 0.00017244565609895074, "loss": 0.9328, "mean_token_accuracy": 0.822948260307312, "num_tokens": 977886.0, "step": 200 }, { "epoch": 0.8, "eval_entropy": 0.9235651567578316, "eval_loss": 0.9163441061973572, "eval_mean_token_accuracy": 0.8248479198664427, "eval_num_tokens": 977886.0, "eval_runtime": 68.3524, "eval_samples_per_second": 29.26, "eval_steps_per_second": 0.468, "step": 200 }, { "entropy": 0.9266206288337707, "epoch": 0.9, "grad_norm": 0.34697458148002625, "learning_rate": 0.00016459086570961594, "loss": 0.9247, "mean_token_accuracy": 0.8257838773727417, "num_tokens": 1099649.0, "step": 225 }, { "epoch": 0.9, "eval_entropy": 0.9170300494879484, "eval_loss": 0.9154583215713501, "eval_mean_token_accuracy": 0.8250364065170288, "eval_num_tokens": 1099649.0, "eval_runtime": 67.9598, "eval_samples_per_second": 29.429, "eval_steps_per_second": 0.471, "step": 225 }, { "entropy": 0.9486314964294433, "epoch": 1.0, "grad_norm": 0.38791951537132263, "learning_rate": 0.00015598296323822024, "loss": 0.9457, "mean_token_accuracy": 0.8199513030052185, "num_tokens": 1222434.0, "step": 250 }, { "epoch": 1.0, "eval_entropy": 0.9278132077306509, "eval_loss": 0.9133847951889038, "eval_mean_token_accuracy": 0.8252693247050047, "eval_num_tokens": 1222434.0, "eval_runtime": 67.957, "eval_samples_per_second": 29.43, "eval_steps_per_second": 0.471, "step": 250 }, { "entropy": 0.8935548496246338, "epoch": 1.1, "grad_norm": 0.3913302421569824, "learning_rate": 0.00014672231450710066, "loss": 0.8794, "mean_token_accuracy": 0.8301347184181214, "num_tokens": 1344769.0, "step": 275 }, { "epoch": 1.1, "eval_entropy": 0.8964261263608932, "eval_loss": 0.918248176574707, "eval_mean_token_accuracy": 0.8246872704476118, "eval_num_tokens": 1344769.0, "eval_runtime": 67.949, "eval_samples_per_second": 29.434, "eval_steps_per_second": 0.471, "step": 275 }, { "entropy": 0.8727394843101501, "epoch": 1.2, "grad_norm": 0.39644864201545715, "learning_rate": 0.00013691689618401835, "loss": 0.8669, "mean_token_accuracy": 0.8310828638076783, "num_tokens": 1466893.0, "step": 300 }, { "epoch": 1.2, "eval_entropy": 0.8782581854611635, "eval_loss": 0.917015552520752, "eval_mean_token_accuracy": 0.8250990845263004, "eval_num_tokens": 1466893.0, "eval_runtime": 67.8451, "eval_samples_per_second": 29.479, "eval_steps_per_second": 0.472, "step": 300 }, { "entropy": 0.8778977513313293, "epoch": 1.3, "grad_norm": 0.3395286500453949, "learning_rate": 0.00012668103680332012, "loss": 0.876, "mean_token_accuracy": 0.8312993144989014, "num_tokens": 1589094.0, "step": 325 }, { "epoch": 1.3, "eval_entropy": 0.8799433764070272, "eval_loss": 0.9174618124961853, "eval_mean_token_accuracy": 0.8249436803162098, "eval_num_tokens": 1589094.0, "eval_runtime": 68.1596, "eval_samples_per_second": 29.343, "eval_steps_per_second": 0.469, "step": 325 }, { "entropy": 0.8876111268997192, "epoch": 1.4, "grad_norm": 0.364728182554245, "learning_rate": 0.00011613408372604825, "loss": 0.8773, "mean_token_accuracy": 0.8295953154563904, "num_tokens": 1711583.0, "step": 350 }, { "epoch": 1.4, "eval_entropy": 0.85817476734519, "eval_loss": 0.9163104295730591, "eval_mean_token_accuracy": 0.8252090867608786, "eval_num_tokens": 1711583.0, "eval_runtime": 68.1812, "eval_samples_per_second": 29.334, "eval_steps_per_second": 0.469, "step": 350 }, { "entropy": 0.88423011302948, "epoch": 1.5, "grad_norm": 0.38693588972091675, "learning_rate": 0.00010539901158188398, "loss": 0.883, "mean_token_accuracy": 0.8286035037040711, "num_tokens": 1834553.0, "step": 375 }, { "epoch": 1.5, "eval_entropy": 0.873553803190589, "eval_loss": 0.9151167273521423, "eval_mean_token_accuracy": 0.8254394326359034, "eval_num_tokens": 1834553.0, "eval_runtime": 68.1992, "eval_samples_per_second": 29.326, "eval_steps_per_second": 0.469, "step": 375 }, { "entropy": 0.8715341877937317, "epoch": 1.6, "grad_norm": 0.3770897686481476, "learning_rate": 9.460098841811601e-05, "loss": 0.87, "mean_token_accuracy": 0.8320170021057129, "num_tokens": 1956574.0, "step": 400 }, { "epoch": 1.6, "eval_entropy": 0.87935302965343, "eval_loss": 0.9151723980903625, "eval_mean_token_accuracy": 0.8255144450813532, "eval_num_tokens": 1956574.0, "eval_runtime": 67.9177, "eval_samples_per_second": 29.447, "eval_steps_per_second": 0.471, "step": 400 }, { "entropy": 0.8710586881637573, "epoch": 1.7, "grad_norm": 0.3885301351547241, "learning_rate": 8.386591627395173e-05, "loss": 0.8672, "mean_token_accuracy": 0.8299804759025574, "num_tokens": 2078811.0, "step": 425 }, { "epoch": 1.7, "eval_entropy": 0.8739563841372728, "eval_loss": 0.9149895906448364, "eval_mean_token_accuracy": 0.8253704849630594, "eval_num_tokens": 2078811.0, "eval_runtime": 66.7099, "eval_samples_per_second": 29.981, "eval_steps_per_second": 0.48, "step": 425 }, { "entropy": 0.8689111661911011, "epoch": 1.8, "grad_norm": 0.38322678208351135, "learning_rate": 7.33189631966799e-05, "loss": 0.865, "mean_token_accuracy": 0.8322654938697815, "num_tokens": 2200571.0, "step": 450 }, { "epoch": 1.8, "eval_entropy": 0.8741269316524267, "eval_loss": 0.9147102236747742, "eval_mean_token_accuracy": 0.8253954574465752, "eval_num_tokens": 2200571.0, "eval_runtime": 66.1144, "eval_samples_per_second": 30.251, "eval_steps_per_second": 0.484, "step": 450 }, { "entropy": 0.8786772465705872, "epoch": 1.9, "grad_norm": 0.3754963278770447, "learning_rate": 6.308310381598168e-05, "loss": 0.8755, "mean_token_accuracy": 0.8297365355491638, "num_tokens": 2322788.0, "step": 475 }, { "epoch": 1.9, "eval_entropy": 0.8773031029850245, "eval_loss": 0.9134438633918762, "eval_mean_token_accuracy": 0.8253685813397169, "eval_num_tokens": 2322788.0, "eval_runtime": 66.1468, "eval_samples_per_second": 30.236, "eval_steps_per_second": 0.484, "step": 475 }, { "entropy": 0.8698205542564392, "epoch": 2.0, "grad_norm": 0.37209320068359375, "learning_rate": 5.327768549289934e-05, "loss": 0.8654, "mean_token_accuracy": 0.8314353656768799, "num_tokens": 2444868.0, "step": 500 }, { "epoch": 2.0, "eval_entropy": 0.874303799122572, "eval_loss": 0.9136784076690674, "eval_mean_token_accuracy": 0.8255054354667664, "eval_num_tokens": 2444868.0, "eval_runtime": 65.7882, "eval_samples_per_second": 30.401, "eval_steps_per_second": 0.486, "step": 500 }, { "entropy": 0.8443045258522034, "epoch": 2.1, "grad_norm": 0.3855106830596924, "learning_rate": 4.4017036761779787e-05, "loss": 0.8248, "mean_token_accuracy": 0.8369424676895142, "num_tokens": 2567342.0, "step": 525 }, { "epoch": 2.1, "eval_entropy": 0.8267646413296461, "eval_loss": 0.9248229265213013, "eval_mean_token_accuracy": 0.8247568681836128, "eval_num_tokens": 2567342.0, "eval_runtime": 66.4418, "eval_samples_per_second": 30.102, "eval_steps_per_second": 0.482, "step": 525 }, { "entropy": 0.8175929665565491, "epoch": 2.2, "grad_norm": 0.3969684839248657, "learning_rate": 3.540913429038407e-05, "loss": 0.8159, "mean_token_accuracy": 0.8389404201507569, "num_tokens": 2689296.0, "step": 550 }, { "epoch": 2.2, "eval_entropy": 0.8400543294847012, "eval_loss": 0.9235786199569702, "eval_mean_token_accuracy": 0.8250540643930435, "eval_num_tokens": 2689296.0, "eval_runtime": 66.682, "eval_samples_per_second": 29.993, "eval_steps_per_second": 0.48, "step": 550 }, { "entropy": 0.8221414375305176, "epoch": 2.3, "grad_norm": 0.4103279709815979, "learning_rate": 2.7554343901049294e-05, "loss": 0.8108, "mean_token_accuracy": 0.8397244310379028, "num_tokens": 2811102.0, "step": 575 }, { "epoch": 2.3, "eval_entropy": 0.8320044614374638, "eval_loss": 0.9250128269195557, "eval_mean_token_accuracy": 0.8249792046844959, "eval_num_tokens": 2811102.0, "eval_runtime": 68.2399, "eval_samples_per_second": 29.308, "eval_steps_per_second": 0.469, "step": 575 }, { "entropy": 0.8239590454101563, "epoch": 2.4, "grad_norm": 0.4046800434589386, "learning_rate": 2.0544250332256276e-05, "loss": 0.8193, "mean_token_accuracy": 0.8385440015792847, "num_tokens": 2933467.0, "step": 600 }, { "epoch": 2.4, "eval_entropy": 0.8278196454048157, "eval_loss": 0.9243917465209961, "eval_mean_token_accuracy": 0.82486542314291, "eval_num_tokens": 2933467.0, "eval_runtime": 68.1284, "eval_samples_per_second": 29.356, "eval_steps_per_second": 0.47, "step": 600 }, { "entropy": 0.8189333271980286, "epoch": 2.5, "grad_norm": 0.38442716002464294, "learning_rate": 1.4460589385267842e-05, "loss": 0.8178, "mean_token_accuracy": 0.8380931878089904, "num_tokens": 3055276.0, "step": 625 }, { "epoch": 2.5, "eval_entropy": 0.8392351847141981, "eval_loss": 0.9240784645080566, "eval_mean_token_accuracy": 0.8248794414103031, "eval_num_tokens": 3055276.0, "eval_runtime": 68.2247, "eval_samples_per_second": 29.315, "eval_steps_per_second": 0.469, "step": 625 }, { "entropy": 0.8184775829315185, "epoch": 2.6, "grad_norm": 0.42365893721580505, "learning_rate": 9.374294906720082e-06, "loss": 0.8096, "mean_token_accuracy": 0.8411558413505554, "num_tokens": 3176654.0, "step": 650 }, { "epoch": 2.6, "eval_entropy": 0.8347331862896681, "eval_loss": 0.9245245456695557, "eval_mean_token_accuracy": 0.8248291350901127, "eval_num_tokens": 3176654.0, "eval_runtime": 67.8139, "eval_samples_per_second": 29.492, "eval_steps_per_second": 0.472, "step": 650 }, { "entropy": 0.8422538375854492, "epoch": 2.7, "grad_norm": 0.410686194896698, "learning_rate": 5.344671719092664e-06, "loss": 0.8453, "mean_token_accuracy": 0.832388162612915, "num_tokens": 3299985.0, "step": 675 }, { "epoch": 2.7, "eval_entropy": 0.8364100754261017, "eval_loss": 0.9244425296783447, "eval_mean_token_accuracy": 0.8246717043220997, "eval_num_tokens": 3299985.0, "eval_runtime": 67.9993, "eval_samples_per_second": 29.412, "eval_steps_per_second": 0.471, "step": 675 }, { "entropy": 0.8317058253288269, "epoch": 2.8, "grad_norm": 0.45394909381866455, "learning_rate": 2.418704142465722e-06, "loss": 0.8301, "mean_token_accuracy": 0.8355569648742676, "num_tokens": 3422725.0, "step": 700 }, { "epoch": 2.8, "eval_entropy": 0.8373962622135878, "eval_loss": 0.924216091632843, "eval_mean_token_accuracy": 0.8247099295258522, "eval_num_tokens": 3422725.0, "eval_runtime": 67.6852, "eval_samples_per_second": 29.549, "eval_steps_per_second": 0.473, "step": 700 }, { "entropy": 0.8333729934692383, "epoch": 2.9, "grad_norm": 0.4409957230091095, "learning_rate": 6.305081700136328e-07, "loss": 0.8241, "mean_token_accuracy": 0.83688227891922, "num_tokens": 3545132.0, "step": 725 }, { "epoch": 2.9, "eval_entropy": 0.8377762027084827, "eval_loss": 0.924144446849823, "eval_mean_token_accuracy": 0.824684476479888, "eval_num_tokens": 3545132.0, "eval_runtime": 68.0427, "eval_samples_per_second": 29.393, "eval_steps_per_second": 0.47, "step": 725 }, { "entropy": 0.8257460308074951, "epoch": 3.0, "grad_norm": 0.40077298879623413, "learning_rate": 9.336847214269639e-10, "loss": 0.8146, "mean_token_accuracy": 0.8383808851242065, "num_tokens": 3667302.0, "step": 750 }, { "epoch": 3.0, "eval_entropy": 0.8377156592905521, "eval_loss": 0.9241430759429932, "eval_mean_token_accuracy": 0.8246781621128321, "eval_num_tokens": 3667302.0, "eval_runtime": 67.5858, "eval_samples_per_second": 29.592, "eval_steps_per_second": 0.473, "step": 750 } ], "logging_steps": 25, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.177111226528563e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }