{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 333, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.7490966856479644, "epoch": 0.09009009009009009, "grad_norm": 4.318613052368164, "learning_rate": 9e-05, "loss": 6.8516, "mean_token_accuracy": 0.31817560344934465, "num_tokens": 5273.0, "step": 10 }, { "entropy": 2.358074116706848, "epoch": 0.18018018018018017, "grad_norm": 2.591895341873169, "learning_rate": 0.00019, "loss": 4.7791, "mean_token_accuracy": 0.4064402043819427, "num_tokens": 10323.0, "step": 20 }, { "entropy": 2.7442060232162477, "epoch": 0.2702702702702703, "grad_norm": 2.085822105407715, "learning_rate": 0.00019424920127795528, "loss": 2.528, "mean_token_accuracy": 0.6114133253693581, "num_tokens": 15657.0, "step": 30 }, { "entropy": 1.6624038934707641, "epoch": 0.36036036036036034, "grad_norm": 1.898437261581421, "learning_rate": 0.0001878594249201278, "loss": 1.5967, "mean_token_accuracy": 0.75713110268116, "num_tokens": 20566.0, "step": 40 }, { "entropy": 1.3454128205776215, "epoch": 0.45045045045045046, "grad_norm": 2.2532074451446533, "learning_rate": 0.00018146964856230032, "loss": 1.3224, "mean_token_accuracy": 0.7765294492244721, "num_tokens": 25625.0, "step": 50 }, { "entropy": 1.1165625154972076, "epoch": 0.5405405405405406, "grad_norm": 2.300675868988037, "learning_rate": 0.00017507987220447287, "loss": 1.0734, "mean_token_accuracy": 0.803844365477562, "num_tokens": 30624.0, "step": 60 }, { "entropy": 0.975060197710991, "epoch": 0.6306306306306306, "grad_norm": 1.954689383506775, "learning_rate": 0.00016869009584664536, "loss": 0.9733, "mean_token_accuracy": 0.818100169301033, "num_tokens": 35746.0, "step": 70 }, { "entropy": 0.8530668556690216, "epoch": 0.7207207207207207, "grad_norm": 2.083409309387207, "learning_rate": 0.0001623003194888179, "loss": 0.8131, "mean_token_accuracy": 0.8403199791908265, "num_tokens": 41031.0, "step": 80 }, { "entropy": 0.8242652952671051, "epoch": 0.8108108108108109, "grad_norm": 2.1543054580688477, "learning_rate": 0.00015591054313099042, "loss": 0.7285, "mean_token_accuracy": 0.8522787302732467, "num_tokens": 45910.0, "step": 90 }, { "entropy": 0.7216496258974076, "epoch": 0.9009009009009009, "grad_norm": 1.8068711757659912, "learning_rate": 0.00014952076677316297, "loss": 0.632, "mean_token_accuracy": 0.8595692455768585, "num_tokens": 51027.0, "step": 100 }, { "entropy": 0.6904670834541321, "epoch": 0.990990990990991, "grad_norm": 2.6211395263671875, "learning_rate": 0.00014313099041533546, "loss": 0.6219, "mean_token_accuracy": 0.8704036623239517, "num_tokens": 55880.0, "step": 110 }, { "entropy": 0.5557049199938774, "epoch": 1.0810810810810811, "grad_norm": 2.952716112136841, "learning_rate": 0.000136741214057508, "loss": 0.5041, "mean_token_accuracy": 0.8821422606706619, "num_tokens": 60665.0, "step": 120 }, { "entropy": 0.4335968837141991, "epoch": 1.1711711711711712, "grad_norm": 1.972198724746704, "learning_rate": 0.00013035143769968052, "loss": 0.4319, "mean_token_accuracy": 0.8849613904953003, "num_tokens": 65647.0, "step": 130 }, { "entropy": 0.44969954043626786, "epoch": 1.2612612612612613, "grad_norm": 1.9271284341812134, "learning_rate": 0.00012396166134185304, "loss": 0.4252, "mean_token_accuracy": 0.8931547611951828, "num_tokens": 70510.0, "step": 140 }, { "entropy": 0.40625376254320145, "epoch": 1.3513513513513513, "grad_norm": 2.1082940101623535, "learning_rate": 0.00011757188498402556, "loss": 0.4017, "mean_token_accuracy": 0.8913865387439728, "num_tokens": 75632.0, "step": 150 }, { "entropy": 0.45655421912670135, "epoch": 1.4414414414414414, "grad_norm": 1.5840785503387451, "learning_rate": 0.00011118210862619809, "loss": 0.4614, "mean_token_accuracy": 0.8811476260423661, "num_tokens": 80976.0, "step": 160 }, { "entropy": 0.42031795233488084, "epoch": 1.5315315315315314, "grad_norm": 1.667891502380371, "learning_rate": 0.00010479233226837062, "loss": 0.395, "mean_token_accuracy": 0.894879949092865, "num_tokens": 86111.0, "step": 170 }, { "entropy": 0.42292200922966006, "epoch": 1.6216216216216215, "grad_norm": 1.7975671291351318, "learning_rate": 9.840255591054314e-05, "loss": 0.4021, "mean_token_accuracy": 0.8952047973871231, "num_tokens": 91226.0, "step": 180 }, { "entropy": 0.3855786919593811, "epoch": 1.7117117117117115, "grad_norm": 2.244504690170288, "learning_rate": 9.201277955271566e-05, "loss": 0.387, "mean_token_accuracy": 0.8959153234958649, "num_tokens": 96405.0, "step": 190 }, { "entropy": 0.4205052852630615, "epoch": 1.8018018018018018, "grad_norm": 1.7068063020706177, "learning_rate": 8.562300319488819e-05, "loss": 0.419, "mean_token_accuracy": 0.888062196969986, "num_tokens": 101805.0, "step": 200 }, { "entropy": 0.37721182107925416, "epoch": 1.8918918918918919, "grad_norm": 1.7175687551498413, "learning_rate": 7.923322683706071e-05, "loss": 0.342, "mean_token_accuracy": 0.9077515214681625, "num_tokens": 106683.0, "step": 210 }, { "entropy": 0.33771649897098543, "epoch": 1.981981981981982, "grad_norm": 1.5167031288146973, "learning_rate": 7.284345047923323e-05, "loss": 0.3184, "mean_token_accuracy": 0.9064931035041809, "num_tokens": 111616.0, "step": 220 }, { "entropy": 0.3555280163884163, "epoch": 2.0720720720720722, "grad_norm": 1.660430908203125, "learning_rate": 6.645367412140575e-05, "loss": 0.3216, "mean_token_accuracy": 0.9094264894723892, "num_tokens": 116395.0, "step": 230 }, { "entropy": 0.33757986277341845, "epoch": 2.1621621621621623, "grad_norm": 1.3866914510726929, "learning_rate": 6.006389776357828e-05, "loss": 0.3271, "mean_token_accuracy": 0.9032522082328797, "num_tokens": 121543.0, "step": 240 }, { "entropy": 0.3460941515862942, "epoch": 2.2522522522522523, "grad_norm": 1.5430920124053955, "learning_rate": 5.36741214057508e-05, "loss": 0.3345, "mean_token_accuracy": 0.9088279217481613, "num_tokens": 126607.0, "step": 250 }, { "entropy": 0.30904691815376284, "epoch": 2.3423423423423424, "grad_norm": 1.9197715520858765, "learning_rate": 4.728434504792332e-05, "loss": 0.2831, "mean_token_accuracy": 0.9148383587598801, "num_tokens": 131589.0, "step": 260 }, { "entropy": 0.29833986386656763, "epoch": 2.4324324324324325, "grad_norm": 2.148452043533325, "learning_rate": 4.089456869009585e-05, "loss": 0.2785, "mean_token_accuracy": 0.9198061287403106, "num_tokens": 136431.0, "step": 270 }, { "entropy": 0.30950267389416697, "epoch": 2.5225225225225225, "grad_norm": 1.3571665287017822, "learning_rate": 3.450479233226837e-05, "loss": 0.2827, "mean_token_accuracy": 0.9169433653354645, "num_tokens": 141739.0, "step": 280 }, { "entropy": 0.27893091589212415, "epoch": 2.6126126126126126, "grad_norm": 1.4515094757080078, "learning_rate": 2.8115015974440894e-05, "loss": 0.2708, "mean_token_accuracy": 0.9156843811273575, "num_tokens": 146814.0, "step": 290 }, { "entropy": 0.3063665457069874, "epoch": 2.7027027027027026, "grad_norm": 2.0274174213409424, "learning_rate": 2.172523961661342e-05, "loss": 0.2873, "mean_token_accuracy": 0.9122885495424271, "num_tokens": 152037.0, "step": 300 }, { "entropy": 0.3225431203842163, "epoch": 2.7927927927927927, "grad_norm": 1.8013384342193604, "learning_rate": 1.533546325878594e-05, "loss": 0.2961, "mean_token_accuracy": 0.9138898193836212, "num_tokens": 157245.0, "step": 310 }, { "entropy": 0.2981728859245777, "epoch": 2.8828828828828827, "grad_norm": 1.7200424671173096, "learning_rate": 8.945686900958466e-06, "loss": 0.2758, "mean_token_accuracy": 0.9152151584625244, "num_tokens": 162185.0, "step": 320 }, { "entropy": 0.2874752961099148, "epoch": 2.972972972972973, "grad_norm": 1.8970165252685547, "learning_rate": 2.5559105431309904e-06, "loss": 0.2718, "mean_token_accuracy": 0.9175638914108276, "num_tokens": 167004.0, "step": 330 } ], "logging_steps": 10, "max_steps": 333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1848157889673216.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }