{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.48, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 34.44804382324219, "learning_rate": 1e-05, "loss": 13.0101, "mean_token_accuracy": 0.4696590006351471, "step": 1 }, { "epoch": 0.016, "grad_norm": 30.779788970947266, "learning_rate": 2e-05, "loss": 12.3851, "mean_token_accuracy": 0.47303473204374313, "step": 2 }, { "epoch": 0.024, "grad_norm": 29.67559242248535, "learning_rate": 3e-05, "loss": 12.3488, "mean_token_accuracy": 0.49709559231996536, "step": 3 }, { "epoch": 0.032, "grad_norm": 26.862010955810547, "learning_rate": 4e-05, "loss": 11.6596, "mean_token_accuracy": 0.5584611147642136, "step": 4 }, { "epoch": 0.04, "grad_norm": 22.10072135925293, "learning_rate": 5e-05, "loss": 10.1384, "mean_token_accuracy": 0.5924926251173019, "step": 5 }, { "epoch": 0.048, "grad_norm": 20.171361923217773, "learning_rate": 4.909090909090909e-05, "loss": 9.5421, "mean_token_accuracy": 0.5888276249170303, "step": 6 }, { "epoch": 0.056, "grad_norm": 16.452842712402344, "learning_rate": 4.8181818181818186e-05, "loss": 8.4344, "mean_token_accuracy": 0.632336363196373, "step": 7 }, { "epoch": 0.064, "grad_norm": 12.617485046386719, "learning_rate": 4.7272727272727275e-05, "loss": 7.7811, "mean_token_accuracy": 0.6625082790851593, "step": 8 }, { "epoch": 0.072, "grad_norm": 11.546710968017578, "learning_rate": 4.636363636363636e-05, "loss": 7.5684, "mean_token_accuracy": 0.6712179630994797, "step": 9 }, { "epoch": 0.08, "grad_norm": 9.277382850646973, "learning_rate": 4.545454545454546e-05, "loss": 7.5668, "mean_token_accuracy": 0.6923484355211258, "step": 10 }, { "epoch": 0.088, "grad_norm": 9.137121200561523, "learning_rate": 4.454545454545455e-05, "loss": 7.2181, "mean_token_accuracy": 0.6936477273702621, "step": 11 }, { "epoch": 0.096, "grad_norm": 9.180350303649902, "learning_rate": 4.3636363636363636e-05, "loss": 6.8036, "mean_token_accuracy": 0.713180348277092, "step": 12 }, { "epoch": 0.104, "grad_norm": 8.957921028137207, "learning_rate": 4.2727272727272724e-05, "loss": 6.8041, "mean_token_accuracy": 0.7040871828794479, "step": 13 }, { "epoch": 0.112, "grad_norm": 8.476972579956055, "learning_rate": 4.181818181818182e-05, "loss": 6.6941, "mean_token_accuracy": 0.6894638538360596, "step": 14 }, { "epoch": 0.12, "grad_norm": 7.86570930480957, "learning_rate": 4.0909090909090915e-05, "loss": 6.1827, "mean_token_accuracy": 0.7322177290916443, "step": 15 }, { "epoch": 0.128, "grad_norm": 10.238632202148438, "learning_rate": 4e-05, "loss": 6.4157, "mean_token_accuracy": 0.706741139292717, "step": 16 }, { "epoch": 0.136, "grad_norm": 9.224650382995605, "learning_rate": 3.909090909090909e-05, "loss": 6.0898, "mean_token_accuracy": 0.7200081646442413, "step": 17 }, { "epoch": 0.144, "grad_norm": 9.348233222961426, "learning_rate": 3.818181818181819e-05, "loss": 6.2038, "mean_token_accuracy": 0.7200899869203568, "step": 18 }, { "epoch": 0.152, "grad_norm": 7.475156307220459, "learning_rate": 3.7272727272727276e-05, "loss": 6.4592, "mean_token_accuracy": 0.715986579656601, "step": 19 }, { "epoch": 0.16, "grad_norm": 7.94195556640625, "learning_rate": 3.6363636363636364e-05, "loss": 6.2736, "mean_token_accuracy": 0.7263044863939285, "step": 20 }, { "epoch": 0.168, "grad_norm": 8.899458885192871, "learning_rate": 3.545454545454546e-05, "loss": 6.0882, "mean_token_accuracy": 0.7329658418893814, "step": 21 }, { "epoch": 0.176, "grad_norm": 7.235325813293457, "learning_rate": 3.454545454545455e-05, "loss": 5.2582, "mean_token_accuracy": 0.764240637421608, "step": 22 }, { "epoch": 0.184, "grad_norm": 7.569215774536133, "learning_rate": 3.3636363636363636e-05, "loss": 6.3141, "mean_token_accuracy": 0.7120286226272583, "step": 23 }, { "epoch": 0.192, "grad_norm": 6.983068943023682, "learning_rate": 3.272727272727273e-05, "loss": 5.6318, "mean_token_accuracy": 0.7469822317361832, "step": 24 }, { "epoch": 0.2, "grad_norm": 8.990687370300293, "learning_rate": 3.181818181818182e-05, "loss": 5.3612, "mean_token_accuracy": 0.7554384022951126, "step": 25 }, { "epoch": 0.208, "grad_norm": 7.9287848472595215, "learning_rate": 3.090909090909091e-05, "loss": 5.4804, "mean_token_accuracy": 0.7457730770111084, "step": 26 }, { "epoch": 0.216, "grad_norm": 9.334953308105469, "learning_rate": 3e-05, "loss": 5.7448, "mean_token_accuracy": 0.7454082369804382, "step": 27 }, { "epoch": 0.224, "grad_norm": 8.032136917114258, "learning_rate": 2.909090909090909e-05, "loss": 5.5671, "mean_token_accuracy": 0.7557599395513535, "step": 28 }, { "epoch": 0.232, "grad_norm": 7.999697208404541, "learning_rate": 2.818181818181818e-05, "loss": 5.7043, "mean_token_accuracy": 0.7504953891038895, "step": 29 }, { "epoch": 0.24, "grad_norm": 7.705344200134277, "learning_rate": 2.7272727272727273e-05, "loss": 5.8932, "mean_token_accuracy": 0.7292879223823547, "step": 30 }, { "epoch": 0.248, "grad_norm": 7.740062236785889, "learning_rate": 2.636363636363636e-05, "loss": 6.1266, "mean_token_accuracy": 0.7393394261598587, "step": 31 }, { "epoch": 0.256, "grad_norm": 7.92697286605835, "learning_rate": 2.5454545454545454e-05, "loss": 5.2805, "mean_token_accuracy": 0.764379158616066, "step": 32 }, { "epoch": 0.264, "grad_norm": 6.873337268829346, "learning_rate": 2.4545454545454545e-05, "loss": 5.1379, "mean_token_accuracy": 0.7647270262241364, "step": 33 }, { "epoch": 0.272, "grad_norm": 6.499383449554443, "learning_rate": 2.3636363636363637e-05, "loss": 5.4832, "mean_token_accuracy": 0.7581320852041245, "step": 34 }, { "epoch": 0.28, "grad_norm": 7.361469745635986, "learning_rate": 2.272727272727273e-05, "loss": 5.821, "mean_token_accuracy": 0.7482968121767044, "step": 35 }, { "epoch": 0.288, "grad_norm": 6.693004131317139, "learning_rate": 2.1818181818181818e-05, "loss": 5.6741, "mean_token_accuracy": 0.7488989531993866, "step": 36 }, { "epoch": 0.296, "grad_norm": 6.434469223022461, "learning_rate": 2.090909090909091e-05, "loss": 4.9946, "mean_token_accuracy": 0.7904711812734604, "step": 37 }, { "epoch": 0.304, "grad_norm": 7.098775386810303, "learning_rate": 2e-05, "loss": 5.5143, "mean_token_accuracy": 0.7486915439367294, "step": 38 }, { "epoch": 0.312, "grad_norm": 7.341176509857178, "learning_rate": 1.9090909090909094e-05, "loss": 5.7145, "mean_token_accuracy": 0.7340664714574814, "step": 39 }, { "epoch": 0.32, "grad_norm": 6.959894180297852, "learning_rate": 1.8181818181818182e-05, "loss": 5.6219, "mean_token_accuracy": 0.7580364942550659, "step": 40 }, { "epoch": 0.328, "grad_norm": 7.867330074310303, "learning_rate": 1.7272727272727274e-05, "loss": 5.1341, "mean_token_accuracy": 0.7646168619394302, "step": 41 }, { "epoch": 0.336, "grad_norm": 6.413680076599121, "learning_rate": 1.6363636363636366e-05, "loss": 5.2441, "mean_token_accuracy": 0.7653319537639618, "step": 42 }, { "epoch": 0.344, "grad_norm": 6.393170356750488, "learning_rate": 1.5454545454545454e-05, "loss": 4.8617, "mean_token_accuracy": 0.7705393433570862, "step": 43 }, { "epoch": 0.352, "grad_norm": 7.7843523025512695, "learning_rate": 1.4545454545454545e-05, "loss": 4.9986, "mean_token_accuracy": 0.7739708423614502, "step": 44 }, { "epoch": 0.36, "grad_norm": 6.432173728942871, "learning_rate": 1.3636363636363637e-05, "loss": 4.998, "mean_token_accuracy": 0.7664141207933426, "step": 45 }, { "epoch": 0.368, "grad_norm": 8.786981582641602, "learning_rate": 1.2727272727272727e-05, "loss": 4.3787, "mean_token_accuracy": 0.794494241476059, "step": 46 }, { "epoch": 0.376, "grad_norm": 7.160989284515381, "learning_rate": 1.1818181818181819e-05, "loss": 5.0146, "mean_token_accuracy": 0.7729983627796173, "step": 47 }, { "epoch": 0.384, "grad_norm": 7.532926559448242, "learning_rate": 1.0909090909090909e-05, "loss": 5.3312, "mean_token_accuracy": 0.757838249206543, "step": 48 }, { "epoch": 0.392, "grad_norm": 7.59615421295166, "learning_rate": 1e-05, "loss": 5.4593, "mean_token_accuracy": 0.7495291978120804, "step": 49 }, { "epoch": 0.4, "grad_norm": 7.142901420593262, "learning_rate": 9.090909090909091e-06, "loss": 5.8141, "mean_token_accuracy": 0.7420907914638519, "step": 50 }, { "epoch": 0.408, "grad_norm": 7.876557350158691, "learning_rate": 8.181818181818183e-06, "loss": 4.959, "mean_token_accuracy": 0.7740796357393265, "step": 51 }, { "epoch": 0.416, "grad_norm": 8.242363929748535, "learning_rate": 7.272727272727272e-06, "loss": 4.3558, "mean_token_accuracy": 0.7940836101770401, "step": 52 }, { "epoch": 0.424, "grad_norm": 7.2721452713012695, "learning_rate": 6.363636363636363e-06, "loss": 5.3993, "mean_token_accuracy": 0.750448003411293, "step": 53 }, { "epoch": 0.432, "grad_norm": 8.326936721801758, "learning_rate": 5.4545454545454545e-06, "loss": 5.2227, "mean_token_accuracy": 0.7582048922777176, "step": 54 }, { "epoch": 0.44, "grad_norm": 7.052926063537598, "learning_rate": 4.5454545454545455e-06, "loss": 5.415, "mean_token_accuracy": 0.7697479426860809, "step": 55 }, { "epoch": 0.448, "grad_norm": 7.102321147918701, "learning_rate": 3.636363636363636e-06, "loss": 4.7006, "mean_token_accuracy": 0.7812397330999374, "step": 56 }, { "epoch": 0.456, "grad_norm": 6.185770034790039, "learning_rate": 2.7272727272727272e-06, "loss": 5.3694, "mean_token_accuracy": 0.7675666660070419, "step": 57 }, { "epoch": 0.464, "grad_norm": 6.738717079162598, "learning_rate": 1.818181818181818e-06, "loss": 4.4993, "mean_token_accuracy": 0.7917324602603912, "step": 58 }, { "epoch": 0.472, "grad_norm": 7.1644697189331055, "learning_rate": 9.09090909090909e-07, "loss": 5.039, "mean_token_accuracy": 0.7702366560697556, "step": 59 }, { "epoch": 0.48, "grad_norm": 6.943840503692627, "learning_rate": 0.0, "loss": 4.9739, "mean_token_accuracy": 0.7831375449895859, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 927896961024000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }