{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.48,
  "eval_steps": 500,
  "global_step": 60,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 34.44804382324219,
      "learning_rate": 1e-05,
      "loss": 13.0101,
      "mean_token_accuracy": 0.4696590006351471,
      "step": 1
    },
    {
      "epoch": 0.016,
      "grad_norm": 30.779788970947266,
      "learning_rate": 2e-05,
      "loss": 12.3851,
      "mean_token_accuracy": 0.47303473204374313,
      "step": 2
    },
    {
      "epoch": 0.024,
      "grad_norm": 29.67559242248535,
      "learning_rate": 3e-05,
      "loss": 12.3488,
      "mean_token_accuracy": 0.49709559231996536,
      "step": 3
    },
    {
      "epoch": 0.032,
      "grad_norm": 26.862010955810547,
      "learning_rate": 4e-05,
      "loss": 11.6596,
      "mean_token_accuracy": 0.5584611147642136,
      "step": 4
    },
    {
      "epoch": 0.04,
      "grad_norm": 22.10072135925293,
      "learning_rate": 5e-05,
      "loss": 10.1384,
      "mean_token_accuracy": 0.5924926251173019,
      "step": 5
    },
    {
      "epoch": 0.048,
      "grad_norm": 20.171361923217773,
      "learning_rate": 4.909090909090909e-05,
      "loss": 9.5421,
      "mean_token_accuracy": 0.5888276249170303,
      "step": 6
    },
    {
      "epoch": 0.056,
      "grad_norm": 16.452842712402344,
      "learning_rate": 4.8181818181818186e-05,
      "loss": 8.4344,
      "mean_token_accuracy": 0.632336363196373,
      "step": 7
    },
    {
      "epoch": 0.064,
      "grad_norm": 12.617485046386719,
      "learning_rate": 4.7272727272727275e-05,
      "loss": 7.7811,
      "mean_token_accuracy": 0.6625082790851593,
      "step": 8
    },
    {
      "epoch": 0.072,
      "grad_norm": 11.546710968017578,
      "learning_rate": 4.636363636363636e-05,
      "loss": 7.5684,
      "mean_token_accuracy": 0.6712179630994797,
      "step": 9
    },
    {
      "epoch": 0.08,
      "grad_norm": 9.277382850646973,
      "learning_rate": 4.545454545454546e-05,
      "loss": 7.5668,
      "mean_token_accuracy": 0.6923484355211258,
      "step": 10
    },
    {
      "epoch": 0.088,
      "grad_norm": 9.137121200561523,
      "learning_rate": 4.454545454545455e-05,
      "loss": 7.2181,
      "mean_token_accuracy": 0.6936477273702621,
      "step": 11
    },
    {
      "epoch": 0.096,
      "grad_norm": 9.180350303649902,
      "learning_rate": 4.3636363636363636e-05,
      "loss": 6.8036,
      "mean_token_accuracy": 0.713180348277092,
      "step": 12
    },
    {
      "epoch": 0.104,
      "grad_norm": 8.957921028137207,
      "learning_rate": 4.2727272727272724e-05,
      "loss": 6.8041,
      "mean_token_accuracy": 0.7040871828794479,
      "step": 13
    },
    {
      "epoch": 0.112,
      "grad_norm": 8.476972579956055,
      "learning_rate": 4.181818181818182e-05,
      "loss": 6.6941,
      "mean_token_accuracy": 0.6894638538360596,
      "step": 14
    },
    {
      "epoch": 0.12,
      "grad_norm": 7.86570930480957,
      "learning_rate": 4.0909090909090915e-05,
      "loss": 6.1827,
      "mean_token_accuracy": 0.7322177290916443,
      "step": 15
    },
    {
      "epoch": 0.128,
      "grad_norm": 10.238632202148438,
      "learning_rate": 4e-05,
      "loss": 6.4157,
      "mean_token_accuracy": 0.706741139292717,
      "step": 16
    },
    {
      "epoch": 0.136,
      "grad_norm": 9.224650382995605,
      "learning_rate": 3.909090909090909e-05,
      "loss": 6.0898,
      "mean_token_accuracy": 0.7200081646442413,
      "step": 17
    },
    {
      "epoch": 0.144,
      "grad_norm": 9.348233222961426,
      "learning_rate": 3.818181818181819e-05,
      "loss": 6.2038,
      "mean_token_accuracy": 0.7200899869203568,
      "step": 18
    },
    {
      "epoch": 0.152,
      "grad_norm": 7.475156307220459,
      "learning_rate": 3.7272727272727276e-05,
      "loss": 6.4592,
      "mean_token_accuracy": 0.715986579656601,
      "step": 19
    },
    {
      "epoch": 0.16,
      "grad_norm": 7.94195556640625,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 6.2736,
      "mean_token_accuracy": 0.7263044863939285,
      "step": 20
    },
    {
      "epoch": 0.168,
      "grad_norm": 8.899458885192871,
      "learning_rate": 3.545454545454546e-05,
      "loss": 6.0882,
      "mean_token_accuracy": 0.7329658418893814,
      "step": 21
    },
    {
      "epoch": 0.176,
      "grad_norm": 7.235325813293457,
      "learning_rate": 3.454545454545455e-05,
      "loss": 5.2582,
      "mean_token_accuracy": 0.764240637421608,
      "step": 22
    },
    {
      "epoch": 0.184,
      "grad_norm": 7.569215774536133,
      "learning_rate": 3.3636363636363636e-05,
      "loss": 6.3141,
      "mean_token_accuracy": 0.7120286226272583,
      "step": 23
    },
    {
      "epoch": 0.192,
      "grad_norm": 6.983068943023682,
      "learning_rate": 3.272727272727273e-05,
      "loss": 5.6318,
      "mean_token_accuracy": 0.7469822317361832,
      "step": 24
    },
    {
      "epoch": 0.2,
      "grad_norm": 8.990687370300293,
      "learning_rate": 3.181818181818182e-05,
      "loss": 5.3612,
      "mean_token_accuracy": 0.7554384022951126,
      "step": 25
    },
    {
      "epoch": 0.208,
      "grad_norm": 7.9287848472595215,
      "learning_rate": 3.090909090909091e-05,
      "loss": 5.4804,
      "mean_token_accuracy": 0.7457730770111084,
      "step": 26
    },
    {
      "epoch": 0.216,
      "grad_norm": 9.334953308105469,
      "learning_rate": 3e-05,
      "loss": 5.7448,
      "mean_token_accuracy": 0.7454082369804382,
      "step": 27
    },
    {
      "epoch": 0.224,
      "grad_norm": 8.032136917114258,
      "learning_rate": 2.909090909090909e-05,
      "loss": 5.5671,
      "mean_token_accuracy": 0.7557599395513535,
      "step": 28
    },
    {
      "epoch": 0.232,
      "grad_norm": 7.999697208404541,
      "learning_rate": 2.818181818181818e-05,
      "loss": 5.7043,
      "mean_token_accuracy": 0.7504953891038895,
      "step": 29
    },
    {
      "epoch": 0.24,
      "grad_norm": 7.705344200134277,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 5.8932,
      "mean_token_accuracy": 0.7292879223823547,
      "step": 30
    },
    {
      "epoch": 0.248,
      "grad_norm": 7.740062236785889,
      "learning_rate": 2.636363636363636e-05,
      "loss": 6.1266,
      "mean_token_accuracy": 0.7393394261598587,
      "step": 31
    },
    {
      "epoch": 0.256,
      "grad_norm": 7.92697286605835,
      "learning_rate": 2.5454545454545454e-05,
      "loss": 5.2805,
      "mean_token_accuracy": 0.764379158616066,
      "step": 32
    },
    {
      "epoch": 0.264,
      "grad_norm": 6.873337268829346,
      "learning_rate": 2.4545454545454545e-05,
      "loss": 5.1379,
      "mean_token_accuracy": 0.7647270262241364,
      "step": 33
    },
    {
      "epoch": 0.272,
      "grad_norm": 6.499383449554443,
      "learning_rate": 2.3636363636363637e-05,
      "loss": 5.4832,
      "mean_token_accuracy": 0.7581320852041245,
      "step": 34
    },
    {
      "epoch": 0.28,
      "grad_norm": 7.361469745635986,
      "learning_rate": 2.272727272727273e-05,
      "loss": 5.821,
      "mean_token_accuracy": 0.7482968121767044,
      "step": 35
    },
    {
      "epoch": 0.288,
      "grad_norm": 6.693004131317139,
      "learning_rate": 2.1818181818181818e-05,
      "loss": 5.6741,
      "mean_token_accuracy": 0.7488989531993866,
      "step": 36
    },
    {
      "epoch": 0.296,
      "grad_norm": 6.434469223022461,
      "learning_rate": 2.090909090909091e-05,
      "loss": 4.9946,
      "mean_token_accuracy": 0.7904711812734604,
      "step": 37
    },
    {
      "epoch": 0.304,
      "grad_norm": 7.098775386810303,
      "learning_rate": 2e-05,
      "loss": 5.5143,
      "mean_token_accuracy": 0.7486915439367294,
      "step": 38
    },
    {
      "epoch": 0.312,
      "grad_norm": 7.341176509857178,
      "learning_rate": 1.9090909090909094e-05,
      "loss": 5.7145,
      "mean_token_accuracy": 0.7340664714574814,
      "step": 39
    },
    {
      "epoch": 0.32,
      "grad_norm": 6.959894180297852,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 5.6219,
      "mean_token_accuracy": 0.7580364942550659,
      "step": 40
    },
    {
      "epoch": 0.328,
      "grad_norm": 7.867330074310303,
      "learning_rate": 1.7272727272727274e-05,
      "loss": 5.1341,
      "mean_token_accuracy": 0.7646168619394302,
      "step": 41
    },
    {
      "epoch": 0.336,
      "grad_norm": 6.413680076599121,
      "learning_rate": 1.6363636363636366e-05,
      "loss": 5.2441,
      "mean_token_accuracy": 0.7653319537639618,
      "step": 42
    },
    {
      "epoch": 0.344,
      "grad_norm": 6.393170356750488,
      "learning_rate": 1.5454545454545454e-05,
      "loss": 4.8617,
      "mean_token_accuracy": 0.7705393433570862,
      "step": 43
    },
    {
      "epoch": 0.352,
      "grad_norm": 7.7843523025512695,
      "learning_rate": 1.4545454545454545e-05,
      "loss": 4.9986,
      "mean_token_accuracy": 0.7739708423614502,
      "step": 44
    },
    {
      "epoch": 0.36,
      "grad_norm": 6.432173728942871,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 4.998,
      "mean_token_accuracy": 0.7664141207933426,
      "step": 45
    },
    {
      "epoch": 0.368,
      "grad_norm": 8.786981582641602,
      "learning_rate": 1.2727272727272727e-05,
      "loss": 4.3787,
      "mean_token_accuracy": 0.794494241476059,
      "step": 46
    },
    {
      "epoch": 0.376,
      "grad_norm": 7.160989284515381,
      "learning_rate": 1.1818181818181819e-05,
      "loss": 5.0146,
      "mean_token_accuracy": 0.7729983627796173,
      "step": 47
    },
    {
      "epoch": 0.384,
      "grad_norm": 7.532926559448242,
      "learning_rate": 1.0909090909090909e-05,
      "loss": 5.3312,
      "mean_token_accuracy": 0.757838249206543,
      "step": 48
    },
    {
      "epoch": 0.392,
      "grad_norm": 7.59615421295166,
      "learning_rate": 1e-05,
      "loss": 5.4593,
      "mean_token_accuracy": 0.7495291978120804,
      "step": 49
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.142901420593262,
      "learning_rate": 9.090909090909091e-06,
      "loss": 5.8141,
      "mean_token_accuracy": 0.7420907914638519,
      "step": 50
    },
    {
      "epoch": 0.408,
      "grad_norm": 7.876557350158691,
      "learning_rate": 8.181818181818183e-06,
      "loss": 4.959,
      "mean_token_accuracy": 0.7740796357393265,
      "step": 51
    },
    {
      "epoch": 0.416,
      "grad_norm": 8.242363929748535,
      "learning_rate": 7.272727272727272e-06,
      "loss": 4.3558,
      "mean_token_accuracy": 0.7940836101770401,
      "step": 52
    },
    {
      "epoch": 0.424,
      "grad_norm": 7.2721452713012695,
      "learning_rate": 6.363636363636363e-06,
      "loss": 5.3993,
      "mean_token_accuracy": 0.750448003411293,
      "step": 53
    },
    {
      "epoch": 0.432,
      "grad_norm": 8.326936721801758,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 5.2227,
      "mean_token_accuracy": 0.7582048922777176,
      "step": 54
    },
    {
      "epoch": 0.44,
      "grad_norm": 7.052926063537598,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 5.415,
      "mean_token_accuracy": 0.7697479426860809,
      "step": 55
    },
    {
      "epoch": 0.448,
      "grad_norm": 7.102321147918701,
      "learning_rate": 3.636363636363636e-06,
      "loss": 4.7006,
      "mean_token_accuracy": 0.7812397330999374,
      "step": 56
    },
    {
      "epoch": 0.456,
      "grad_norm": 6.185770034790039,
      "learning_rate": 2.7272727272727272e-06,
      "loss": 5.3694,
      "mean_token_accuracy": 0.7675666660070419,
      "step": 57
    },
    {
      "epoch": 0.464,
      "grad_norm": 6.738717079162598,
      "learning_rate": 1.818181818181818e-06,
      "loss": 4.4993,
      "mean_token_accuracy": 0.7917324602603912,
      "step": 58
    },
    {
      "epoch": 0.472,
      "grad_norm": 7.1644697189331055,
      "learning_rate": 9.09090909090909e-07,
      "loss": 5.039,
      "mean_token_accuracy": 0.7702366560697556,
      "step": 59
    },
    {
      "epoch": 0.48,
      "grad_norm": 6.943840503692627,
      "learning_rate": 0.0,
      "loss": 4.9739,
      "mean_token_accuracy": 0.7831375449895859,
      "step": 60
    }
  ],
  "logging_steps": 1,
  "max_steps": 60,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 927896961024000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}