{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.48,
  "eval_steps": 500,
  "global_step": 60,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 29.132591247558594,
      "learning_rate": 1e-05,
      "loss": 14.6502,
      "mean_token_accuracy": 0.44139818847179413,
      "step": 1
    },
    {
      "epoch": 0.016,
      "grad_norm": 26.889101028442383,
      "learning_rate": 2e-05,
      "loss": 14.1037,
      "mean_token_accuracy": 0.46050838381052017,
      "step": 2
    },
    {
      "epoch": 0.024,
      "grad_norm": 25.33008575439453,
      "learning_rate": 3e-05,
      "loss": 13.9492,
      "mean_token_accuracy": 0.45162031054496765,
      "step": 3
    },
    {
      "epoch": 0.032,
      "grad_norm": 21.760456085205078,
      "learning_rate": 4e-05,
      "loss": 13.5332,
      "mean_token_accuracy": 0.47065450996160507,
      "step": 4
    },
    {
      "epoch": 0.04,
      "grad_norm": 19.281452178955078,
      "learning_rate": 5e-05,
      "loss": 11.718,
      "mean_token_accuracy": 0.5306272506713867,
      "step": 5
    },
    {
      "epoch": 0.048,
      "grad_norm": 17.71002197265625,
      "learning_rate": 4.909090909090909e-05,
      "loss": 11.6275,
      "mean_token_accuracy": 0.5233139544725418,
      "step": 6
    },
    {
      "epoch": 0.056,
      "grad_norm": 18.48400115966797,
      "learning_rate": 4.8181818181818186e-05,
      "loss": 10.769,
      "mean_token_accuracy": 0.538578063249588,
      "step": 7
    },
    {
      "epoch": 0.064,
      "grad_norm": 17.125812530517578,
      "learning_rate": 4.7272727272727275e-05,
      "loss": 10.1511,
      "mean_token_accuracy": 0.57903091609478,
      "step": 8
    },
    {
      "epoch": 0.072,
      "grad_norm": 16.800792694091797,
      "learning_rate": 4.636363636363636e-05,
      "loss": 9.5746,
      "mean_token_accuracy": 0.5945043712854385,
      "step": 9
    },
    {
      "epoch": 0.08,
      "grad_norm": 13.959136009216309,
      "learning_rate": 4.545454545454546e-05,
      "loss": 9.5138,
      "mean_token_accuracy": 0.6166220307350159,
      "step": 10
    },
    {
      "epoch": 0.088,
      "grad_norm": 13.918156623840332,
      "learning_rate": 4.454545454545455e-05,
      "loss": 9.3013,
      "mean_token_accuracy": 0.6187369078397751,
      "step": 11
    },
    {
      "epoch": 0.096,
      "grad_norm": 13.260101318359375,
      "learning_rate": 4.3636363636363636e-05,
      "loss": 8.5085,
      "mean_token_accuracy": 0.6477507650852203,
      "step": 12
    },
    {
      "epoch": 0.104,
      "grad_norm": 11.593629837036133,
      "learning_rate": 4.2727272727272724e-05,
      "loss": 8.5987,
      "mean_token_accuracy": 0.6391231864690781,
      "step": 13
    },
    {
      "epoch": 0.112,
      "grad_norm": 10.642716407775879,
      "learning_rate": 4.181818181818182e-05,
      "loss": 8.0015,
      "mean_token_accuracy": 0.6515648812055588,
      "step": 14
    },
    {
      "epoch": 0.12,
      "grad_norm": 10.687582015991211,
      "learning_rate": 4.0909090909090915e-05,
      "loss": 7.8515,
      "mean_token_accuracy": 0.659519150853157,
      "step": 15
    },
    {
      "epoch": 0.128,
      "grad_norm": 11.342368125915527,
      "learning_rate": 4e-05,
      "loss": 8.1734,
      "mean_token_accuracy": 0.6454901546239853,
      "step": 16
    },
    {
      "epoch": 0.136,
      "grad_norm": 10.889402389526367,
      "learning_rate": 3.909090909090909e-05,
      "loss": 7.7197,
      "mean_token_accuracy": 0.6463980078697205,
      "step": 17
    },
    {
      "epoch": 0.144,
      "grad_norm": 11.274605751037598,
      "learning_rate": 3.818181818181819e-05,
      "loss": 7.7246,
      "mean_token_accuracy": 0.6666717827320099,
      "step": 18
    },
    {
      "epoch": 0.152,
      "grad_norm": 9.856607437133789,
      "learning_rate": 3.7272727272727276e-05,
      "loss": 7.9621,
      "mean_token_accuracy": 0.6616508513689041,
      "step": 19
    },
    {
      "epoch": 0.16,
      "grad_norm": 11.277185440063477,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 7.6901,
      "mean_token_accuracy": 0.6638920903205872,
      "step": 20
    },
    {
      "epoch": 0.168,
      "grad_norm": 11.014278411865234,
      "learning_rate": 3.545454545454546e-05,
      "loss": 7.4989,
      "mean_token_accuracy": 0.6643838137388229,
      "step": 21
    },
    {
      "epoch": 0.176,
      "grad_norm": 10.972683906555176,
      "learning_rate": 3.454545454545455e-05,
      "loss": 6.8229,
      "mean_token_accuracy": 0.7015579491853714,
      "step": 22
    },
    {
      "epoch": 0.184,
      "grad_norm": 9.975464820861816,
      "learning_rate": 3.3636363636363636e-05,
      "loss": 7.3764,
      "mean_token_accuracy": 0.6741979718208313,
      "step": 23
    },
    {
      "epoch": 0.192,
      "grad_norm": 9.958491325378418,
      "learning_rate": 3.272727272727273e-05,
      "loss": 7.0166,
      "mean_token_accuracy": 0.698300376534462,
      "step": 24
    },
    {
      "epoch": 0.2,
      "grad_norm": 10.115056037902832,
      "learning_rate": 3.181818181818182e-05,
      "loss": 6.4697,
      "mean_token_accuracy": 0.7078270465135574,
      "step": 25
    },
    {
      "epoch": 0.208,
      "grad_norm": 10.057324409484863,
      "learning_rate": 3.090909090909091e-05,
      "loss": 6.6724,
      "mean_token_accuracy": 0.7051044702529907,
      "step": 26
    },
    {
      "epoch": 0.216,
      "grad_norm": 9.80183219909668,
      "learning_rate": 3e-05,
      "loss": 7.0968,
      "mean_token_accuracy": 0.6842809170484543,
      "step": 27
    },
    {
      "epoch": 0.224,
      "grad_norm": 10.466309547424316,
      "learning_rate": 2.909090909090909e-05,
      "loss": 6.8699,
      "mean_token_accuracy": 0.7123119533061981,
      "step": 28
    },
    {
      "epoch": 0.232,
      "grad_norm": 10.540712356567383,
      "learning_rate": 2.818181818181818e-05,
      "loss": 6.8986,
      "mean_token_accuracy": 0.6990974545478821,
      "step": 29
    },
    {
      "epoch": 0.24,
      "grad_norm": 10.496529579162598,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 7.1639,
      "mean_token_accuracy": 0.6841171979904175,
      "step": 30
    },
    {
      "epoch": 0.248,
      "grad_norm": 10.347450256347656,
      "learning_rate": 2.636363636363636e-05,
      "loss": 7.1243,
      "mean_token_accuracy": 0.7038989663124084,
      "step": 31
    },
    {
      "epoch": 0.256,
      "grad_norm": 10.918871879577637,
      "learning_rate": 2.5454545454545454e-05,
      "loss": 6.5305,
      "mean_token_accuracy": 0.7208158075809479,
      "step": 32
    },
    {
      "epoch": 0.264,
      "grad_norm": 9.658543586730957,
      "learning_rate": 2.4545454545454545e-05,
      "loss": 6.4891,
      "mean_token_accuracy": 0.7414443045854568,
      "step": 33
    },
    {
      "epoch": 0.272,
      "grad_norm": 9.635473251342773,
      "learning_rate": 2.3636363636363637e-05,
      "loss": 6.6357,
      "mean_token_accuracy": 0.73957559466362,
      "step": 34
    },
    {
      "epoch": 0.28,
      "grad_norm": 9.881387710571289,
      "learning_rate": 2.272727272727273e-05,
      "loss": 6.8124,
      "mean_token_accuracy": 0.7286674231290817,
      "step": 35
    },
    {
      "epoch": 0.288,
      "grad_norm": 9.951563835144043,
      "learning_rate": 2.1818181818181818e-05,
      "loss": 6.8861,
      "mean_token_accuracy": 0.7150181531906128,
      "step": 36
    },
    {
      "epoch": 0.296,
      "grad_norm": 8.651752471923828,
      "learning_rate": 2.090909090909091e-05,
      "loss": 6.1332,
      "mean_token_accuracy": 0.7416634857654572,
      "step": 37
    },
    {
      "epoch": 0.304,
      "grad_norm": 9.848931312561035,
      "learning_rate": 2e-05,
      "loss": 6.2172,
      "mean_token_accuracy": 0.737366572022438,
      "step": 38
    },
    {
      "epoch": 0.312,
      "grad_norm": 9.531893730163574,
      "learning_rate": 1.9090909090909094e-05,
      "loss": 6.8838,
      "mean_token_accuracy": 0.6978575885295868,
      "step": 39
    },
    {
      "epoch": 0.32,
      "grad_norm": 9.349637031555176,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 6.739,
      "mean_token_accuracy": 0.7097857445478439,
      "step": 40
    },
    {
      "epoch": 0.328,
      "grad_norm": 9.532185554504395,
      "learning_rate": 1.7272727272727274e-05,
      "loss": 6.0505,
      "mean_token_accuracy": 0.7424566149711609,
      "step": 41
    },
    {
      "epoch": 0.336,
      "grad_norm": 8.855209350585938,
      "learning_rate": 1.6363636363636366e-05,
      "loss": 6.4006,
      "mean_token_accuracy": 0.7251224219799042,
      "step": 42
    },
    {
      "epoch": 0.344,
      "grad_norm": 8.668252944946289,
      "learning_rate": 1.5454545454545454e-05,
      "loss": 6.0441,
      "mean_token_accuracy": 0.7409052848815918,
      "step": 43
    },
    {
      "epoch": 0.352,
      "grad_norm": 10.111790657043457,
      "learning_rate": 1.4545454545454545e-05,
      "loss": 6.173,
      "mean_token_accuracy": 0.7369689494371414,
      "step": 44
    },
    {
      "epoch": 0.36,
      "grad_norm": 8.958393096923828,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 6.2257,
      "mean_token_accuracy": 0.7140295207500458,
      "step": 45
    },
    {
      "epoch": 0.368,
      "grad_norm": 10.362472534179688,
      "learning_rate": 1.2727272727272727e-05,
      "loss": 5.2547,
      "mean_token_accuracy": 0.7552689164876938,
      "step": 46
    },
    {
      "epoch": 0.376,
      "grad_norm": 9.94664478302002,
      "learning_rate": 1.1818181818181819e-05,
      "loss": 5.992,
      "mean_token_accuracy": 0.7395860105752945,
      "step": 47
    },
    {
      "epoch": 0.384,
      "grad_norm": 8.77272891998291,
      "learning_rate": 1.0909090909090909e-05,
      "loss": 6.4631,
      "mean_token_accuracy": 0.7185734361410141,
      "step": 48
    },
    {
      "epoch": 0.392,
      "grad_norm": 8.110837936401367,
      "learning_rate": 1e-05,
      "loss": 6.3143,
      "mean_token_accuracy": 0.7158133089542389,
      "step": 49
    },
    {
      "epoch": 0.4,
      "grad_norm": 8.038437843322754,
      "learning_rate": 9.090909090909091e-06,
      "loss": 6.8103,
      "mean_token_accuracy": 0.7134602963924408,
      "step": 50
    },
    {
      "epoch": 0.408,
      "grad_norm": 8.795354843139648,
      "learning_rate": 8.181818181818183e-06,
      "loss": 6.0269,
      "mean_token_accuracy": 0.7261971086263657,
      "step": 51
    },
    {
      "epoch": 0.416,
      "grad_norm": 8.536953926086426,
      "learning_rate": 7.272727272727272e-06,
      "loss": 5.355,
      "mean_token_accuracy": 0.7585936486721039,
      "step": 52
    },
    {
      "epoch": 0.424,
      "grad_norm": 8.2533540725708,
      "learning_rate": 6.363636363636363e-06,
      "loss": 6.5612,
      "mean_token_accuracy": 0.7138941884040833,
      "step": 53
    },
    {
      "epoch": 0.432,
      "grad_norm": 9.274043083190918,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 6.4098,
      "mean_token_accuracy": 0.7215629369020462,
      "step": 54
    },
    {
      "epoch": 0.44,
      "grad_norm": 9.127202987670898,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 6.5323,
      "mean_token_accuracy": 0.7295349985361099,
      "step": 55
    },
    {
      "epoch": 0.448,
      "grad_norm": 8.477712631225586,
      "learning_rate": 3.636363636363636e-06,
      "loss": 5.4184,
      "mean_token_accuracy": 0.7586539834737778,
      "step": 56
    },
    {
      "epoch": 0.456,
      "grad_norm": 7.758520126342773,
      "learning_rate": 2.7272727272727272e-06,
      "loss": 6.2795,
      "mean_token_accuracy": 0.7345724701881409,
      "step": 57
    },
    {
      "epoch": 0.464,
      "grad_norm": 8.064261436462402,
      "learning_rate": 1.818181818181818e-06,
      "loss": 5.3877,
      "mean_token_accuracy": 0.7539169639348984,
      "step": 58
    },
    {
      "epoch": 0.472,
      "grad_norm": 8.512259483337402,
      "learning_rate": 9.09090909090909e-07,
      "loss": 5.8114,
      "mean_token_accuracy": 0.7418429106473923,
      "step": 59
    },
    {
      "epoch": 0.48,
      "grad_norm": 8.13984203338623,
      "learning_rate": 0.0,
      "loss": 5.8393,
      "mean_token_accuracy": 0.7534664273262024,
      "step": 60
    }
  ],
  "logging_steps": 1,
  "max_steps": 60,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 313206736158720.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}