{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 95, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010526315789473684, "grad_norm": 23.386537551879883, "learning_rate": 1e-05, "loss": 1.9167, "step": 1 }, { "epoch": 0.021052631578947368, "grad_norm": 10.429777145385742, "learning_rate": 9.894736842105264e-06, "loss": 1.3625, "step": 2 }, { "epoch": 0.031578947368421054, "grad_norm": 5.902014255523682, "learning_rate": 9.789473684210527e-06, "loss": 1.0434, "step": 3 }, { "epoch": 0.042105263157894736, "grad_norm": 3.8165690898895264, "learning_rate": 9.68421052631579e-06, "loss": 0.9712, "step": 4 }, { "epoch": 0.05263157894736842, "grad_norm": 2.740595579147339, "learning_rate": 9.578947368421054e-06, "loss": 0.8903, "step": 5 }, { "epoch": 0.06315789473684211, "grad_norm": 2.1069369316101074, "learning_rate": 9.473684210526315e-06, "loss": 0.8538, "step": 6 }, { "epoch": 0.07368421052631578, "grad_norm": 1.8523842096328735, "learning_rate": 9.36842105263158e-06, "loss": 0.9114, "step": 7 }, { "epoch": 0.08421052631578947, "grad_norm": 2.3119540214538574, "learning_rate": 9.263157894736842e-06, "loss": 0.8689, "step": 8 }, { "epoch": 0.09473684210526316, "grad_norm": 1.5400722026824951, "learning_rate": 9.157894736842105e-06, "loss": 0.8396, "step": 9 }, { "epoch": 0.10526315789473684, "grad_norm": 1.361148715019226, "learning_rate": 9.05263157894737e-06, "loss": 0.8039, "step": 10 }, { "epoch": 0.11578947368421053, "grad_norm": 1.372304081916809, "learning_rate": 8.947368421052632e-06, "loss": 0.7709, "step": 11 }, { "epoch": 0.12631578947368421, "grad_norm": 1.1872608661651611, "learning_rate": 8.842105263157895e-06, "loss": 0.6855, "step": 12 }, { "epoch": 0.1368421052631579, "grad_norm": 1.40359365940094, "learning_rate": 8.736842105263158e-06, "loss": 0.7116, "step": 13 }, { "epoch": 0.14736842105263157, "grad_norm": 1.5963401794433594, "learning_rate": 8.631578947368422e-06, "loss": 0.7409, "step": 14 }, { "epoch": 0.15789473684210525, "grad_norm": 1.281571388244629, "learning_rate": 8.526315789473685e-06, "loss": 0.7726, "step": 15 }, { "epoch": 0.16842105263157894, "grad_norm": 1.175497055053711, "learning_rate": 8.421052631578948e-06, "loss": 0.7512, "step": 16 }, { "epoch": 0.17894736842105263, "grad_norm": 1.2076319456100464, "learning_rate": 8.315789473684212e-06, "loss": 0.7557, "step": 17 }, { "epoch": 0.18947368421052632, "grad_norm": 1.5126558542251587, "learning_rate": 8.210526315789475e-06, "loss": 0.7726, "step": 18 }, { "epoch": 0.2, "grad_norm": 1.0856505632400513, "learning_rate": 8.105263157894736e-06, "loss": 0.7186, "step": 19 }, { "epoch": 0.21052631578947367, "grad_norm": 1.5049703121185303, "learning_rate": 8.000000000000001e-06, "loss": 0.7609, "step": 20 }, { "epoch": 0.22105263157894736, "grad_norm": 1.3303167819976807, "learning_rate": 7.894736842105265e-06, "loss": 0.7687, "step": 21 }, { "epoch": 0.23157894736842105, "grad_norm": 1.452388048171997, "learning_rate": 7.789473684210526e-06, "loss": 0.7491, "step": 22 }, { "epoch": 0.24210526315789474, "grad_norm": 1.1407661437988281, "learning_rate": 7.68421052631579e-06, "loss": 0.7664, "step": 23 }, { "epoch": 0.25263157894736843, "grad_norm": 1.1311556100845337, "learning_rate": 7.578947368421054e-06, "loss": 0.7121, "step": 24 }, { "epoch": 0.2631578947368421, "grad_norm": 1.2517611980438232, "learning_rate": 7.473684210526316e-06, "loss": 0.7488, "step": 25 }, { "epoch": 0.2736842105263158, "grad_norm": 1.3132089376449585, "learning_rate": 7.368421052631579e-06, "loss": 0.7614, "step": 26 }, { "epoch": 0.28421052631578947, "grad_norm": 1.1150217056274414, "learning_rate": 7.263157894736843e-06, "loss": 0.696, "step": 27 }, { "epoch": 0.29473684210526313, "grad_norm": 1.4403979778289795, "learning_rate": 7.157894736842106e-06, "loss": 0.6517, "step": 28 }, { "epoch": 0.30526315789473685, "grad_norm": 1.2276042699813843, "learning_rate": 7.052631578947369e-06, "loss": 0.7429, "step": 29 }, { "epoch": 0.3157894736842105, "grad_norm": 1.221534252166748, "learning_rate": 6.947368421052632e-06, "loss": 0.7094, "step": 30 }, { "epoch": 0.3263157894736842, "grad_norm": 1.1484534740447998, "learning_rate": 6.842105263157896e-06, "loss": 0.7411, "step": 31 }, { "epoch": 0.3368421052631579, "grad_norm": 1.2985154390335083, "learning_rate": 6.736842105263158e-06, "loss": 0.6867, "step": 32 }, { "epoch": 0.3473684210526316, "grad_norm": 1.177146077156067, "learning_rate": 6.631578947368421e-06, "loss": 0.683, "step": 33 }, { "epoch": 0.35789473684210527, "grad_norm": 1.4374433755874634, "learning_rate": 6.526315789473685e-06, "loss": 0.7092, "step": 34 }, { "epoch": 0.3684210526315789, "grad_norm": 1.2070837020874023, "learning_rate": 6.421052631578948e-06, "loss": 0.7503, "step": 35 }, { "epoch": 0.37894736842105264, "grad_norm": 1.227997899055481, "learning_rate": 6.31578947368421e-06, "loss": 0.7568, "step": 36 }, { "epoch": 0.3894736842105263, "grad_norm": 1.0868141651153564, "learning_rate": 6.2105263157894745e-06, "loss": 0.6607, "step": 37 }, { "epoch": 0.4, "grad_norm": 1.2848992347717285, "learning_rate": 6.105263157894738e-06, "loss": 0.7026, "step": 38 }, { "epoch": 0.4105263157894737, "grad_norm": 1.3217140436172485, "learning_rate": 6e-06, "loss": 0.737, "step": 39 }, { "epoch": 0.42105263157894735, "grad_norm": 1.161586046218872, "learning_rate": 5.8947368421052634e-06, "loss": 0.6788, "step": 40 }, { "epoch": 0.43157894736842106, "grad_norm": 1.046071171760559, "learning_rate": 5.789473684210527e-06, "loss": 0.6617, "step": 41 }, { "epoch": 0.4421052631578947, "grad_norm": 1.1674532890319824, "learning_rate": 5.68421052631579e-06, "loss": 0.7044, "step": 42 }, { "epoch": 0.45263157894736844, "grad_norm": 1.3029122352600098, "learning_rate": 5.578947368421052e-06, "loss": 0.6678, "step": 43 }, { "epoch": 0.4631578947368421, "grad_norm": 1.1042920351028442, "learning_rate": 5.4736842105263165e-06, "loss": 0.6793, "step": 44 }, { "epoch": 0.47368421052631576, "grad_norm": 1.1280156373977661, "learning_rate": 5.36842105263158e-06, "loss": 0.7237, "step": 45 }, { "epoch": 0.4842105263157895, "grad_norm": 1.0256831645965576, "learning_rate": 5.263157894736842e-06, "loss": 0.7494, "step": 46 }, { "epoch": 0.49473684210526314, "grad_norm": 1.6717668771743774, "learning_rate": 5.157894736842106e-06, "loss": 0.6832, "step": 47 }, { "epoch": 0.5052631578947369, "grad_norm": 1.1831214427947998, "learning_rate": 5.052631578947369e-06, "loss": 0.6373, "step": 48 }, { "epoch": 0.5157894736842106, "grad_norm": 1.233420729637146, "learning_rate": 4.947368421052632e-06, "loss": 0.5988, "step": 49 }, { "epoch": 0.5263157894736842, "grad_norm": 1.3069719076156616, "learning_rate": 4.842105263157895e-06, "loss": 0.6621, "step": 50 }, { "epoch": 0.5368421052631579, "grad_norm": 1.0840635299682617, "learning_rate": 4.736842105263158e-06, "loss": 0.6282, "step": 51 }, { "epoch": 0.5473684210526316, "grad_norm": 1.312338948249817, "learning_rate": 4.631578947368421e-06, "loss": 0.6492, "step": 52 }, { "epoch": 0.5578947368421052, "grad_norm": 1.155627727508545, "learning_rate": 4.526315789473685e-06, "loss": 0.6231, "step": 53 }, { "epoch": 0.5684210526315789, "grad_norm": 1.1554993391036987, "learning_rate": 4.4210526315789476e-06, "loss": 0.6046, "step": 54 }, { "epoch": 0.5789473684210527, "grad_norm": 1.149007797241211, "learning_rate": 4.315789473684211e-06, "loss": 0.6111, "step": 55 }, { "epoch": 0.5894736842105263, "grad_norm": 1.0409859418869019, "learning_rate": 4.210526315789474e-06, "loss": 0.6147, "step": 56 }, { "epoch": 0.6, "grad_norm": 1.1579722166061401, "learning_rate": 4.105263157894737e-06, "loss": 0.5838, "step": 57 }, { "epoch": 0.6105263157894737, "grad_norm": 1.0128191709518433, "learning_rate": 4.000000000000001e-06, "loss": 0.6059, "step": 58 }, { "epoch": 0.6210526315789474, "grad_norm": 1.2369976043701172, "learning_rate": 3.894736842105263e-06, "loss": 0.6007, "step": 59 }, { "epoch": 0.631578947368421, "grad_norm": 1.4128843545913696, "learning_rate": 3.789473684210527e-06, "loss": 0.6647, "step": 60 }, { "epoch": 0.6421052631578947, "grad_norm": 1.320022702217102, "learning_rate": 3.6842105263157896e-06, "loss": 0.6834, "step": 61 }, { "epoch": 0.6526315789473685, "grad_norm": 0.996947169303894, "learning_rate": 3.578947368421053e-06, "loss": 0.5553, "step": 62 }, { "epoch": 0.6631578947368421, "grad_norm": 1.0932636260986328, "learning_rate": 3.473684210526316e-06, "loss": 0.5997, "step": 63 }, { "epoch": 0.6736842105263158, "grad_norm": 1.1183348894119263, "learning_rate": 3.368421052631579e-06, "loss": 0.58, "step": 64 }, { "epoch": 0.6842105263157895, "grad_norm": 1.2177517414093018, "learning_rate": 3.2631578947368423e-06, "loss": 0.6384, "step": 65 }, { "epoch": 0.6947368421052632, "grad_norm": 1.2820733785629272, "learning_rate": 3.157894736842105e-06, "loss": 0.6548, "step": 66 }, { "epoch": 0.7052631578947368, "grad_norm": 1.27961003780365, "learning_rate": 3.052631578947369e-06, "loss": 0.6085, "step": 67 }, { "epoch": 0.7157894736842105, "grad_norm": 1.1323752403259277, "learning_rate": 2.9473684210526317e-06, "loss": 0.6521, "step": 68 }, { "epoch": 0.7263157894736842, "grad_norm": 1.0661325454711914, "learning_rate": 2.842105263157895e-06, "loss": 0.6587, "step": 69 }, { "epoch": 0.7368421052631579, "grad_norm": 1.112278699874878, "learning_rate": 2.7368421052631583e-06, "loss": 0.5834, "step": 70 }, { "epoch": 0.7473684210526316, "grad_norm": 1.2987736463546753, "learning_rate": 2.631578947368421e-06, "loss": 0.6459, "step": 71 }, { "epoch": 0.7578947368421053, "grad_norm": 1.196009874343872, "learning_rate": 2.5263157894736844e-06, "loss": 0.587, "step": 72 }, { "epoch": 0.7684210526315789, "grad_norm": 1.2561204433441162, "learning_rate": 2.4210526315789477e-06, "loss": 0.5874, "step": 73 }, { "epoch": 0.7789473684210526, "grad_norm": 1.0731667280197144, "learning_rate": 2.3157894736842105e-06, "loss": 0.6109, "step": 74 }, { "epoch": 0.7894736842105263, "grad_norm": 1.1367136240005493, "learning_rate": 2.2105263157894738e-06, "loss": 0.6368, "step": 75 }, { "epoch": 0.8, "grad_norm": 1.1752535104751587, "learning_rate": 2.105263157894737e-06, "loss": 0.6562, "step": 76 }, { "epoch": 0.8105263157894737, "grad_norm": 1.4213778972625732, "learning_rate": 2.0000000000000003e-06, "loss": 0.6511, "step": 77 }, { "epoch": 0.8210526315789474, "grad_norm": 1.342801570892334, "learning_rate": 1.8947368421052634e-06, "loss": 0.6833, "step": 78 }, { "epoch": 0.8315789473684211, "grad_norm": 0.9640871286392212, "learning_rate": 1.7894736842105265e-06, "loss": 0.5962, "step": 79 }, { "epoch": 0.8421052631578947, "grad_norm": 1.0870826244354248, "learning_rate": 1.6842105263157895e-06, "loss": 0.6084, "step": 80 }, { "epoch": 0.8526315789473684, "grad_norm": 1.0977897644042969, "learning_rate": 1.5789473684210526e-06, "loss": 0.6475, "step": 81 }, { "epoch": 0.8631578947368421, "grad_norm": 1.0268280506134033, "learning_rate": 1.4736842105263159e-06, "loss": 0.6048, "step": 82 }, { "epoch": 0.8736842105263158, "grad_norm": 1.1313055753707886, "learning_rate": 1.3684210526315791e-06, "loss": 0.6514, "step": 83 }, { "epoch": 0.8842105263157894, "grad_norm": 1.0519204139709473, "learning_rate": 1.2631578947368422e-06, "loss": 0.5872, "step": 84 }, { "epoch": 0.8947368421052632, "grad_norm": 0.9330918788909912, "learning_rate": 1.1578947368421053e-06, "loss": 0.568, "step": 85 }, { "epoch": 0.9052631578947369, "grad_norm": 0.9565713405609131, "learning_rate": 1.0526315789473685e-06, "loss": 0.5445, "step": 86 }, { "epoch": 0.9157894736842105, "grad_norm": 1.0495901107788086, "learning_rate": 9.473684210526317e-07, "loss": 0.5917, "step": 87 }, { "epoch": 0.9263157894736842, "grad_norm": 0.9997472167015076, "learning_rate": 8.421052631578948e-07, "loss": 0.5896, "step": 88 }, { "epoch": 0.9368421052631579, "grad_norm": 1.1609445810317993, "learning_rate": 7.368421052631579e-07, "loss": 0.5559, "step": 89 }, { "epoch": 0.9473684210526315, "grad_norm": 1.0440585613250732, "learning_rate": 6.315789473684211e-07, "loss": 0.6345, "step": 90 }, { "epoch": 0.9578947368421052, "grad_norm": 1.073056697845459, "learning_rate": 5.263157894736843e-07, "loss": 0.6147, "step": 91 }, { "epoch": 0.968421052631579, "grad_norm": 1.022722601890564, "learning_rate": 4.210526315789474e-07, "loss": 0.6041, "step": 92 }, { "epoch": 0.9789473684210527, "grad_norm": 0.9993295073509216, "learning_rate": 3.1578947368421055e-07, "loss": 0.5827, "step": 93 }, { "epoch": 0.9894736842105263, "grad_norm": 1.5678324699401855, "learning_rate": 2.105263157894737e-07, "loss": 0.5515, "step": 94 }, { "epoch": 1.0, "grad_norm": 1.1239935159683228, "learning_rate": 1.0526315789473685e-07, "loss": 0.5996, "step": 95 } ], "logging_steps": 1.0, "max_steps": 95, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.28204549301207e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }