{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 123, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008130081300813009, "grad_norm": 0.47265625, "learning_rate": 9.91869918699187e-06, "loss": 1.7914, "step": 1 }, { "epoch": 0.016260162601626018, "grad_norm": 0.482421875, "learning_rate": 9.837398373983741e-06, "loss": 1.8308, "step": 2 }, { "epoch": 0.024390243902439025, "grad_norm": 0.451171875, "learning_rate": 9.756097560975611e-06, "loss": 1.8346, "step": 3 }, { "epoch": 0.032520325203252036, "grad_norm": 0.4609375, "learning_rate": 9.67479674796748e-06, "loss": 1.8078, "step": 4 }, { "epoch": 0.04065040650406504, "grad_norm": 0.4765625, "learning_rate": 9.59349593495935e-06, "loss": 1.83, "step": 5 }, { "epoch": 0.04878048780487805, "grad_norm": 0.60546875, "learning_rate": 9.51219512195122e-06, "loss": 1.9395, "step": 6 }, { "epoch": 0.056910569105691054, "grad_norm": 0.431640625, "learning_rate": 9.43089430894309e-06, "loss": 1.8757, "step": 7 }, { "epoch": 0.06504065040650407, "grad_norm": 0.36328125, "learning_rate": 9.34959349593496e-06, "loss": 1.8858, "step": 8 }, { "epoch": 0.07317073170731707, "grad_norm": 0.408203125, "learning_rate": 9.268292682926831e-06, "loss": 1.6752, "step": 9 }, { "epoch": 0.08130081300813008, "grad_norm": 0.30859375, "learning_rate": 9.1869918699187e-06, "loss": 1.7763, "step": 10 }, { "epoch": 0.08943089430894309, "grad_norm": 0.27734375, "learning_rate": 9.10569105691057e-06, "loss": 1.7691, "step": 11 }, { "epoch": 0.0975609756097561, "grad_norm": 0.2734375, "learning_rate": 9.02439024390244e-06, "loss": 1.6506, "step": 12 }, { "epoch": 0.10569105691056911, "grad_norm": 0.310546875, "learning_rate": 8.94308943089431e-06, "loss": 1.733, "step": 13 }, { "epoch": 0.11382113821138211, "grad_norm": 0.26171875, "learning_rate": 8.86178861788618e-06, "loss": 1.7344, "step": 14 }, { "epoch": 0.12195121951219512, "grad_norm": 0.32421875, "learning_rate": 8.78048780487805e-06, "loss": 1.7125, "step": 15 }, { "epoch": 0.13008130081300814, "grad_norm": 0.3515625, "learning_rate": 8.69918699186992e-06, "loss": 1.7315, "step": 16 }, { "epoch": 0.13821138211382114, "grad_norm": 0.2470703125, "learning_rate": 8.617886178861789e-06, "loss": 1.6819, "step": 17 }, { "epoch": 0.14634146341463414, "grad_norm": 0.2333984375, "learning_rate": 8.536585365853658e-06, "loss": 1.6793, "step": 18 }, { "epoch": 0.15447154471544716, "grad_norm": 0.263671875, "learning_rate": 8.45528455284553e-06, "loss": 1.6847, "step": 19 }, { "epoch": 0.16260162601626016, "grad_norm": 0.2333984375, "learning_rate": 8.373983739837399e-06, "loss": 1.6838, "step": 20 }, { "epoch": 0.17073170731707318, "grad_norm": 0.2099609375, "learning_rate": 8.292682926829268e-06, "loss": 1.63, "step": 21 }, { "epoch": 0.17886178861788618, "grad_norm": 0.2392578125, "learning_rate": 8.21138211382114e-06, "loss": 1.6067, "step": 22 }, { "epoch": 0.18699186991869918, "grad_norm": 0.1962890625, "learning_rate": 8.130081300813009e-06, "loss": 1.638, "step": 23 }, { "epoch": 0.1951219512195122, "grad_norm": 0.1982421875, "learning_rate": 8.048780487804879e-06, "loss": 1.6493, "step": 24 }, { "epoch": 0.2032520325203252, "grad_norm": 0.19140625, "learning_rate": 7.967479674796748e-06, "loss": 1.6309, "step": 25 }, { "epoch": 0.21138211382113822, "grad_norm": 0.212890625, "learning_rate": 7.886178861788618e-06, "loss": 1.653, "step": 26 }, { "epoch": 0.21951219512195122, "grad_norm": 0.2236328125, "learning_rate": 7.804878048780489e-06, "loss": 1.5796, "step": 27 }, { "epoch": 0.22764227642276422, "grad_norm": 0.1787109375, "learning_rate": 7.723577235772358e-06, "loss": 1.6617, "step": 28 }, { "epoch": 0.23577235772357724, "grad_norm": 0.1923828125, "learning_rate": 7.64227642276423e-06, "loss": 1.6065, "step": 29 }, { "epoch": 0.24390243902439024, "grad_norm": 0.17578125, "learning_rate": 7.560975609756098e-06, "loss": 1.635, "step": 30 }, { "epoch": 0.25203252032520324, "grad_norm": 0.1787109375, "learning_rate": 7.4796747967479676e-06, "loss": 1.593, "step": 31 }, { "epoch": 0.2601626016260163, "grad_norm": 0.201171875, "learning_rate": 7.398373983739838e-06, "loss": 1.6044, "step": 32 }, { "epoch": 0.2682926829268293, "grad_norm": 0.1806640625, "learning_rate": 7.317073170731707e-06, "loss": 1.6211, "step": 33 }, { "epoch": 0.2764227642276423, "grad_norm": 0.1689453125, "learning_rate": 7.2357723577235786e-06, "loss": 1.6082, "step": 34 }, { "epoch": 0.2845528455284553, "grad_norm": 0.1875, "learning_rate": 7.154471544715448e-06, "loss": 1.5812, "step": 35 }, { "epoch": 0.2926829268292683, "grad_norm": 0.166015625, "learning_rate": 7.0731707317073175e-06, "loss": 1.6133, "step": 36 }, { "epoch": 0.3008130081300813, "grad_norm": 0.1923828125, "learning_rate": 6.991869918699188e-06, "loss": 1.5672, "step": 37 }, { "epoch": 0.3089430894308943, "grad_norm": 0.166015625, "learning_rate": 6.910569105691057e-06, "loss": 1.6293, "step": 38 }, { "epoch": 0.3170731707317073, "grad_norm": 0.197265625, "learning_rate": 6.829268292682928e-06, "loss": 1.6276, "step": 39 }, { "epoch": 0.3252032520325203, "grad_norm": 0.1572265625, "learning_rate": 6.747967479674797e-06, "loss": 1.5441, "step": 40 }, { "epoch": 0.3333333333333333, "grad_norm": 0.1787109375, "learning_rate": 6.666666666666667e-06, "loss": 1.564, "step": 41 }, { "epoch": 0.34146341463414637, "grad_norm": 0.1962890625, "learning_rate": 6.585365853658538e-06, "loss": 1.5339, "step": 42 }, { "epoch": 0.34959349593495936, "grad_norm": 0.1923828125, "learning_rate": 6.504065040650407e-06, "loss": 1.5444, "step": 43 }, { "epoch": 0.35772357723577236, "grad_norm": 0.201171875, "learning_rate": 6.422764227642278e-06, "loss": 1.5525, "step": 44 }, { "epoch": 0.36585365853658536, "grad_norm": 0.169921875, "learning_rate": 6.341463414634147e-06, "loss": 1.5702, "step": 45 }, { "epoch": 0.37398373983739835, "grad_norm": 0.27734375, "learning_rate": 6.260162601626017e-06, "loss": 1.6304, "step": 46 }, { "epoch": 0.3821138211382114, "grad_norm": 0.1962890625, "learning_rate": 6.178861788617887e-06, "loss": 1.5507, "step": 47 }, { "epoch": 0.3902439024390244, "grad_norm": 0.166015625, "learning_rate": 6.0975609756097564e-06, "loss": 1.4688, "step": 48 }, { "epoch": 0.3983739837398374, "grad_norm": 0.19140625, "learning_rate": 6.016260162601627e-06, "loss": 1.5426, "step": 49 }, { "epoch": 0.4065040650406504, "grad_norm": 0.1767578125, "learning_rate": 5.934959349593496e-06, "loss": 1.575, "step": 50 }, { "epoch": 0.4146341463414634, "grad_norm": 0.162109375, "learning_rate": 5.853658536585366e-06, "loss": 1.572, "step": 51 }, { "epoch": 0.42276422764227645, "grad_norm": 0.197265625, "learning_rate": 5.772357723577237e-06, "loss": 1.5698, "step": 52 }, { "epoch": 0.43089430894308944, "grad_norm": 0.287109375, "learning_rate": 5.691056910569106e-06, "loss": 1.5296, "step": 53 }, { "epoch": 0.43902439024390244, "grad_norm": 0.1708984375, "learning_rate": 5.609756097560977e-06, "loss": 1.5567, "step": 54 }, { "epoch": 0.44715447154471544, "grad_norm": 0.1708984375, "learning_rate": 5.528455284552846e-06, "loss": 1.5867, "step": 55 }, { "epoch": 0.45528455284552843, "grad_norm": 0.173828125, "learning_rate": 5.447154471544716e-06, "loss": 1.522, "step": 56 }, { "epoch": 0.4634146341463415, "grad_norm": 0.16015625, "learning_rate": 5.365853658536586e-06, "loss": 1.5443, "step": 57 }, { "epoch": 0.4715447154471545, "grad_norm": 0.2099609375, "learning_rate": 5.2845528455284555e-06, "loss": 1.5125, "step": 58 }, { "epoch": 0.4796747967479675, "grad_norm": 0.212890625, "learning_rate": 5.203252032520326e-06, "loss": 1.6069, "step": 59 }, { "epoch": 0.4878048780487805, "grad_norm": 0.15625, "learning_rate": 5.121951219512195e-06, "loss": 1.5464, "step": 60 }, { "epoch": 0.4959349593495935, "grad_norm": 0.2109375, "learning_rate": 5.040650406504065e-06, "loss": 1.5834, "step": 61 }, { "epoch": 0.5040650406504065, "grad_norm": 0.1796875, "learning_rate": 4.959349593495935e-06, "loss": 1.5126, "step": 62 }, { "epoch": 0.5121951219512195, "grad_norm": 0.177734375, "learning_rate": 4.8780487804878055e-06, "loss": 1.5238, "step": 63 }, { "epoch": 0.5203252032520326, "grad_norm": 0.1455078125, "learning_rate": 4.796747967479675e-06, "loss": 1.5456, "step": 64 }, { "epoch": 0.5284552845528455, "grad_norm": 0.1513671875, "learning_rate": 4.715447154471545e-06, "loss": 1.5332, "step": 65 }, { "epoch": 0.5365853658536586, "grad_norm": 0.1484375, "learning_rate": 4.634146341463416e-06, "loss": 1.5083, "step": 66 }, { "epoch": 0.5447154471544715, "grad_norm": 0.150390625, "learning_rate": 4.552845528455285e-06, "loss": 1.5709, "step": 67 }, { "epoch": 0.5528455284552846, "grad_norm": 0.138671875, "learning_rate": 4.471544715447155e-06, "loss": 1.5517, "step": 68 }, { "epoch": 0.5609756097560976, "grad_norm": 0.14453125, "learning_rate": 4.390243902439025e-06, "loss": 1.4947, "step": 69 }, { "epoch": 0.5691056910569106, "grad_norm": 0.244140625, "learning_rate": 4.308943089430894e-06, "loss": 1.4829, "step": 70 }, { "epoch": 0.5772357723577236, "grad_norm": 0.1630859375, "learning_rate": 4.227642276422765e-06, "loss": 1.5183, "step": 71 }, { "epoch": 0.5853658536585366, "grad_norm": 0.171875, "learning_rate": 4.146341463414634e-06, "loss": 1.5429, "step": 72 }, { "epoch": 0.5934959349593496, "grad_norm": 0.13671875, "learning_rate": 4.0650406504065046e-06, "loss": 1.5305, "step": 73 }, { "epoch": 0.6016260162601627, "grad_norm": 0.1533203125, "learning_rate": 3.983739837398374e-06, "loss": 1.4979, "step": 74 }, { "epoch": 0.6097560975609756, "grad_norm": 0.1640625, "learning_rate": 3.902439024390244e-06, "loss": 1.5252, "step": 75 }, { "epoch": 0.6178861788617886, "grad_norm": 0.1591796875, "learning_rate": 3.821138211382115e-06, "loss": 1.4131, "step": 76 }, { "epoch": 0.6260162601626016, "grad_norm": 0.1484375, "learning_rate": 3.7398373983739838e-06, "loss": 1.4648, "step": 77 }, { "epoch": 0.6341463414634146, "grad_norm": 0.298828125, "learning_rate": 3.6585365853658537e-06, "loss": 1.5375, "step": 78 }, { "epoch": 0.6422764227642277, "grad_norm": 0.2119140625, "learning_rate": 3.577235772357724e-06, "loss": 1.4335, "step": 79 }, { "epoch": 0.6504065040650406, "grad_norm": 0.142578125, "learning_rate": 3.495934959349594e-06, "loss": 1.5052, "step": 80 }, { "epoch": 0.6585365853658537, "grad_norm": 0.1474609375, "learning_rate": 3.414634146341464e-06, "loss": 1.5025, "step": 81 }, { "epoch": 0.6666666666666666, "grad_norm": 0.248046875, "learning_rate": 3.3333333333333333e-06, "loss": 1.4299, "step": 82 }, { "epoch": 0.6747967479674797, "grad_norm": 0.1689453125, "learning_rate": 3.2520325203252037e-06, "loss": 1.4853, "step": 83 }, { "epoch": 0.6829268292682927, "grad_norm": 0.138671875, "learning_rate": 3.1707317073170736e-06, "loss": 1.5545, "step": 84 }, { "epoch": 0.6910569105691057, "grad_norm": 0.158203125, "learning_rate": 3.0894308943089435e-06, "loss": 1.4281, "step": 85 }, { "epoch": 0.6991869918699187, "grad_norm": 0.1396484375, "learning_rate": 3.0081300813008134e-06, "loss": 1.4572, "step": 86 }, { "epoch": 0.7073170731707317, "grad_norm": 0.1357421875, "learning_rate": 2.926829268292683e-06, "loss": 1.4792, "step": 87 }, { "epoch": 0.7154471544715447, "grad_norm": 0.15625, "learning_rate": 2.845528455284553e-06, "loss": 1.4081, "step": 88 }, { "epoch": 0.7235772357723578, "grad_norm": 0.13671875, "learning_rate": 2.764227642276423e-06, "loss": 1.4979, "step": 89 }, { "epoch": 0.7317073170731707, "grad_norm": 0.1298828125, "learning_rate": 2.682926829268293e-06, "loss": 1.4775, "step": 90 }, { "epoch": 0.7398373983739838, "grad_norm": 0.140625, "learning_rate": 2.601626016260163e-06, "loss": 1.4734, "step": 91 }, { "epoch": 0.7479674796747967, "grad_norm": 0.1455078125, "learning_rate": 2.5203252032520324e-06, "loss": 1.3895, "step": 92 }, { "epoch": 0.7560975609756098, "grad_norm": 0.1611328125, "learning_rate": 2.4390243902439027e-06, "loss": 1.4338, "step": 93 }, { "epoch": 0.7642276422764228, "grad_norm": 0.150390625, "learning_rate": 2.3577235772357727e-06, "loss": 1.4483, "step": 94 }, { "epoch": 0.7723577235772358, "grad_norm": 0.1396484375, "learning_rate": 2.2764227642276426e-06, "loss": 1.4468, "step": 95 }, { "epoch": 0.7804878048780488, "grad_norm": 0.16796875, "learning_rate": 2.1951219512195125e-06, "loss": 1.4327, "step": 96 }, { "epoch": 0.7886178861788617, "grad_norm": 0.1796875, "learning_rate": 2.1138211382113824e-06, "loss": 1.5092, "step": 97 }, { "epoch": 0.7967479674796748, "grad_norm": 0.1630859375, "learning_rate": 2.0325203252032523e-06, "loss": 1.499, "step": 98 }, { "epoch": 0.8048780487804879, "grad_norm": 0.1474609375, "learning_rate": 1.951219512195122e-06, "loss": 1.4513, "step": 99 }, { "epoch": 0.8130081300813008, "grad_norm": 0.2099609375, "learning_rate": 1.8699186991869919e-06, "loss": 1.473, "step": 100 }, { "epoch": 0.8211382113821138, "grad_norm": 0.185546875, "learning_rate": 1.788617886178862e-06, "loss": 1.4495, "step": 101 }, { "epoch": 0.8292682926829268, "grad_norm": 0.1416015625, "learning_rate": 1.707317073170732e-06, "loss": 1.504, "step": 102 }, { "epoch": 0.8373983739837398, "grad_norm": 0.20703125, "learning_rate": 1.6260162601626018e-06, "loss": 1.4803, "step": 103 }, { "epoch": 0.8455284552845529, "grad_norm": 0.1376953125, "learning_rate": 1.5447154471544717e-06, "loss": 1.4936, "step": 104 }, { "epoch": 0.8536585365853658, "grad_norm": 0.1552734375, "learning_rate": 1.4634146341463414e-06, "loss": 1.5159, "step": 105 }, { "epoch": 0.8617886178861789, "grad_norm": 0.140625, "learning_rate": 1.3821138211382116e-06, "loss": 1.5042, "step": 106 }, { "epoch": 0.8699186991869918, "grad_norm": 0.380859375, "learning_rate": 1.3008130081300815e-06, "loss": 1.3785, "step": 107 }, { "epoch": 0.8780487804878049, "grad_norm": 0.15625, "learning_rate": 1.2195121951219514e-06, "loss": 1.4519, "step": 108 }, { "epoch": 0.8861788617886179, "grad_norm": 0.2080078125, "learning_rate": 1.1382113821138213e-06, "loss": 1.4291, "step": 109 }, { "epoch": 0.8943089430894309, "grad_norm": 0.158203125, "learning_rate": 1.0569105691056912e-06, "loss": 1.3927, "step": 110 }, { "epoch": 0.9024390243902439, "grad_norm": 0.138671875, "learning_rate": 9.75609756097561e-07, "loss": 1.4396, "step": 111 }, { "epoch": 0.9105691056910569, "grad_norm": 0.1416015625, "learning_rate": 8.94308943089431e-07, "loss": 1.4591, "step": 112 }, { "epoch": 0.9186991869918699, "grad_norm": 0.1748046875, "learning_rate": 8.130081300813009e-07, "loss": 1.5032, "step": 113 }, { "epoch": 0.926829268292683, "grad_norm": 0.15234375, "learning_rate": 7.317073170731707e-07, "loss": 1.4562, "step": 114 }, { "epoch": 0.9349593495934959, "grad_norm": 0.169921875, "learning_rate": 6.504065040650407e-07, "loss": 1.4683, "step": 115 }, { "epoch": 0.943089430894309, "grad_norm": 0.1396484375, "learning_rate": 5.691056910569106e-07, "loss": 1.5243, "step": 116 }, { "epoch": 0.9512195121951219, "grad_norm": 0.1884765625, "learning_rate": 4.878048780487805e-07, "loss": 1.4146, "step": 117 }, { "epoch": 0.959349593495935, "grad_norm": 0.1552734375, "learning_rate": 4.0650406504065046e-07, "loss": 1.3936, "step": 118 }, { "epoch": 0.967479674796748, "grad_norm": 0.1552734375, "learning_rate": 3.2520325203252037e-07, "loss": 1.4421, "step": 119 }, { "epoch": 0.975609756097561, "grad_norm": 0.251953125, "learning_rate": 2.439024390243903e-07, "loss": 1.3564, "step": 120 }, { "epoch": 0.983739837398374, "grad_norm": 0.1708984375, "learning_rate": 1.6260162601626018e-07, "loss": 1.5465, "step": 121 }, { "epoch": 0.991869918699187, "grad_norm": 0.19140625, "learning_rate": 8.130081300813009e-08, "loss": 1.4304, "step": 122 }, { "epoch": 1.0, "grad_norm": 0.1474609375, "learning_rate": 0.0, "loss": 1.4253, "step": 123 } ], "logging_steps": 1.0, "max_steps": 123, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.31844774623445e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }