{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008620689655172414, "grad_norm": 0.490234375, "learning_rate": 9.913793103448277e-06, "loss": 1.8897, "step": 1 }, { "epoch": 0.017241379310344827, "grad_norm": 0.44921875, "learning_rate": 9.827586206896553e-06, "loss": 1.7847, "step": 2 }, { "epoch": 0.02586206896551724, "grad_norm": 0.470703125, "learning_rate": 9.741379310344829e-06, "loss": 1.8435, "step": 3 }, { "epoch": 0.034482758620689655, "grad_norm": 0.435546875, "learning_rate": 9.655172413793105e-06, "loss": 1.8, "step": 4 }, { "epoch": 0.04310344827586207, "grad_norm": 0.41796875, "learning_rate": 9.56896551724138e-06, "loss": 1.8393, "step": 5 }, { "epoch": 0.05172413793103448, "grad_norm": 0.466796875, "learning_rate": 9.482758620689655e-06, "loss": 1.8275, "step": 6 }, { "epoch": 0.0603448275862069, "grad_norm": 0.365234375, "learning_rate": 9.396551724137931e-06, "loss": 1.7559, "step": 7 }, { "epoch": 0.06896551724137931, "grad_norm": 0.3203125, "learning_rate": 9.310344827586207e-06, "loss": 1.7784, "step": 8 }, { "epoch": 0.07758620689655173, "grad_norm": 0.3046875, "learning_rate": 9.224137931034484e-06, "loss": 1.7637, "step": 9 }, { "epoch": 0.08620689655172414, "grad_norm": 0.28515625, "learning_rate": 9.13793103448276e-06, "loss": 1.7164, "step": 10 }, { "epoch": 0.09482758620689655, "grad_norm": 0.294921875, "learning_rate": 9.051724137931036e-06, "loss": 1.7796, "step": 11 }, { "epoch": 0.10344827586206896, "grad_norm": 0.265625, "learning_rate": 8.965517241379312e-06, "loss": 1.698, "step": 12 }, { "epoch": 0.11206896551724138, "grad_norm": 0.2412109375, "learning_rate": 8.879310344827588e-06, "loss": 1.7032, "step": 13 }, { "epoch": 0.1206896551724138, "grad_norm": 0.2392578125, "learning_rate": 8.793103448275862e-06, "loss": 1.7151, "step": 14 }, { "epoch": 0.12931034482758622, "grad_norm": 0.265625, "learning_rate": 8.706896551724138e-06, "loss": 1.6807, "step": 15 }, { "epoch": 0.13793103448275862, "grad_norm": 0.2373046875, "learning_rate": 8.620689655172414e-06, "loss": 1.7027, "step": 16 }, { "epoch": 0.14655172413793102, "grad_norm": 0.244140625, "learning_rate": 8.53448275862069e-06, "loss": 1.6944, "step": 17 }, { "epoch": 0.15517241379310345, "grad_norm": 0.2392578125, "learning_rate": 8.448275862068966e-06, "loss": 1.7346, "step": 18 }, { "epoch": 0.16379310344827586, "grad_norm": 0.22265625, "learning_rate": 8.362068965517242e-06, "loss": 1.5981, "step": 19 }, { "epoch": 0.1724137931034483, "grad_norm": 0.2451171875, "learning_rate": 8.275862068965518e-06, "loss": 1.6794, "step": 20 }, { "epoch": 0.1810344827586207, "grad_norm": 0.212890625, "learning_rate": 8.189655172413794e-06, "loss": 1.6385, "step": 21 }, { "epoch": 0.1896551724137931, "grad_norm": 0.205078125, "learning_rate": 8.103448275862069e-06, "loss": 1.6853, "step": 22 }, { "epoch": 0.19827586206896552, "grad_norm": 0.2021484375, "learning_rate": 8.017241379310345e-06, "loss": 1.6833, "step": 23 }, { "epoch": 0.20689655172413793, "grad_norm": 0.1875, "learning_rate": 7.93103448275862e-06, "loss": 1.5735, "step": 24 }, { "epoch": 0.21551724137931033, "grad_norm": 0.1806640625, "learning_rate": 7.844827586206897e-06, "loss": 1.6481, "step": 25 }, { "epoch": 0.22413793103448276, "grad_norm": 0.1796875, "learning_rate": 7.758620689655173e-06, "loss": 1.6415, "step": 26 }, { "epoch": 0.23275862068965517, "grad_norm": 0.2109375, "learning_rate": 7.672413793103449e-06, "loss": 1.697, "step": 27 }, { "epoch": 0.2413793103448276, "grad_norm": 0.16796875, "learning_rate": 7.586206896551724e-06, "loss": 1.6084, "step": 28 }, { "epoch": 0.25, "grad_norm": 0.1884765625, "learning_rate": 7.500000000000001e-06, "loss": 1.6449, "step": 29 }, { "epoch": 0.25862068965517243, "grad_norm": 0.216796875, "learning_rate": 7.413793103448277e-06, "loss": 1.6571, "step": 30 }, { "epoch": 0.2672413793103448, "grad_norm": 0.193359375, "learning_rate": 7.327586206896552e-06, "loss": 1.5427, "step": 31 }, { "epoch": 0.27586206896551724, "grad_norm": 0.1708984375, "learning_rate": 7.241379310344828e-06, "loss": 1.498, "step": 32 }, { "epoch": 0.28448275862068967, "grad_norm": 0.1865234375, "learning_rate": 7.155172413793104e-06, "loss": 1.6642, "step": 33 }, { "epoch": 0.29310344827586204, "grad_norm": 0.212890625, "learning_rate": 7.0689655172413796e-06, "loss": 1.6553, "step": 34 }, { "epoch": 0.3017241379310345, "grad_norm": 0.1796875, "learning_rate": 6.982758620689656e-06, "loss": 1.5789, "step": 35 }, { "epoch": 0.3103448275862069, "grad_norm": 0.2041015625, "learning_rate": 6.896551724137932e-06, "loss": 1.568, "step": 36 }, { "epoch": 0.31896551724137934, "grad_norm": 0.189453125, "learning_rate": 6.810344827586207e-06, "loss": 1.5269, "step": 37 }, { "epoch": 0.3275862068965517, "grad_norm": 0.2001953125, "learning_rate": 6.724137931034484e-06, "loss": 1.6063, "step": 38 }, { "epoch": 0.33620689655172414, "grad_norm": 0.181640625, "learning_rate": 6.63793103448276e-06, "loss": 1.5043, "step": 39 }, { "epoch": 0.3448275862068966, "grad_norm": 0.1904296875, "learning_rate": 6.551724137931035e-06, "loss": 1.6623, "step": 40 }, { "epoch": 0.35344827586206895, "grad_norm": 0.1875, "learning_rate": 6.465517241379311e-06, "loss": 1.5848, "step": 41 }, { "epoch": 0.3620689655172414, "grad_norm": 0.17578125, "learning_rate": 6.379310344827587e-06, "loss": 1.5198, "step": 42 }, { "epoch": 0.3706896551724138, "grad_norm": 0.1982421875, "learning_rate": 6.293103448275862e-06, "loss": 1.5934, "step": 43 }, { "epoch": 0.3793103448275862, "grad_norm": 0.23828125, "learning_rate": 6.206896551724138e-06, "loss": 1.5823, "step": 44 }, { "epoch": 0.3879310344827586, "grad_norm": 0.1533203125, "learning_rate": 6.1206896551724135e-06, "loss": 1.5819, "step": 45 }, { "epoch": 0.39655172413793105, "grad_norm": 0.162109375, "learning_rate": 6.03448275862069e-06, "loss": 1.5034, "step": 46 }, { "epoch": 0.4051724137931034, "grad_norm": 0.1943359375, "learning_rate": 5.9482758620689665e-06, "loss": 1.5815, "step": 47 }, { "epoch": 0.41379310344827586, "grad_norm": 0.1796875, "learning_rate": 5.862068965517242e-06, "loss": 1.5191, "step": 48 }, { "epoch": 0.4224137931034483, "grad_norm": 0.1708984375, "learning_rate": 5.775862068965518e-06, "loss": 1.5469, "step": 49 }, { "epoch": 0.43103448275862066, "grad_norm": 0.158203125, "learning_rate": 5.689655172413794e-06, "loss": 1.5112, "step": 50 }, { "epoch": 0.4396551724137931, "grad_norm": 0.1748046875, "learning_rate": 5.603448275862069e-06, "loss": 1.5272, "step": 51 }, { "epoch": 0.4482758620689655, "grad_norm": 0.14453125, "learning_rate": 5.517241379310345e-06, "loss": 1.4435, "step": 52 }, { "epoch": 0.45689655172413796, "grad_norm": 0.185546875, "learning_rate": 5.431034482758621e-06, "loss": 1.5291, "step": 53 }, { "epoch": 0.46551724137931033, "grad_norm": 0.1923828125, "learning_rate": 5.344827586206896e-06, "loss": 1.4499, "step": 54 }, { "epoch": 0.47413793103448276, "grad_norm": 0.2294921875, "learning_rate": 5.258620689655173e-06, "loss": 1.5481, "step": 55 }, { "epoch": 0.4827586206896552, "grad_norm": 0.16796875, "learning_rate": 5.172413793103449e-06, "loss": 1.4589, "step": 56 }, { "epoch": 0.49137931034482757, "grad_norm": 0.166015625, "learning_rate": 5.086206896551724e-06, "loss": 1.5175, "step": 57 }, { "epoch": 0.5, "grad_norm": 0.1630859375, "learning_rate": 5e-06, "loss": 1.5369, "step": 58 }, { "epoch": 0.5086206896551724, "grad_norm": 0.1826171875, "learning_rate": 4.9137931034482765e-06, "loss": 1.5117, "step": 59 }, { "epoch": 0.5172413793103449, "grad_norm": 0.1376953125, "learning_rate": 4.8275862068965525e-06, "loss": 1.5455, "step": 60 }, { "epoch": 0.5258620689655172, "grad_norm": 0.1650390625, "learning_rate": 4.741379310344828e-06, "loss": 1.5353, "step": 61 }, { "epoch": 0.5344827586206896, "grad_norm": 0.1591796875, "learning_rate": 4.655172413793104e-06, "loss": 1.4975, "step": 62 }, { "epoch": 0.5431034482758621, "grad_norm": 0.15234375, "learning_rate": 4.56896551724138e-06, "loss": 1.5309, "step": 63 }, { "epoch": 0.5517241379310345, "grad_norm": 0.14453125, "learning_rate": 4.482758620689656e-06, "loss": 1.5189, "step": 64 }, { "epoch": 0.5603448275862069, "grad_norm": 0.134765625, "learning_rate": 4.396551724137931e-06, "loss": 1.4481, "step": 65 }, { "epoch": 0.5689655172413793, "grad_norm": 0.1484375, "learning_rate": 4.310344827586207e-06, "loss": 1.5394, "step": 66 }, { "epoch": 0.5775862068965517, "grad_norm": 0.1572265625, "learning_rate": 4.224137931034483e-06, "loss": 1.5147, "step": 67 }, { "epoch": 0.5862068965517241, "grad_norm": 0.14453125, "learning_rate": 4.137931034482759e-06, "loss": 1.5097, "step": 68 }, { "epoch": 0.5948275862068966, "grad_norm": 0.1826171875, "learning_rate": 4.051724137931034e-06, "loss": 1.5006, "step": 69 }, { "epoch": 0.603448275862069, "grad_norm": 0.166015625, "learning_rate": 3.96551724137931e-06, "loss": 1.5042, "step": 70 }, { "epoch": 0.6120689655172413, "grad_norm": 0.1591796875, "learning_rate": 3.8793103448275865e-06, "loss": 1.5101, "step": 71 }, { "epoch": 0.6206896551724138, "grad_norm": 0.16796875, "learning_rate": 3.793103448275862e-06, "loss": 1.4706, "step": 72 }, { "epoch": 0.6293103448275862, "grad_norm": 0.1669921875, "learning_rate": 3.7068965517241385e-06, "loss": 1.513, "step": 73 }, { "epoch": 0.6379310344827587, "grad_norm": 0.1650390625, "learning_rate": 3.620689655172414e-06, "loss": 1.4574, "step": 74 }, { "epoch": 0.646551724137931, "grad_norm": 0.18359375, "learning_rate": 3.5344827586206898e-06, "loss": 1.4928, "step": 75 }, { "epoch": 0.6551724137931034, "grad_norm": 0.1572265625, "learning_rate": 3.448275862068966e-06, "loss": 1.4709, "step": 76 }, { "epoch": 0.6637931034482759, "grad_norm": 0.12890625, "learning_rate": 3.362068965517242e-06, "loss": 1.428, "step": 77 }, { "epoch": 0.6724137931034483, "grad_norm": 0.1669921875, "learning_rate": 3.2758620689655175e-06, "loss": 1.5212, "step": 78 }, { "epoch": 0.6810344827586207, "grad_norm": 0.1416015625, "learning_rate": 3.1896551724137935e-06, "loss": 1.492, "step": 79 }, { "epoch": 0.6896551724137931, "grad_norm": 0.1572265625, "learning_rate": 3.103448275862069e-06, "loss": 1.3807, "step": 80 }, { "epoch": 0.6982758620689655, "grad_norm": 0.1416015625, "learning_rate": 3.017241379310345e-06, "loss": 1.4559, "step": 81 }, { "epoch": 0.7068965517241379, "grad_norm": 0.1484375, "learning_rate": 2.931034482758621e-06, "loss": 1.5057, "step": 82 }, { "epoch": 0.7155172413793104, "grad_norm": 0.185546875, "learning_rate": 2.844827586206897e-06, "loss": 1.4579, "step": 83 }, { "epoch": 0.7241379310344828, "grad_norm": 0.1455078125, "learning_rate": 2.7586206896551725e-06, "loss": 1.4931, "step": 84 }, { "epoch": 0.7327586206896551, "grad_norm": 0.1552734375, "learning_rate": 2.672413793103448e-06, "loss": 1.4055, "step": 85 }, { "epoch": 0.7413793103448276, "grad_norm": 0.1484375, "learning_rate": 2.5862068965517246e-06, "loss": 1.4233, "step": 86 }, { "epoch": 0.75, "grad_norm": 0.2041015625, "learning_rate": 2.5e-06, "loss": 1.4571, "step": 87 }, { "epoch": 0.7586206896551724, "grad_norm": 0.1748046875, "learning_rate": 2.4137931034482762e-06, "loss": 1.503, "step": 88 }, { "epoch": 0.7672413793103449, "grad_norm": 0.18359375, "learning_rate": 2.327586206896552e-06, "loss": 1.4796, "step": 89 }, { "epoch": 0.7758620689655172, "grad_norm": 0.1591796875, "learning_rate": 2.241379310344828e-06, "loss": 1.4198, "step": 90 }, { "epoch": 0.7844827586206896, "grad_norm": 0.1845703125, "learning_rate": 2.1551724137931035e-06, "loss": 1.4005, "step": 91 }, { "epoch": 0.7931034482758621, "grad_norm": 0.1328125, "learning_rate": 2.0689655172413796e-06, "loss": 1.4767, "step": 92 }, { "epoch": 0.8017241379310345, "grad_norm": 0.185546875, "learning_rate": 1.982758620689655e-06, "loss": 1.5157, "step": 93 }, { "epoch": 0.8103448275862069, "grad_norm": 0.2138671875, "learning_rate": 1.896551724137931e-06, "loss": 1.4623, "step": 94 }, { "epoch": 0.8189655172413793, "grad_norm": 0.1826171875, "learning_rate": 1.810344827586207e-06, "loss": 1.4235, "step": 95 }, { "epoch": 0.8275862068965517, "grad_norm": 0.173828125, "learning_rate": 1.724137931034483e-06, "loss": 1.5049, "step": 96 }, { "epoch": 0.8362068965517241, "grad_norm": 0.13671875, "learning_rate": 1.6379310344827587e-06, "loss": 1.4492, "step": 97 }, { "epoch": 0.8448275862068966, "grad_norm": 0.1943359375, "learning_rate": 1.5517241379310346e-06, "loss": 1.4447, "step": 98 }, { "epoch": 0.853448275862069, "grad_norm": 0.1455078125, "learning_rate": 1.4655172413793104e-06, "loss": 1.4843, "step": 99 }, { "epoch": 0.8620689655172413, "grad_norm": 0.1767578125, "learning_rate": 1.3793103448275862e-06, "loss": 1.3772, "step": 100 }, { "epoch": 0.8706896551724138, "grad_norm": 0.2109375, "learning_rate": 1.2931034482758623e-06, "loss": 1.5097, "step": 101 }, { "epoch": 0.8793103448275862, "grad_norm": 0.1689453125, "learning_rate": 1.2068965517241381e-06, "loss": 1.4115, "step": 102 }, { "epoch": 0.8879310344827587, "grad_norm": 0.13671875, "learning_rate": 1.120689655172414e-06, "loss": 1.4794, "step": 103 }, { "epoch": 0.896551724137931, "grad_norm": 0.17578125, "learning_rate": 1.0344827586206898e-06, "loss": 1.4197, "step": 104 }, { "epoch": 0.9051724137931034, "grad_norm": 0.2080078125, "learning_rate": 9.482758620689655e-07, "loss": 1.4999, "step": 105 }, { "epoch": 0.9137931034482759, "grad_norm": 0.1337890625, "learning_rate": 8.620689655172415e-07, "loss": 1.4207, "step": 106 }, { "epoch": 0.9224137931034483, "grad_norm": 0.1416015625, "learning_rate": 7.758620689655173e-07, "loss": 1.396, "step": 107 }, { "epoch": 0.9310344827586207, "grad_norm": 0.162109375, "learning_rate": 6.896551724137931e-07, "loss": 1.4244, "step": 108 }, { "epoch": 0.9396551724137931, "grad_norm": 0.185546875, "learning_rate": 6.034482758620691e-07, "loss": 1.4731, "step": 109 }, { "epoch": 0.9482758620689655, "grad_norm": 0.1396484375, "learning_rate": 5.172413793103449e-07, "loss": 1.5113, "step": 110 }, { "epoch": 0.9568965517241379, "grad_norm": 0.1630859375, "learning_rate": 4.3103448275862073e-07, "loss": 1.468, "step": 111 }, { "epoch": 0.9655172413793104, "grad_norm": 0.162109375, "learning_rate": 3.4482758620689656e-07, "loss": 1.4651, "step": 112 }, { "epoch": 0.9741379310344828, "grad_norm": 0.2001953125, "learning_rate": 2.5862068965517245e-07, "loss": 1.4632, "step": 113 }, { "epoch": 0.9827586206896551, "grad_norm": 0.1337890625, "learning_rate": 1.7241379310344828e-07, "loss": 1.4311, "step": 114 }, { "epoch": 0.9913793103448276, "grad_norm": 0.134765625, "learning_rate": 8.620689655172414e-08, "loss": 1.423, "step": 115 }, { "epoch": 1.0, "grad_norm": 0.1494140625, "learning_rate": 0.0, "loss": 1.4911, "step": 116 } ], "logging_steps": 1.0, "max_steps": 116, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1862449256372634e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }