{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 676,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014792899408284023,
"grad_norm": 0.55859375,
"learning_rate": 1e-05,
"loss": 2.4231,
"step": 1
},
{
"epoch": 0.0029585798816568047,
"grad_norm": 0.52734375,
"learning_rate": 9.985207100591717e-06,
"loss": 2.4086,
"step": 2
},
{
"epoch": 0.004437869822485207,
"grad_norm": 0.5,
"learning_rate": 9.970414201183432e-06,
"loss": 2.4096,
"step": 3
},
{
"epoch": 0.005917159763313609,
"grad_norm": 0.490234375,
"learning_rate": 9.95562130177515e-06,
"loss": 2.4641,
"step": 4
},
{
"epoch": 0.0073964497041420114,
"grad_norm": 0.498046875,
"learning_rate": 9.940828402366864e-06,
"loss": 2.3734,
"step": 5
},
{
"epoch": 0.008875739644970414,
"grad_norm": 0.455078125,
"learning_rate": 9.926035502958581e-06,
"loss": 2.3715,
"step": 6
},
{
"epoch": 0.010355029585798817,
"grad_norm": 0.455078125,
"learning_rate": 9.911242603550297e-06,
"loss": 2.4129,
"step": 7
},
{
"epoch": 0.011834319526627219,
"grad_norm": 0.44921875,
"learning_rate": 9.896449704142012e-06,
"loss": 2.3551,
"step": 8
},
{
"epoch": 0.013313609467455622,
"grad_norm": 0.419921875,
"learning_rate": 9.88165680473373e-06,
"loss": 2.3445,
"step": 9
},
{
"epoch": 0.014792899408284023,
"grad_norm": 0.41015625,
"learning_rate": 9.866863905325444e-06,
"loss": 2.2953,
"step": 10
},
{
"epoch": 0.016272189349112426,
"grad_norm": 0.41015625,
"learning_rate": 9.852071005917161e-06,
"loss": 2.3048,
"step": 11
},
{
"epoch": 0.01775147928994083,
"grad_norm": 0.40234375,
"learning_rate": 9.837278106508877e-06,
"loss": 2.2406,
"step": 12
},
{
"epoch": 0.019230769230769232,
"grad_norm": 0.40234375,
"learning_rate": 9.822485207100593e-06,
"loss": 2.2238,
"step": 13
},
{
"epoch": 0.020710059171597635,
"grad_norm": 0.38671875,
"learning_rate": 9.807692307692308e-06,
"loss": 2.1735,
"step": 14
},
{
"epoch": 0.022189349112426034,
"grad_norm": 0.400390625,
"learning_rate": 9.792899408284024e-06,
"loss": 2.163,
"step": 15
},
{
"epoch": 0.023668639053254437,
"grad_norm": 0.3984375,
"learning_rate": 9.778106508875741e-06,
"loss": 2.1651,
"step": 16
},
{
"epoch": 0.02514792899408284,
"grad_norm": 0.42578125,
"learning_rate": 9.763313609467457e-06,
"loss": 2.135,
"step": 17
},
{
"epoch": 0.026627218934911243,
"grad_norm": 0.423828125,
"learning_rate": 9.748520710059173e-06,
"loss": 2.102,
"step": 18
},
{
"epoch": 0.028106508875739646,
"grad_norm": 0.392578125,
"learning_rate": 9.733727810650888e-06,
"loss": 2.0616,
"step": 19
},
{
"epoch": 0.029585798816568046,
"grad_norm": 0.396484375,
"learning_rate": 9.718934911242604e-06,
"loss": 2.0527,
"step": 20
},
{
"epoch": 0.03106508875739645,
"grad_norm": 0.37109375,
"learning_rate": 9.70414201183432e-06,
"loss": 1.9993,
"step": 21
},
{
"epoch": 0.03254437869822485,
"grad_norm": 0.36328125,
"learning_rate": 9.689349112426036e-06,
"loss": 2.021,
"step": 22
},
{
"epoch": 0.034023668639053255,
"grad_norm": 0.36328125,
"learning_rate": 9.674556213017751e-06,
"loss": 1.9456,
"step": 23
},
{
"epoch": 0.03550295857988166,
"grad_norm": 0.361328125,
"learning_rate": 9.659763313609469e-06,
"loss": 1.9524,
"step": 24
},
{
"epoch": 0.03698224852071006,
"grad_norm": 0.390625,
"learning_rate": 9.644970414201184e-06,
"loss": 1.9964,
"step": 25
},
{
"epoch": 0.038461538461538464,
"grad_norm": 0.373046875,
"learning_rate": 9.6301775147929e-06,
"loss": 1.9319,
"step": 26
},
{
"epoch": 0.03994082840236687,
"grad_norm": 0.3671875,
"learning_rate": 9.615384615384616e-06,
"loss": 1.9021,
"step": 27
},
{
"epoch": 0.04142011834319527,
"grad_norm": 0.361328125,
"learning_rate": 9.600591715976331e-06,
"loss": 1.8616,
"step": 28
},
{
"epoch": 0.042899408284023666,
"grad_norm": 0.36328125,
"learning_rate": 9.585798816568049e-06,
"loss": 1.8125,
"step": 29
},
{
"epoch": 0.04437869822485207,
"grad_norm": 0.353515625,
"learning_rate": 9.571005917159763e-06,
"loss": 1.8167,
"step": 30
},
{
"epoch": 0.04585798816568047,
"grad_norm": 0.369140625,
"learning_rate": 9.55621301775148e-06,
"loss": 1.7553,
"step": 31
},
{
"epoch": 0.047337278106508875,
"grad_norm": 0.357421875,
"learning_rate": 9.541420118343196e-06,
"loss": 1.7764,
"step": 32
},
{
"epoch": 0.04881656804733728,
"grad_norm": 0.392578125,
"learning_rate": 9.526627218934912e-06,
"loss": 1.7717,
"step": 33
},
{
"epoch": 0.05029585798816568,
"grad_norm": 0.341796875,
"learning_rate": 9.511834319526629e-06,
"loss": 1.7104,
"step": 34
},
{
"epoch": 0.051775147928994084,
"grad_norm": 0.349609375,
"learning_rate": 9.497041420118343e-06,
"loss": 1.6314,
"step": 35
},
{
"epoch": 0.05325443786982249,
"grad_norm": 0.34375,
"learning_rate": 9.48224852071006e-06,
"loss": 1.668,
"step": 36
},
{
"epoch": 0.05473372781065089,
"grad_norm": 0.369140625,
"learning_rate": 9.467455621301776e-06,
"loss": 1.6516,
"step": 37
},
{
"epoch": 0.05621301775147929,
"grad_norm": 0.3359375,
"learning_rate": 9.452662721893492e-06,
"loss": 1.6512,
"step": 38
},
{
"epoch": 0.057692307692307696,
"grad_norm": 0.34375,
"learning_rate": 9.43786982248521e-06,
"loss": 1.6004,
"step": 39
},
{
"epoch": 0.05917159763313609,
"grad_norm": 0.34375,
"learning_rate": 9.423076923076923e-06,
"loss": 1.5672,
"step": 40
},
{
"epoch": 0.060650887573964495,
"grad_norm": 0.314453125,
"learning_rate": 9.40828402366864e-06,
"loss": 1.5295,
"step": 41
},
{
"epoch": 0.0621301775147929,
"grad_norm": 0.3828125,
"learning_rate": 9.393491124260356e-06,
"loss": 1.5748,
"step": 42
},
{
"epoch": 0.06360946745562131,
"grad_norm": 0.328125,
"learning_rate": 9.378698224852072e-06,
"loss": 1.55,
"step": 43
},
{
"epoch": 0.0650887573964497,
"grad_norm": 0.318359375,
"learning_rate": 9.363905325443788e-06,
"loss": 1.4825,
"step": 44
},
{
"epoch": 0.06656804733727811,
"grad_norm": 0.3359375,
"learning_rate": 9.349112426035503e-06,
"loss": 1.489,
"step": 45
},
{
"epoch": 0.06804733727810651,
"grad_norm": 0.3359375,
"learning_rate": 9.33431952662722e-06,
"loss": 1.4201,
"step": 46
},
{
"epoch": 0.0695266272189349,
"grad_norm": 0.33203125,
"learning_rate": 9.319526627218935e-06,
"loss": 1.4269,
"step": 47
},
{
"epoch": 0.07100591715976332,
"grad_norm": 0.314453125,
"learning_rate": 9.304733727810652e-06,
"loss": 1.4084,
"step": 48
},
{
"epoch": 0.07248520710059171,
"grad_norm": 0.314453125,
"learning_rate": 9.289940828402368e-06,
"loss": 1.3724,
"step": 49
},
{
"epoch": 0.07396449704142012,
"grad_norm": 0.3203125,
"learning_rate": 9.275147928994084e-06,
"loss": 1.4076,
"step": 50
},
{
"epoch": 0.07544378698224852,
"grad_norm": 0.310546875,
"learning_rate": 9.2603550295858e-06,
"loss": 1.3456,
"step": 51
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.294921875,
"learning_rate": 9.245562130177515e-06,
"loss": 1.3309,
"step": 52
},
{
"epoch": 0.07840236686390532,
"grad_norm": 0.291015625,
"learning_rate": 9.230769230769232e-06,
"loss": 1.3331,
"step": 53
},
{
"epoch": 0.07988165680473373,
"grad_norm": 0.298828125,
"learning_rate": 9.215976331360948e-06,
"loss": 1.3032,
"step": 54
},
{
"epoch": 0.08136094674556213,
"grad_norm": 0.275390625,
"learning_rate": 9.201183431952664e-06,
"loss": 1.2559,
"step": 55
},
{
"epoch": 0.08284023668639054,
"grad_norm": 0.287109375,
"learning_rate": 9.18639053254438e-06,
"loss": 1.289,
"step": 56
},
{
"epoch": 0.08431952662721894,
"grad_norm": 0.279296875,
"learning_rate": 9.171597633136095e-06,
"loss": 1.2803,
"step": 57
},
{
"epoch": 0.08579881656804733,
"grad_norm": 0.2890625,
"learning_rate": 9.15680473372781e-06,
"loss": 1.2438,
"step": 58
},
{
"epoch": 0.08727810650887574,
"grad_norm": 0.275390625,
"learning_rate": 9.142011834319528e-06,
"loss": 1.1926,
"step": 59
},
{
"epoch": 0.08875739644970414,
"grad_norm": 0.27734375,
"learning_rate": 9.127218934911244e-06,
"loss": 1.2031,
"step": 60
},
{
"epoch": 0.09023668639053255,
"grad_norm": 0.26953125,
"learning_rate": 9.11242603550296e-06,
"loss": 1.1792,
"step": 61
},
{
"epoch": 0.09171597633136094,
"grad_norm": 0.28125,
"learning_rate": 9.097633136094675e-06,
"loss": 1.1981,
"step": 62
},
{
"epoch": 0.09319526627218935,
"grad_norm": 0.341796875,
"learning_rate": 9.082840236686391e-06,
"loss": 1.1847,
"step": 63
},
{
"epoch": 0.09467455621301775,
"grad_norm": 0.26953125,
"learning_rate": 9.068047337278107e-06,
"loss": 1.1543,
"step": 64
},
{
"epoch": 0.09615384615384616,
"grad_norm": 0.267578125,
"learning_rate": 9.053254437869822e-06,
"loss": 1.1336,
"step": 65
},
{
"epoch": 0.09763313609467456,
"grad_norm": 0.275390625,
"learning_rate": 9.03846153846154e-06,
"loss": 1.1526,
"step": 66
},
{
"epoch": 0.09911242603550297,
"grad_norm": 0.28515625,
"learning_rate": 9.023668639053255e-06,
"loss": 1.1002,
"step": 67
},
{
"epoch": 0.10059171597633136,
"grad_norm": 0.28515625,
"learning_rate": 9.008875739644971e-06,
"loss": 1.1317,
"step": 68
},
{
"epoch": 0.10207100591715976,
"grad_norm": 0.2890625,
"learning_rate": 8.994082840236687e-06,
"loss": 1.0843,
"step": 69
},
{
"epoch": 0.10355029585798817,
"grad_norm": 0.30078125,
"learning_rate": 8.979289940828403e-06,
"loss": 1.1229,
"step": 70
},
{
"epoch": 0.10502958579881656,
"grad_norm": 0.2490234375,
"learning_rate": 8.96449704142012e-06,
"loss": 1.0765,
"step": 71
},
{
"epoch": 0.10650887573964497,
"grad_norm": 0.263671875,
"learning_rate": 8.949704142011834e-06,
"loss": 1.0386,
"step": 72
},
{
"epoch": 0.10798816568047337,
"grad_norm": 0.25,
"learning_rate": 8.934911242603551e-06,
"loss": 1.0286,
"step": 73
},
{
"epoch": 0.10946745562130178,
"grad_norm": 0.267578125,
"learning_rate": 8.920118343195267e-06,
"loss": 1.0265,
"step": 74
},
{
"epoch": 0.11094674556213018,
"grad_norm": 0.2490234375,
"learning_rate": 8.905325443786983e-06,
"loss": 1.034,
"step": 75
},
{
"epoch": 0.11242603550295859,
"grad_norm": 0.2451171875,
"learning_rate": 8.8905325443787e-06,
"loss": 1.0409,
"step": 76
},
{
"epoch": 0.11390532544378698,
"grad_norm": 0.2490234375,
"learning_rate": 8.875739644970414e-06,
"loss": 0.9947,
"step": 77
},
{
"epoch": 0.11538461538461539,
"grad_norm": 0.2890625,
"learning_rate": 8.860946745562132e-06,
"loss": 1.0118,
"step": 78
},
{
"epoch": 0.11686390532544379,
"grad_norm": 0.275390625,
"learning_rate": 8.846153846153847e-06,
"loss": 1.0052,
"step": 79
},
{
"epoch": 0.11834319526627218,
"grad_norm": 0.279296875,
"learning_rate": 8.831360946745563e-06,
"loss": 0.9843,
"step": 80
},
{
"epoch": 0.11982248520710059,
"grad_norm": 0.244140625,
"learning_rate": 8.816568047337279e-06,
"loss": 0.9775,
"step": 81
},
{
"epoch": 0.12130177514792899,
"grad_norm": 0.2412109375,
"learning_rate": 8.801775147928994e-06,
"loss": 0.9824,
"step": 82
},
{
"epoch": 0.1227810650887574,
"grad_norm": 0.26953125,
"learning_rate": 8.786982248520712e-06,
"loss": 0.9429,
"step": 83
},
{
"epoch": 0.1242603550295858,
"grad_norm": 0.25390625,
"learning_rate": 8.772189349112427e-06,
"loss": 0.9227,
"step": 84
},
{
"epoch": 0.1257396449704142,
"grad_norm": 0.27734375,
"learning_rate": 8.757396449704143e-06,
"loss": 0.9233,
"step": 85
},
{
"epoch": 0.12721893491124261,
"grad_norm": 0.283203125,
"learning_rate": 8.742603550295859e-06,
"loss": 0.9505,
"step": 86
},
{
"epoch": 0.128698224852071,
"grad_norm": 0.314453125,
"learning_rate": 8.727810650887574e-06,
"loss": 0.9382,
"step": 87
},
{
"epoch": 0.1301775147928994,
"grad_norm": 0.318359375,
"learning_rate": 8.71301775147929e-06,
"loss": 0.9702,
"step": 88
},
{
"epoch": 0.13165680473372782,
"grad_norm": 0.265625,
"learning_rate": 8.698224852071006e-06,
"loss": 0.9179,
"step": 89
},
{
"epoch": 0.13313609467455623,
"grad_norm": 0.3125,
"learning_rate": 8.683431952662723e-06,
"loss": 0.9399,
"step": 90
},
{
"epoch": 0.1346153846153846,
"grad_norm": 0.275390625,
"learning_rate": 8.668639053254439e-06,
"loss": 0.907,
"step": 91
},
{
"epoch": 0.13609467455621302,
"grad_norm": 0.271484375,
"learning_rate": 8.653846153846155e-06,
"loss": 0.9184,
"step": 92
},
{
"epoch": 0.13757396449704143,
"grad_norm": 0.24609375,
"learning_rate": 8.63905325443787e-06,
"loss": 0.9028,
"step": 93
},
{
"epoch": 0.1390532544378698,
"grad_norm": 0.265625,
"learning_rate": 8.624260355029586e-06,
"loss": 0.9114,
"step": 94
},
{
"epoch": 0.14053254437869822,
"grad_norm": 0.2734375,
"learning_rate": 8.609467455621302e-06,
"loss": 0.8842,
"step": 95
},
{
"epoch": 0.14201183431952663,
"grad_norm": 0.287109375,
"learning_rate": 8.594674556213019e-06,
"loss": 0.9058,
"step": 96
},
{
"epoch": 0.14349112426035504,
"grad_norm": 0.240234375,
"learning_rate": 8.579881656804735e-06,
"loss": 0.8886,
"step": 97
},
{
"epoch": 0.14497041420118342,
"grad_norm": 0.25,
"learning_rate": 8.56508875739645e-06,
"loss": 0.8919,
"step": 98
},
{
"epoch": 0.14644970414201183,
"grad_norm": 0.2578125,
"learning_rate": 8.550295857988166e-06,
"loss": 0.9077,
"step": 99
},
{
"epoch": 0.14792899408284024,
"grad_norm": 0.2470703125,
"learning_rate": 8.535502958579882e-06,
"loss": 0.864,
"step": 100
},
{
"epoch": 0.14940828402366865,
"grad_norm": 0.2578125,
"learning_rate": 8.5207100591716e-06,
"loss": 0.852,
"step": 101
},
{
"epoch": 0.15088757396449703,
"grad_norm": 0.263671875,
"learning_rate": 8.505917159763313e-06,
"loss": 0.8532,
"step": 102
},
{
"epoch": 0.15236686390532544,
"grad_norm": 0.271484375,
"learning_rate": 8.49112426035503e-06,
"loss": 0.8782,
"step": 103
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.2578125,
"learning_rate": 8.476331360946746e-06,
"loss": 0.8341,
"step": 104
},
{
"epoch": 0.15532544378698224,
"grad_norm": 0.2431640625,
"learning_rate": 8.461538461538462e-06,
"loss": 0.8387,
"step": 105
},
{
"epoch": 0.15680473372781065,
"grad_norm": 0.275390625,
"learning_rate": 8.446745562130178e-06,
"loss": 0.8476,
"step": 106
},
{
"epoch": 0.15828402366863906,
"grad_norm": 0.2431640625,
"learning_rate": 8.431952662721893e-06,
"loss": 0.8373,
"step": 107
},
{
"epoch": 0.15976331360946747,
"grad_norm": 0.287109375,
"learning_rate": 8.417159763313611e-06,
"loss": 0.85,
"step": 108
},
{
"epoch": 0.16124260355029585,
"grad_norm": 0.2470703125,
"learning_rate": 8.402366863905327e-06,
"loss": 0.8027,
"step": 109
},
{
"epoch": 0.16272189349112426,
"grad_norm": 0.2578125,
"learning_rate": 8.387573964497042e-06,
"loss": 0.8286,
"step": 110
},
{
"epoch": 0.16420118343195267,
"grad_norm": 0.26953125,
"learning_rate": 8.372781065088758e-06,
"loss": 0.8317,
"step": 111
},
{
"epoch": 0.16568047337278108,
"grad_norm": 0.251953125,
"learning_rate": 8.357988165680474e-06,
"loss": 0.8442,
"step": 112
},
{
"epoch": 0.16715976331360946,
"grad_norm": 0.2490234375,
"learning_rate": 8.343195266272191e-06,
"loss": 0.8062,
"step": 113
},
{
"epoch": 0.16863905325443787,
"grad_norm": 0.259765625,
"learning_rate": 8.328402366863905e-06,
"loss": 0.7936,
"step": 114
},
{
"epoch": 0.17011834319526628,
"grad_norm": 0.263671875,
"learning_rate": 8.313609467455622e-06,
"loss": 0.7832,
"step": 115
},
{
"epoch": 0.17159763313609466,
"grad_norm": 0.294921875,
"learning_rate": 8.298816568047338e-06,
"loss": 0.8205,
"step": 116
},
{
"epoch": 0.17307692307692307,
"grad_norm": 0.287109375,
"learning_rate": 8.284023668639054e-06,
"loss": 0.8161,
"step": 117
},
{
"epoch": 0.17455621301775148,
"grad_norm": 0.26171875,
"learning_rate": 8.26923076923077e-06,
"loss": 0.7978,
"step": 118
},
{
"epoch": 0.1760355029585799,
"grad_norm": 0.365234375,
"learning_rate": 8.254437869822485e-06,
"loss": 0.8196,
"step": 119
},
{
"epoch": 0.17751479289940827,
"grad_norm": 0.298828125,
"learning_rate": 8.239644970414203e-06,
"loss": 0.8038,
"step": 120
},
{
"epoch": 0.17899408284023668,
"grad_norm": 0.361328125,
"learning_rate": 8.224852071005918e-06,
"loss": 0.8437,
"step": 121
},
{
"epoch": 0.1804733727810651,
"grad_norm": 0.341796875,
"learning_rate": 8.210059171597634e-06,
"loss": 0.8056,
"step": 122
},
{
"epoch": 0.1819526627218935,
"grad_norm": 0.357421875,
"learning_rate": 8.19526627218935e-06,
"loss": 0.7825,
"step": 123
},
{
"epoch": 0.1834319526627219,
"grad_norm": 0.31640625,
"learning_rate": 8.180473372781065e-06,
"loss": 0.804,
"step": 124
},
{
"epoch": 0.1849112426035503,
"grad_norm": 0.31640625,
"learning_rate": 8.165680473372781e-06,
"loss": 0.8087,
"step": 125
},
{
"epoch": 0.1863905325443787,
"grad_norm": 0.271484375,
"learning_rate": 8.150887573964499e-06,
"loss": 0.7488,
"step": 126
},
{
"epoch": 0.1878698224852071,
"grad_norm": 0.36328125,
"learning_rate": 8.136094674556214e-06,
"loss": 0.7905,
"step": 127
},
{
"epoch": 0.1893491124260355,
"grad_norm": 0.30078125,
"learning_rate": 8.12130177514793e-06,
"loss": 0.7792,
"step": 128
},
{
"epoch": 0.1908284023668639,
"grad_norm": 0.2734375,
"learning_rate": 8.106508875739646e-06,
"loss": 0.7933,
"step": 129
},
{
"epoch": 0.19230769230769232,
"grad_norm": 0.2890625,
"learning_rate": 8.091715976331361e-06,
"loss": 0.74,
"step": 130
},
{
"epoch": 0.1937869822485207,
"grad_norm": 0.349609375,
"learning_rate": 8.076923076923077e-06,
"loss": 0.771,
"step": 131
},
{
"epoch": 0.1952662721893491,
"grad_norm": 0.302734375,
"learning_rate": 8.062130177514793e-06,
"loss": 0.7907,
"step": 132
},
{
"epoch": 0.19674556213017752,
"grad_norm": 0.296875,
"learning_rate": 8.04733727810651e-06,
"loss": 0.758,
"step": 133
},
{
"epoch": 0.19822485207100593,
"grad_norm": 0.298828125,
"learning_rate": 8.032544378698226e-06,
"loss": 0.7607,
"step": 134
},
{
"epoch": 0.1997041420118343,
"grad_norm": 0.267578125,
"learning_rate": 8.017751479289941e-06,
"loss": 0.7552,
"step": 135
},
{
"epoch": 0.20118343195266272,
"grad_norm": 0.3515625,
"learning_rate": 8.002958579881657e-06,
"loss": 0.73,
"step": 136
},
{
"epoch": 0.20266272189349113,
"grad_norm": 0.26953125,
"learning_rate": 7.988165680473373e-06,
"loss": 0.7354,
"step": 137
},
{
"epoch": 0.20414201183431951,
"grad_norm": 0.2890625,
"learning_rate": 7.97337278106509e-06,
"loss": 0.7569,
"step": 138
},
{
"epoch": 0.20562130177514792,
"grad_norm": 0.2890625,
"learning_rate": 7.958579881656804e-06,
"loss": 0.736,
"step": 139
},
{
"epoch": 0.20710059171597633,
"grad_norm": 0.2578125,
"learning_rate": 7.943786982248522e-06,
"loss": 0.7334,
"step": 140
},
{
"epoch": 0.20857988165680474,
"grad_norm": 0.267578125,
"learning_rate": 7.928994082840237e-06,
"loss": 0.7484,
"step": 141
},
{
"epoch": 0.21005917159763313,
"grad_norm": 0.279296875,
"learning_rate": 7.914201183431953e-06,
"loss": 0.7334,
"step": 142
},
{
"epoch": 0.21153846153846154,
"grad_norm": 0.279296875,
"learning_rate": 7.89940828402367e-06,
"loss": 0.7489,
"step": 143
},
{
"epoch": 0.21301775147928995,
"grad_norm": 0.26953125,
"learning_rate": 7.884615384615384e-06,
"loss": 0.7366,
"step": 144
},
{
"epoch": 0.21449704142011836,
"grad_norm": 0.29296875,
"learning_rate": 7.869822485207102e-06,
"loss": 0.738,
"step": 145
},
{
"epoch": 0.21597633136094674,
"grad_norm": 0.26953125,
"learning_rate": 7.855029585798818e-06,
"loss": 0.7493,
"step": 146
},
{
"epoch": 0.21745562130177515,
"grad_norm": 0.306640625,
"learning_rate": 7.840236686390533e-06,
"loss": 0.728,
"step": 147
},
{
"epoch": 0.21893491124260356,
"grad_norm": 0.283203125,
"learning_rate": 7.825443786982249e-06,
"loss": 0.7144,
"step": 148
},
{
"epoch": 0.22041420118343194,
"grad_norm": 0.265625,
"learning_rate": 7.810650887573965e-06,
"loss": 0.7199,
"step": 149
},
{
"epoch": 0.22189349112426035,
"grad_norm": 0.279296875,
"learning_rate": 7.795857988165682e-06,
"loss": 0.7147,
"step": 150
},
{
"epoch": 0.22337278106508876,
"grad_norm": 0.2578125,
"learning_rate": 7.781065088757396e-06,
"loss": 0.7019,
"step": 151
},
{
"epoch": 0.22485207100591717,
"grad_norm": 0.3203125,
"learning_rate": 7.766272189349113e-06,
"loss": 0.7455,
"step": 152
},
{
"epoch": 0.22633136094674555,
"grad_norm": 0.298828125,
"learning_rate": 7.751479289940829e-06,
"loss": 0.7111,
"step": 153
},
{
"epoch": 0.22781065088757396,
"grad_norm": 0.4921875,
"learning_rate": 7.736686390532545e-06,
"loss": 0.7537,
"step": 154
},
{
"epoch": 0.22928994082840237,
"grad_norm": 0.25390625,
"learning_rate": 7.72189349112426e-06,
"loss": 0.7252,
"step": 155
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.3515625,
"learning_rate": 7.707100591715976e-06,
"loss": 0.7351,
"step": 156
},
{
"epoch": 0.23224852071005916,
"grad_norm": 0.30859375,
"learning_rate": 7.692307692307694e-06,
"loss": 0.7266,
"step": 157
},
{
"epoch": 0.23372781065088757,
"grad_norm": 0.28125,
"learning_rate": 7.67751479289941e-06,
"loss": 0.7077,
"step": 158
},
{
"epoch": 0.23520710059171598,
"grad_norm": 0.30859375,
"learning_rate": 7.662721893491125e-06,
"loss": 0.7025,
"step": 159
},
{
"epoch": 0.23668639053254437,
"grad_norm": 0.33203125,
"learning_rate": 7.64792899408284e-06,
"loss": 0.7028,
"step": 160
},
{
"epoch": 0.23816568047337278,
"grad_norm": 0.3046875,
"learning_rate": 7.633136094674556e-06,
"loss": 0.7053,
"step": 161
},
{
"epoch": 0.23964497041420119,
"grad_norm": 0.318359375,
"learning_rate": 7.618343195266272e-06,
"loss": 0.6992,
"step": 162
},
{
"epoch": 0.2411242603550296,
"grad_norm": 0.318359375,
"learning_rate": 7.603550295857989e-06,
"loss": 0.6892,
"step": 163
},
{
"epoch": 0.24260355029585798,
"grad_norm": 0.384765625,
"learning_rate": 7.588757396449705e-06,
"loss": 0.7409,
"step": 164
},
{
"epoch": 0.2440828402366864,
"grad_norm": 0.34375,
"learning_rate": 7.573964497041421e-06,
"loss": 0.6857,
"step": 165
},
{
"epoch": 0.2455621301775148,
"grad_norm": 0.294921875,
"learning_rate": 7.559171597633137e-06,
"loss": 0.6914,
"step": 166
},
{
"epoch": 0.2470414201183432,
"grad_norm": 0.283203125,
"learning_rate": 7.544378698224852e-06,
"loss": 0.7089,
"step": 167
},
{
"epoch": 0.2485207100591716,
"grad_norm": 0.33984375,
"learning_rate": 7.529585798816569e-06,
"loss": 0.6887,
"step": 168
},
{
"epoch": 0.25,
"grad_norm": 0.4296875,
"learning_rate": 7.5147928994082845e-06,
"loss": 0.6972,
"step": 169
},
{
"epoch": 0.2514792899408284,
"grad_norm": 0.328125,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6865,
"step": 170
},
{
"epoch": 0.2529585798816568,
"grad_norm": 0.306640625,
"learning_rate": 7.485207100591717e-06,
"loss": 0.6829,
"step": 171
},
{
"epoch": 0.25443786982248523,
"grad_norm": 0.328125,
"learning_rate": 7.4704142011834324e-06,
"loss": 0.7128,
"step": 172
},
{
"epoch": 0.2559171597633136,
"grad_norm": 0.314453125,
"learning_rate": 7.455621301775149e-06,
"loss": 0.7017,
"step": 173
},
{
"epoch": 0.257396449704142,
"grad_norm": 0.310546875,
"learning_rate": 7.440828402366864e-06,
"loss": 0.6775,
"step": 174
},
{
"epoch": 0.2588757396449704,
"grad_norm": 0.29296875,
"learning_rate": 7.42603550295858e-06,
"loss": 0.6938,
"step": 175
},
{
"epoch": 0.2603550295857988,
"grad_norm": 0.30859375,
"learning_rate": 7.411242603550296e-06,
"loss": 0.6828,
"step": 176
},
{
"epoch": 0.2618343195266272,
"grad_norm": 0.40625,
"learning_rate": 7.396449704142013e-06,
"loss": 0.65,
"step": 177
},
{
"epoch": 0.26331360946745563,
"grad_norm": 0.314453125,
"learning_rate": 7.381656804733729e-06,
"loss": 0.6895,
"step": 178
},
{
"epoch": 0.26479289940828404,
"grad_norm": 0.3046875,
"learning_rate": 7.366863905325444e-06,
"loss": 0.6829,
"step": 179
},
{
"epoch": 0.26627218934911245,
"grad_norm": 0.30078125,
"learning_rate": 7.3520710059171605e-06,
"loss": 0.7053,
"step": 180
},
{
"epoch": 0.2677514792899408,
"grad_norm": 0.39453125,
"learning_rate": 7.337278106508876e-06,
"loss": 0.6982,
"step": 181
},
{
"epoch": 0.2692307692307692,
"grad_norm": 0.314453125,
"learning_rate": 7.322485207100593e-06,
"loss": 0.6965,
"step": 182
},
{
"epoch": 0.27071005917159763,
"grad_norm": 0.318359375,
"learning_rate": 7.307692307692308e-06,
"loss": 0.6846,
"step": 183
},
{
"epoch": 0.27218934911242604,
"grad_norm": 0.30078125,
"learning_rate": 7.292899408284024e-06,
"loss": 0.6661,
"step": 184
},
{
"epoch": 0.27366863905325445,
"grad_norm": 0.330078125,
"learning_rate": 7.278106508875741e-06,
"loss": 0.6783,
"step": 185
},
{
"epoch": 0.27514792899408286,
"grad_norm": 0.322265625,
"learning_rate": 7.263313609467456e-06,
"loss": 0.6838,
"step": 186
},
{
"epoch": 0.27662721893491127,
"grad_norm": 0.310546875,
"learning_rate": 7.248520710059173e-06,
"loss": 0.6933,
"step": 187
},
{
"epoch": 0.2781065088757396,
"grad_norm": 0.30078125,
"learning_rate": 7.233727810650888e-06,
"loss": 0.6857,
"step": 188
},
{
"epoch": 0.27958579881656803,
"grad_norm": 0.291015625,
"learning_rate": 7.218934911242604e-06,
"loss": 0.6602,
"step": 189
},
{
"epoch": 0.28106508875739644,
"grad_norm": 0.298828125,
"learning_rate": 7.20414201183432e-06,
"loss": 0.6678,
"step": 190
},
{
"epoch": 0.28254437869822485,
"grad_norm": 0.3125,
"learning_rate": 7.189349112426036e-06,
"loss": 0.6593,
"step": 191
},
{
"epoch": 0.28402366863905326,
"grad_norm": 0.314453125,
"learning_rate": 7.1745562130177515e-06,
"loss": 0.6704,
"step": 192
},
{
"epoch": 0.28550295857988167,
"grad_norm": 0.439453125,
"learning_rate": 7.159763313609468e-06,
"loss": 0.6192,
"step": 193
},
{
"epoch": 0.2869822485207101,
"grad_norm": 0.3046875,
"learning_rate": 7.1449704142011845e-06,
"loss": 0.6726,
"step": 194
},
{
"epoch": 0.28846153846153844,
"grad_norm": 0.388671875,
"learning_rate": 7.130177514792899e-06,
"loss": 0.6707,
"step": 195
},
{
"epoch": 0.28994082840236685,
"grad_norm": 0.357421875,
"learning_rate": 7.115384615384616e-06,
"loss": 0.6881,
"step": 196
},
{
"epoch": 0.29142011834319526,
"grad_norm": 0.328125,
"learning_rate": 7.100591715976332e-06,
"loss": 0.6747,
"step": 197
},
{
"epoch": 0.29289940828402367,
"grad_norm": 0.322265625,
"learning_rate": 7.085798816568048e-06,
"loss": 0.6782,
"step": 198
},
{
"epoch": 0.2943786982248521,
"grad_norm": 0.29296875,
"learning_rate": 7.071005917159763e-06,
"loss": 0.6684,
"step": 199
},
{
"epoch": 0.2958579881656805,
"grad_norm": 0.328125,
"learning_rate": 7.0562130177514796e-06,
"loss": 0.6706,
"step": 200
},
{
"epoch": 0.2973372781065089,
"grad_norm": 0.328125,
"learning_rate": 7.041420118343196e-06,
"loss": 0.6579,
"step": 201
},
{
"epoch": 0.2988165680473373,
"grad_norm": 0.404296875,
"learning_rate": 7.026627218934912e-06,
"loss": 0.6951,
"step": 202
},
{
"epoch": 0.30029585798816566,
"grad_norm": 0.32421875,
"learning_rate": 7.011834319526628e-06,
"loss": 0.6768,
"step": 203
},
{
"epoch": 0.30177514792899407,
"grad_norm": 0.37109375,
"learning_rate": 6.997041420118343e-06,
"loss": 0.6589,
"step": 204
},
{
"epoch": 0.3032544378698225,
"grad_norm": 0.37890625,
"learning_rate": 6.98224852071006e-06,
"loss": 0.6396,
"step": 205
},
{
"epoch": 0.3047337278106509,
"grad_norm": 0.349609375,
"learning_rate": 6.9674556213017754e-06,
"loss": 0.6342,
"step": 206
},
{
"epoch": 0.3062130177514793,
"grad_norm": 0.322265625,
"learning_rate": 6.952662721893492e-06,
"loss": 0.6468,
"step": 207
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.333984375,
"learning_rate": 6.9378698224852085e-06,
"loss": 0.6714,
"step": 208
},
{
"epoch": 0.3091715976331361,
"grad_norm": 0.328125,
"learning_rate": 6.923076923076923e-06,
"loss": 0.6485,
"step": 209
},
{
"epoch": 0.3106508875739645,
"grad_norm": 0.34375,
"learning_rate": 6.90828402366864e-06,
"loss": 0.6593,
"step": 210
},
{
"epoch": 0.3121301775147929,
"grad_norm": 0.353515625,
"learning_rate": 6.893491124260356e-06,
"loss": 0.6596,
"step": 211
},
{
"epoch": 0.3136094674556213,
"grad_norm": 0.279296875,
"learning_rate": 6.878698224852071e-06,
"loss": 0.6367,
"step": 212
},
{
"epoch": 0.3150887573964497,
"grad_norm": 0.33203125,
"learning_rate": 6.863905325443787e-06,
"loss": 0.6649,
"step": 213
},
{
"epoch": 0.3165680473372781,
"grad_norm": 0.32421875,
"learning_rate": 6.8491124260355036e-06,
"loss": 0.673,
"step": 214
},
{
"epoch": 0.3180473372781065,
"grad_norm": 0.310546875,
"learning_rate": 6.83431952662722e-06,
"loss": 0.6336,
"step": 215
},
{
"epoch": 0.31952662721893493,
"grad_norm": 0.330078125,
"learning_rate": 6.819526627218935e-06,
"loss": 0.642,
"step": 216
},
{
"epoch": 0.3210059171597633,
"grad_norm": 0.423828125,
"learning_rate": 6.8047337278106515e-06,
"loss": 0.6383,
"step": 217
},
{
"epoch": 0.3224852071005917,
"grad_norm": 0.3203125,
"learning_rate": 6.789940828402367e-06,
"loss": 0.6429,
"step": 218
},
{
"epoch": 0.3239644970414201,
"grad_norm": 0.34765625,
"learning_rate": 6.775147928994084e-06,
"loss": 0.6526,
"step": 219
},
{
"epoch": 0.3254437869822485,
"grad_norm": 0.421875,
"learning_rate": 6.760355029585799e-06,
"loss": 0.6615,
"step": 220
},
{
"epoch": 0.3269230769230769,
"grad_norm": 0.4296875,
"learning_rate": 6.745562130177515e-06,
"loss": 0.6427,
"step": 221
},
{
"epoch": 0.32840236686390534,
"grad_norm": 0.3203125,
"learning_rate": 6.730769230769232e-06,
"loss": 0.6501,
"step": 222
},
{
"epoch": 0.32988165680473375,
"grad_norm": 0.34765625,
"learning_rate": 6.715976331360947e-06,
"loss": 0.6404,
"step": 223
},
{
"epoch": 0.33136094674556216,
"grad_norm": 0.318359375,
"learning_rate": 6.701183431952664e-06,
"loss": 0.6448,
"step": 224
},
{
"epoch": 0.3328402366863905,
"grad_norm": 0.30859375,
"learning_rate": 6.686390532544379e-06,
"loss": 0.6687,
"step": 225
},
{
"epoch": 0.3343195266272189,
"grad_norm": 0.302734375,
"learning_rate": 6.671597633136095e-06,
"loss": 0.6425,
"step": 226
},
{
"epoch": 0.33579881656804733,
"grad_norm": 0.296875,
"learning_rate": 6.656804733727811e-06,
"loss": 0.6159,
"step": 227
},
{
"epoch": 0.33727810650887574,
"grad_norm": 0.310546875,
"learning_rate": 6.6420118343195276e-06,
"loss": 0.6343,
"step": 228
},
{
"epoch": 0.33875739644970415,
"grad_norm": 0.3125,
"learning_rate": 6.627218934911244e-06,
"loss": 0.6225,
"step": 229
},
{
"epoch": 0.34023668639053256,
"grad_norm": 0.3125,
"learning_rate": 6.612426035502959e-06,
"loss": 0.6291,
"step": 230
},
{
"epoch": 0.34171597633136097,
"grad_norm": 0.3125,
"learning_rate": 6.5976331360946755e-06,
"loss": 0.6427,
"step": 231
},
{
"epoch": 0.3431952662721893,
"grad_norm": 0.298828125,
"learning_rate": 6.582840236686391e-06,
"loss": 0.648,
"step": 232
},
{
"epoch": 0.34467455621301774,
"grad_norm": 0.29296875,
"learning_rate": 6.568047337278107e-06,
"loss": 0.6437,
"step": 233
},
{
"epoch": 0.34615384615384615,
"grad_norm": 0.322265625,
"learning_rate": 6.553254437869823e-06,
"loss": 0.6264,
"step": 234
},
{
"epoch": 0.34763313609467456,
"grad_norm": 0.328125,
"learning_rate": 6.538461538461539e-06,
"loss": 0.637,
"step": 235
},
{
"epoch": 0.34911242603550297,
"grad_norm": 0.333984375,
"learning_rate": 6.523668639053255e-06,
"loss": 0.6304,
"step": 236
},
{
"epoch": 0.3505917159763314,
"grad_norm": 0.310546875,
"learning_rate": 6.5088757396449705e-06,
"loss": 0.6199,
"step": 237
},
{
"epoch": 0.3520710059171598,
"grad_norm": 0.30078125,
"learning_rate": 6.494082840236687e-06,
"loss": 0.6256,
"step": 238
},
{
"epoch": 0.35355029585798814,
"grad_norm": 0.30078125,
"learning_rate": 6.479289940828403e-06,
"loss": 0.6254,
"step": 239
},
{
"epoch": 0.35502958579881655,
"grad_norm": 0.37109375,
"learning_rate": 6.464497041420119e-06,
"loss": 0.6292,
"step": 240
},
{
"epoch": 0.35650887573964496,
"grad_norm": 0.44140625,
"learning_rate": 6.449704142011834e-06,
"loss": 0.637,
"step": 241
},
{
"epoch": 0.35798816568047337,
"grad_norm": 0.3046875,
"learning_rate": 6.434911242603551e-06,
"loss": 0.6202,
"step": 242
},
{
"epoch": 0.3594674556213018,
"grad_norm": 0.30859375,
"learning_rate": 6.420118343195266e-06,
"loss": 0.6164,
"step": 243
},
{
"epoch": 0.3609467455621302,
"grad_norm": 0.330078125,
"learning_rate": 6.405325443786983e-06,
"loss": 0.6334,
"step": 244
},
{
"epoch": 0.3624260355029586,
"grad_norm": 0.416015625,
"learning_rate": 6.3905325443786995e-06,
"loss": 0.6492,
"step": 245
},
{
"epoch": 0.363905325443787,
"grad_norm": 0.404296875,
"learning_rate": 6.375739644970414e-06,
"loss": 0.643,
"step": 246
},
{
"epoch": 0.36538461538461536,
"grad_norm": 0.33203125,
"learning_rate": 6.360946745562131e-06,
"loss": 0.6303,
"step": 247
},
{
"epoch": 0.3668639053254438,
"grad_norm": 0.345703125,
"learning_rate": 6.3461538461538466e-06,
"loss": 0.6305,
"step": 248
},
{
"epoch": 0.3683431952662722,
"grad_norm": 0.32421875,
"learning_rate": 6.331360946745563e-06,
"loss": 0.6165,
"step": 249
},
{
"epoch": 0.3698224852071006,
"grad_norm": 0.31640625,
"learning_rate": 6.316568047337278e-06,
"loss": 0.6329,
"step": 250
},
{
"epoch": 0.371301775147929,
"grad_norm": 0.423828125,
"learning_rate": 6.3017751479289945e-06,
"loss": 0.6441,
"step": 251
},
{
"epoch": 0.3727810650887574,
"grad_norm": 0.353515625,
"learning_rate": 6.286982248520711e-06,
"loss": 0.6525,
"step": 252
},
{
"epoch": 0.3742603550295858,
"grad_norm": 0.294921875,
"learning_rate": 6.272189349112427e-06,
"loss": 0.6411,
"step": 253
},
{
"epoch": 0.3757396449704142,
"grad_norm": 0.33203125,
"learning_rate": 6.2573964497041425e-06,
"loss": 0.6141,
"step": 254
},
{
"epoch": 0.3772189349112426,
"grad_norm": 0.318359375,
"learning_rate": 6.242603550295858e-06,
"loss": 0.6489,
"step": 255
},
{
"epoch": 0.378698224852071,
"grad_norm": 0.3359375,
"learning_rate": 6.227810650887575e-06,
"loss": 0.5926,
"step": 256
},
{
"epoch": 0.3801775147928994,
"grad_norm": 0.302734375,
"learning_rate": 6.21301775147929e-06,
"loss": 0.6306,
"step": 257
},
{
"epoch": 0.3816568047337278,
"grad_norm": 0.3125,
"learning_rate": 6.198224852071006e-06,
"loss": 0.6222,
"step": 258
},
{
"epoch": 0.3831360946745562,
"grad_norm": 0.30859375,
"learning_rate": 6.183431952662723e-06,
"loss": 0.6073,
"step": 259
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.33203125,
"learning_rate": 6.168639053254438e-06,
"loss": 0.6484,
"step": 260
},
{
"epoch": 0.386094674556213,
"grad_norm": 0.31640625,
"learning_rate": 6.153846153846155e-06,
"loss": 0.6061,
"step": 261
},
{
"epoch": 0.3875739644970414,
"grad_norm": 0.341796875,
"learning_rate": 6.13905325443787e-06,
"loss": 0.6175,
"step": 262
},
{
"epoch": 0.3890532544378698,
"grad_norm": 0.330078125,
"learning_rate": 6.124260355029586e-06,
"loss": 0.6283,
"step": 263
},
{
"epoch": 0.3905325443786982,
"grad_norm": 0.34375,
"learning_rate": 6.109467455621302e-06,
"loss": 0.6029,
"step": 264
},
{
"epoch": 0.39201183431952663,
"grad_norm": 0.333984375,
"learning_rate": 6.0946745562130185e-06,
"loss": 0.624,
"step": 265
},
{
"epoch": 0.39349112426035504,
"grad_norm": 0.373046875,
"learning_rate": 6.079881656804735e-06,
"loss": 0.6114,
"step": 266
},
{
"epoch": 0.39497041420118345,
"grad_norm": 0.365234375,
"learning_rate": 6.06508875739645e-06,
"loss": 0.6175,
"step": 267
},
{
"epoch": 0.39644970414201186,
"grad_norm": 0.39453125,
"learning_rate": 6.0502958579881664e-06,
"loss": 0.617,
"step": 268
},
{
"epoch": 0.3979289940828402,
"grad_norm": 0.3046875,
"learning_rate": 6.035502958579882e-06,
"loss": 0.6179,
"step": 269
},
{
"epoch": 0.3994082840236686,
"grad_norm": 0.341796875,
"learning_rate": 6.020710059171599e-06,
"loss": 0.6236,
"step": 270
},
{
"epoch": 0.40088757396449703,
"grad_norm": 0.310546875,
"learning_rate": 6.0059171597633135e-06,
"loss": 0.6317,
"step": 271
},
{
"epoch": 0.40236686390532544,
"grad_norm": 0.423828125,
"learning_rate": 5.99112426035503e-06,
"loss": 0.6417,
"step": 272
},
{
"epoch": 0.40384615384615385,
"grad_norm": 0.322265625,
"learning_rate": 5.976331360946747e-06,
"loss": 0.6208,
"step": 273
},
{
"epoch": 0.40532544378698226,
"grad_norm": 0.3203125,
"learning_rate": 5.961538461538462e-06,
"loss": 0.6333,
"step": 274
},
{
"epoch": 0.4068047337278107,
"grad_norm": 0.3203125,
"learning_rate": 5.946745562130178e-06,
"loss": 0.6247,
"step": 275
},
{
"epoch": 0.40828402366863903,
"grad_norm": 0.330078125,
"learning_rate": 5.931952662721894e-06,
"loss": 0.6073,
"step": 276
},
{
"epoch": 0.40976331360946744,
"grad_norm": 0.314453125,
"learning_rate": 5.91715976331361e-06,
"loss": 0.5878,
"step": 277
},
{
"epoch": 0.41124260355029585,
"grad_norm": 0.361328125,
"learning_rate": 5.902366863905326e-06,
"loss": 0.6207,
"step": 278
},
{
"epoch": 0.41272189349112426,
"grad_norm": 0.3515625,
"learning_rate": 5.887573964497042e-06,
"loss": 0.607,
"step": 279
},
{
"epoch": 0.41420118343195267,
"grad_norm": 0.322265625,
"learning_rate": 5.872781065088757e-06,
"loss": 0.6165,
"step": 280
},
{
"epoch": 0.4156804733727811,
"grad_norm": 0.349609375,
"learning_rate": 5.857988165680474e-06,
"loss": 0.6268,
"step": 281
},
{
"epoch": 0.4171597633136095,
"grad_norm": 0.37109375,
"learning_rate": 5.8431952662721904e-06,
"loss": 0.6327,
"step": 282
},
{
"epoch": 0.41863905325443784,
"grad_norm": 0.41796875,
"learning_rate": 5.828402366863905e-06,
"loss": 0.6148,
"step": 283
},
{
"epoch": 0.42011834319526625,
"grad_norm": 0.369140625,
"learning_rate": 5.813609467455622e-06,
"loss": 0.6173,
"step": 284
},
{
"epoch": 0.42159763313609466,
"grad_norm": 0.38671875,
"learning_rate": 5.7988165680473375e-06,
"loss": 0.6027,
"step": 285
},
{
"epoch": 0.4230769230769231,
"grad_norm": 0.369140625,
"learning_rate": 5.784023668639054e-06,
"loss": 0.6129,
"step": 286
},
{
"epoch": 0.4245562130177515,
"grad_norm": 0.337890625,
"learning_rate": 5.769230769230769e-06,
"loss": 0.6279,
"step": 287
},
{
"epoch": 0.4260355029585799,
"grad_norm": 0.357421875,
"learning_rate": 5.7544378698224855e-06,
"loss": 0.5993,
"step": 288
},
{
"epoch": 0.4275147928994083,
"grad_norm": 0.380859375,
"learning_rate": 5.739644970414202e-06,
"loss": 0.5982,
"step": 289
},
{
"epoch": 0.4289940828402367,
"grad_norm": 0.400390625,
"learning_rate": 5.724852071005918e-06,
"loss": 0.6081,
"step": 290
},
{
"epoch": 0.43047337278106507,
"grad_norm": 0.474609375,
"learning_rate": 5.710059171597634e-06,
"loss": 0.5879,
"step": 291
},
{
"epoch": 0.4319526627218935,
"grad_norm": 0.376953125,
"learning_rate": 5.695266272189349e-06,
"loss": 0.5918,
"step": 292
},
{
"epoch": 0.4334319526627219,
"grad_norm": 0.337890625,
"learning_rate": 5.680473372781066e-06,
"loss": 0.608,
"step": 293
},
{
"epoch": 0.4349112426035503,
"grad_norm": 0.3515625,
"learning_rate": 5.665680473372781e-06,
"loss": 0.6122,
"step": 294
},
{
"epoch": 0.4363905325443787,
"grad_norm": 0.34765625,
"learning_rate": 5.650887573964498e-06,
"loss": 0.6209,
"step": 295
},
{
"epoch": 0.4378698224852071,
"grad_norm": 0.33203125,
"learning_rate": 5.636094674556214e-06,
"loss": 0.6056,
"step": 296
},
{
"epoch": 0.4393491124260355,
"grad_norm": 0.3125,
"learning_rate": 5.621301775147929e-06,
"loss": 0.6166,
"step": 297
},
{
"epoch": 0.4408284023668639,
"grad_norm": 0.3359375,
"learning_rate": 5.606508875739646e-06,
"loss": 0.6296,
"step": 298
},
{
"epoch": 0.4423076923076923,
"grad_norm": 0.35546875,
"learning_rate": 5.591715976331361e-06,
"loss": 0.618,
"step": 299
},
{
"epoch": 0.4437869822485207,
"grad_norm": 0.421875,
"learning_rate": 5.576923076923077e-06,
"loss": 0.598,
"step": 300
},
{
"epoch": 0.4452662721893491,
"grad_norm": 0.3203125,
"learning_rate": 5.562130177514793e-06,
"loss": 0.5973,
"step": 301
},
{
"epoch": 0.4467455621301775,
"grad_norm": 0.3125,
"learning_rate": 5.5473372781065095e-06,
"loss": 0.6228,
"step": 302
},
{
"epoch": 0.44822485207100593,
"grad_norm": 0.30859375,
"learning_rate": 5.532544378698226e-06,
"loss": 0.611,
"step": 303
},
{
"epoch": 0.44970414201183434,
"grad_norm": 0.30859375,
"learning_rate": 5.517751479289941e-06,
"loss": 0.6094,
"step": 304
},
{
"epoch": 0.4511834319526627,
"grad_norm": 0.349609375,
"learning_rate": 5.502958579881657e-06,
"loss": 0.6156,
"step": 305
},
{
"epoch": 0.4526627218934911,
"grad_norm": 0.328125,
"learning_rate": 5.488165680473373e-06,
"loss": 0.6077,
"step": 306
},
{
"epoch": 0.4541420118343195,
"grad_norm": 0.359375,
"learning_rate": 5.47337278106509e-06,
"loss": 0.5891,
"step": 307
},
{
"epoch": 0.4556213017751479,
"grad_norm": 0.349609375,
"learning_rate": 5.4585798816568045e-06,
"loss": 0.6089,
"step": 308
},
{
"epoch": 0.45710059171597633,
"grad_norm": 0.337890625,
"learning_rate": 5.443786982248521e-06,
"loss": 0.5967,
"step": 309
},
{
"epoch": 0.45857988165680474,
"grad_norm": 0.333984375,
"learning_rate": 5.4289940828402376e-06,
"loss": 0.5958,
"step": 310
},
{
"epoch": 0.46005917159763315,
"grad_norm": 0.33984375,
"learning_rate": 5.414201183431953e-06,
"loss": 0.5989,
"step": 311
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.3359375,
"learning_rate": 5.39940828402367e-06,
"loss": 0.588,
"step": 312
},
{
"epoch": 0.4630177514792899,
"grad_norm": 0.32421875,
"learning_rate": 5.384615384615385e-06,
"loss": 0.6028,
"step": 313
},
{
"epoch": 0.46449704142011833,
"grad_norm": 0.34375,
"learning_rate": 5.369822485207101e-06,
"loss": 0.5966,
"step": 314
},
{
"epoch": 0.46597633136094674,
"grad_norm": 0.3359375,
"learning_rate": 5.355029585798817e-06,
"loss": 0.5906,
"step": 315
},
{
"epoch": 0.46745562130177515,
"grad_norm": 0.298828125,
"learning_rate": 5.3402366863905334e-06,
"loss": 0.6084,
"step": 316
},
{
"epoch": 0.46893491124260356,
"grad_norm": 0.32421875,
"learning_rate": 5.325443786982249e-06,
"loss": 0.6294,
"step": 317
},
{
"epoch": 0.47041420118343197,
"grad_norm": 0.36328125,
"learning_rate": 5.310650887573965e-06,
"loss": 0.5992,
"step": 318
},
{
"epoch": 0.4718934911242604,
"grad_norm": 0.328125,
"learning_rate": 5.295857988165681e-06,
"loss": 0.6162,
"step": 319
},
{
"epoch": 0.47337278106508873,
"grad_norm": 0.36328125,
"learning_rate": 5.281065088757396e-06,
"loss": 0.5896,
"step": 320
},
{
"epoch": 0.47485207100591714,
"grad_norm": 0.33203125,
"learning_rate": 5.266272189349113e-06,
"loss": 0.6053,
"step": 321
},
{
"epoch": 0.47633136094674555,
"grad_norm": 0.330078125,
"learning_rate": 5.2514792899408285e-06,
"loss": 0.5949,
"step": 322
},
{
"epoch": 0.47781065088757396,
"grad_norm": 0.349609375,
"learning_rate": 5.236686390532545e-06,
"loss": 0.6179,
"step": 323
},
{
"epoch": 0.47928994082840237,
"grad_norm": 0.3515625,
"learning_rate": 5.22189349112426e-06,
"loss": 0.5907,
"step": 324
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.380859375,
"learning_rate": 5.207100591715976e-06,
"loss": 0.5961,
"step": 325
},
{
"epoch": 0.4822485207100592,
"grad_norm": 0.314453125,
"learning_rate": 5.192307692307693e-06,
"loss": 0.6169,
"step": 326
},
{
"epoch": 0.48372781065088755,
"grad_norm": 0.318359375,
"learning_rate": 5.177514792899409e-06,
"loss": 0.6008,
"step": 327
},
{
"epoch": 0.48520710059171596,
"grad_norm": 0.345703125,
"learning_rate": 5.162721893491125e-06,
"loss": 0.5942,
"step": 328
},
{
"epoch": 0.48668639053254437,
"grad_norm": 0.359375,
"learning_rate": 5.14792899408284e-06,
"loss": 0.5903,
"step": 329
},
{
"epoch": 0.4881656804733728,
"grad_norm": 0.337890625,
"learning_rate": 5.133136094674557e-06,
"loss": 0.5754,
"step": 330
},
{
"epoch": 0.4896449704142012,
"grad_norm": 0.3125,
"learning_rate": 5.118343195266272e-06,
"loss": 0.5818,
"step": 331
},
{
"epoch": 0.4911242603550296,
"grad_norm": 0.326171875,
"learning_rate": 5.103550295857989e-06,
"loss": 0.5918,
"step": 332
},
{
"epoch": 0.492603550295858,
"grad_norm": 0.41796875,
"learning_rate": 5.088757396449705e-06,
"loss": 0.6153,
"step": 333
},
{
"epoch": 0.4940828402366864,
"grad_norm": 0.3125,
"learning_rate": 5.07396449704142e-06,
"loss": 0.6038,
"step": 334
},
{
"epoch": 0.49556213017751477,
"grad_norm": 0.3046875,
"learning_rate": 5.059171597633137e-06,
"loss": 0.5753,
"step": 335
},
{
"epoch": 0.4970414201183432,
"grad_norm": 0.345703125,
"learning_rate": 5.0443786982248525e-06,
"loss": 0.5963,
"step": 336
},
{
"epoch": 0.4985207100591716,
"grad_norm": 0.34765625,
"learning_rate": 5.029585798816569e-06,
"loss": 0.585,
"step": 337
},
{
"epoch": 0.5,
"grad_norm": 0.322265625,
"learning_rate": 5.014792899408284e-06,
"loss": 0.5888,
"step": 338
},
{
"epoch": 0.5014792899408284,
"grad_norm": 0.353515625,
"learning_rate": 5e-06,
"loss": 0.5802,
"step": 339
},
{
"epoch": 0.5029585798816568,
"grad_norm": 0.32421875,
"learning_rate": 4.985207100591716e-06,
"loss": 0.578,
"step": 340
},
{
"epoch": 0.5044378698224852,
"grad_norm": 0.31640625,
"learning_rate": 4.970414201183432e-06,
"loss": 0.5865,
"step": 341
},
{
"epoch": 0.5059171597633136,
"grad_norm": 0.32421875,
"learning_rate": 4.955621301775148e-06,
"loss": 0.5892,
"step": 342
},
{
"epoch": 0.507396449704142,
"grad_norm": 0.365234375,
"learning_rate": 4.940828402366865e-06,
"loss": 0.5974,
"step": 343
},
{
"epoch": 0.5088757396449705,
"grad_norm": 0.427734375,
"learning_rate": 4.926035502958581e-06,
"loss": 0.5886,
"step": 344
},
{
"epoch": 0.5103550295857988,
"grad_norm": 0.322265625,
"learning_rate": 4.911242603550296e-06,
"loss": 0.5863,
"step": 345
},
{
"epoch": 0.5118343195266272,
"grad_norm": 0.353515625,
"learning_rate": 4.896449704142012e-06,
"loss": 0.6069,
"step": 346
},
{
"epoch": 0.5133136094674556,
"grad_norm": 0.359375,
"learning_rate": 4.8816568047337285e-06,
"loss": 0.6,
"step": 347
},
{
"epoch": 0.514792899408284,
"grad_norm": 0.322265625,
"learning_rate": 4.866863905325444e-06,
"loss": 0.5811,
"step": 348
},
{
"epoch": 0.5162721893491125,
"grad_norm": 0.380859375,
"learning_rate": 4.85207100591716e-06,
"loss": 0.597,
"step": 349
},
{
"epoch": 0.5177514792899408,
"grad_norm": 0.40234375,
"learning_rate": 4.837278106508876e-06,
"loss": 0.5923,
"step": 350
},
{
"epoch": 0.5192307692307693,
"grad_norm": 0.34765625,
"learning_rate": 4.822485207100592e-06,
"loss": 0.5851,
"step": 351
},
{
"epoch": 0.5207100591715976,
"grad_norm": 0.34765625,
"learning_rate": 4.807692307692308e-06,
"loss": 0.5761,
"step": 352
},
{
"epoch": 0.522189349112426,
"grad_norm": 0.384765625,
"learning_rate": 4.792899408284024e-06,
"loss": 0.5885,
"step": 353
},
{
"epoch": 0.5236686390532544,
"grad_norm": 0.341796875,
"learning_rate": 4.77810650887574e-06,
"loss": 0.5982,
"step": 354
},
{
"epoch": 0.5251479289940828,
"grad_norm": 0.330078125,
"learning_rate": 4.763313609467456e-06,
"loss": 0.6099,
"step": 355
},
{
"epoch": 0.5266272189349113,
"grad_norm": 0.373046875,
"learning_rate": 4.7485207100591715e-06,
"loss": 0.606,
"step": 356
},
{
"epoch": 0.5281065088757396,
"grad_norm": 0.412109375,
"learning_rate": 4.733727810650888e-06,
"loss": 0.6068,
"step": 357
},
{
"epoch": 0.5295857988165681,
"grad_norm": 0.34765625,
"learning_rate": 4.718934911242605e-06,
"loss": 0.5901,
"step": 358
},
{
"epoch": 0.5310650887573964,
"grad_norm": 0.345703125,
"learning_rate": 4.70414201183432e-06,
"loss": 0.6075,
"step": 359
},
{
"epoch": 0.5325443786982249,
"grad_norm": 0.353515625,
"learning_rate": 4.689349112426036e-06,
"loss": 0.5928,
"step": 360
},
{
"epoch": 0.5340236686390533,
"grad_norm": 0.369140625,
"learning_rate": 4.674556213017752e-06,
"loss": 0.6098,
"step": 361
},
{
"epoch": 0.5355029585798816,
"grad_norm": 0.365234375,
"learning_rate": 4.659763313609467e-06,
"loss": 0.6056,
"step": 362
},
{
"epoch": 0.5369822485207101,
"grad_norm": 0.341796875,
"learning_rate": 4.644970414201184e-06,
"loss": 0.6067,
"step": 363
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.365234375,
"learning_rate": 4.6301775147929e-06,
"loss": 0.6019,
"step": 364
},
{
"epoch": 0.5399408284023669,
"grad_norm": 0.455078125,
"learning_rate": 4.615384615384616e-06,
"loss": 0.5756,
"step": 365
},
{
"epoch": 0.5414201183431953,
"grad_norm": 0.375,
"learning_rate": 4.600591715976332e-06,
"loss": 0.5908,
"step": 366
},
{
"epoch": 0.5428994082840237,
"grad_norm": 0.330078125,
"learning_rate": 4.5857988165680475e-06,
"loss": 0.5841,
"step": 367
},
{
"epoch": 0.5443786982248521,
"grad_norm": 0.375,
"learning_rate": 4.571005917159764e-06,
"loss": 0.5753,
"step": 368
},
{
"epoch": 0.5458579881656804,
"grad_norm": 0.34765625,
"learning_rate": 4.55621301775148e-06,
"loss": 0.5663,
"step": 369
},
{
"epoch": 0.5473372781065089,
"grad_norm": 0.408203125,
"learning_rate": 4.5414201183431955e-06,
"loss": 0.6038,
"step": 370
},
{
"epoch": 0.5488165680473372,
"grad_norm": 0.32421875,
"learning_rate": 4.526627218934911e-06,
"loss": 0.5855,
"step": 371
},
{
"epoch": 0.5502958579881657,
"grad_norm": 0.328125,
"learning_rate": 4.511834319526628e-06,
"loss": 0.6029,
"step": 372
},
{
"epoch": 0.5517751479289941,
"grad_norm": 0.359375,
"learning_rate": 4.497041420118343e-06,
"loss": 0.5934,
"step": 373
},
{
"epoch": 0.5532544378698225,
"grad_norm": 0.3671875,
"learning_rate": 4.48224852071006e-06,
"loss": 0.5672,
"step": 374
},
{
"epoch": 0.5547337278106509,
"grad_norm": 0.341796875,
"learning_rate": 4.467455621301776e-06,
"loss": 0.5848,
"step": 375
},
{
"epoch": 0.5562130177514792,
"grad_norm": 0.328125,
"learning_rate": 4.452662721893491e-06,
"loss": 0.6083,
"step": 376
},
{
"epoch": 0.5576923076923077,
"grad_norm": 0.53125,
"learning_rate": 4.437869822485207e-06,
"loss": 0.579,
"step": 377
},
{
"epoch": 0.5591715976331361,
"grad_norm": 0.359375,
"learning_rate": 4.423076923076924e-06,
"loss": 0.5701,
"step": 378
},
{
"epoch": 0.5606508875739645,
"grad_norm": 0.3515625,
"learning_rate": 4.408284023668639e-06,
"loss": 0.5797,
"step": 379
},
{
"epoch": 0.5621301775147929,
"grad_norm": 0.369140625,
"learning_rate": 4.393491124260356e-06,
"loss": 0.5945,
"step": 380
},
{
"epoch": 0.5636094674556213,
"grad_norm": 0.640625,
"learning_rate": 4.3786982248520715e-06,
"loss": 0.5777,
"step": 381
},
{
"epoch": 0.5650887573964497,
"grad_norm": 0.35546875,
"learning_rate": 4.363905325443787e-06,
"loss": 0.5934,
"step": 382
},
{
"epoch": 0.5665680473372781,
"grad_norm": 0.400390625,
"learning_rate": 4.349112426035503e-06,
"loss": 0.5951,
"step": 383
},
{
"epoch": 0.5680473372781065,
"grad_norm": 0.37890625,
"learning_rate": 4.3343195266272195e-06,
"loss": 0.5683,
"step": 384
},
{
"epoch": 0.5695266272189349,
"grad_norm": 0.314453125,
"learning_rate": 4.319526627218935e-06,
"loss": 0.5823,
"step": 385
},
{
"epoch": 0.5710059171597633,
"grad_norm": 0.353515625,
"learning_rate": 4.304733727810651e-06,
"loss": 0.5814,
"step": 386
},
{
"epoch": 0.5724852071005917,
"grad_norm": 0.416015625,
"learning_rate": 4.289940828402367e-06,
"loss": 0.5872,
"step": 387
},
{
"epoch": 0.5739644970414202,
"grad_norm": 0.39453125,
"learning_rate": 4.275147928994083e-06,
"loss": 0.5858,
"step": 388
},
{
"epoch": 0.5754437869822485,
"grad_norm": 0.3203125,
"learning_rate": 4.2603550295858e-06,
"loss": 0.5755,
"step": 389
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.33984375,
"learning_rate": 4.245562130177515e-06,
"loss": 0.5867,
"step": 390
},
{
"epoch": 0.5784023668639053,
"grad_norm": 0.345703125,
"learning_rate": 4.230769230769231e-06,
"loss": 0.5669,
"step": 391
},
{
"epoch": 0.5798816568047337,
"grad_norm": 0.353515625,
"learning_rate": 4.215976331360947e-06,
"loss": 0.5628,
"step": 392
},
{
"epoch": 0.5813609467455622,
"grad_norm": 0.3359375,
"learning_rate": 4.201183431952663e-06,
"loss": 0.5684,
"step": 393
},
{
"epoch": 0.5828402366863905,
"grad_norm": 0.3359375,
"learning_rate": 4.186390532544379e-06,
"loss": 0.56,
"step": 394
},
{
"epoch": 0.584319526627219,
"grad_norm": 0.34765625,
"learning_rate": 4.1715976331360955e-06,
"loss": 0.5612,
"step": 395
},
{
"epoch": 0.5857988165680473,
"grad_norm": 0.32421875,
"learning_rate": 4.156804733727811e-06,
"loss": 0.5838,
"step": 396
},
{
"epoch": 0.5872781065088757,
"grad_norm": 0.330078125,
"learning_rate": 4.142011834319527e-06,
"loss": 0.5791,
"step": 397
},
{
"epoch": 0.5887573964497042,
"grad_norm": 0.37890625,
"learning_rate": 4.127218934911243e-06,
"loss": 0.5863,
"step": 398
},
{
"epoch": 0.5902366863905325,
"grad_norm": 0.30859375,
"learning_rate": 4.112426035502959e-06,
"loss": 0.5734,
"step": 399
},
{
"epoch": 0.591715976331361,
"grad_norm": 0.30859375,
"learning_rate": 4.097633136094675e-06,
"loss": 0.563,
"step": 400
},
{
"epoch": 0.5931952662721893,
"grad_norm": 0.345703125,
"learning_rate": 4.0828402366863906e-06,
"loss": 0.5916,
"step": 401
},
{
"epoch": 0.5946745562130178,
"grad_norm": 0.3203125,
"learning_rate": 4.068047337278107e-06,
"loss": 0.5911,
"step": 402
},
{
"epoch": 0.5961538461538461,
"grad_norm": 0.412109375,
"learning_rate": 4.053254437869823e-06,
"loss": 0.5944,
"step": 403
},
{
"epoch": 0.5976331360946746,
"grad_norm": 0.31640625,
"learning_rate": 4.0384615384615385e-06,
"loss": 0.5864,
"step": 404
},
{
"epoch": 0.599112426035503,
"grad_norm": 0.32421875,
"learning_rate": 4.023668639053255e-06,
"loss": 0.5917,
"step": 405
},
{
"epoch": 0.6005917159763313,
"grad_norm": 0.32421875,
"learning_rate": 4.008875739644971e-06,
"loss": 0.5621,
"step": 406
},
{
"epoch": 0.6020710059171598,
"grad_norm": 0.328125,
"learning_rate": 3.9940828402366864e-06,
"loss": 0.5806,
"step": 407
},
{
"epoch": 0.6035502958579881,
"grad_norm": 0.349609375,
"learning_rate": 3.979289940828402e-06,
"loss": 0.5882,
"step": 408
},
{
"epoch": 0.6050295857988166,
"grad_norm": 0.330078125,
"learning_rate": 3.964497041420119e-06,
"loss": 0.5766,
"step": 409
},
{
"epoch": 0.606508875739645,
"grad_norm": 0.34375,
"learning_rate": 3.949704142011835e-06,
"loss": 0.5719,
"step": 410
},
{
"epoch": 0.6079881656804734,
"grad_norm": 0.341796875,
"learning_rate": 3.934911242603551e-06,
"loss": 0.5925,
"step": 411
},
{
"epoch": 0.6094674556213018,
"grad_norm": 0.32421875,
"learning_rate": 3.920118343195267e-06,
"loss": 0.5731,
"step": 412
},
{
"epoch": 0.6109467455621301,
"grad_norm": 0.3046875,
"learning_rate": 3.905325443786982e-06,
"loss": 0.5922,
"step": 413
},
{
"epoch": 0.6124260355029586,
"grad_norm": 0.3359375,
"learning_rate": 3.890532544378698e-06,
"loss": 0.5672,
"step": 414
},
{
"epoch": 0.613905325443787,
"grad_norm": 0.302734375,
"learning_rate": 3.8757396449704146e-06,
"loss": 0.5772,
"step": 415
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.3359375,
"learning_rate": 3.86094674556213e-06,
"loss": 0.5818,
"step": 416
},
{
"epoch": 0.6168639053254438,
"grad_norm": 0.439453125,
"learning_rate": 3.846153846153847e-06,
"loss": 0.5888,
"step": 417
},
{
"epoch": 0.6183431952662722,
"grad_norm": 0.341796875,
"learning_rate": 3.8313609467455625e-06,
"loss": 0.5716,
"step": 418
},
{
"epoch": 0.6198224852071006,
"grad_norm": 0.3359375,
"learning_rate": 3.816568047337278e-06,
"loss": 0.5679,
"step": 419
},
{
"epoch": 0.621301775147929,
"grad_norm": 0.328125,
"learning_rate": 3.8017751479289943e-06,
"loss": 0.5971,
"step": 420
},
{
"epoch": 0.6227810650887574,
"grad_norm": 0.314453125,
"learning_rate": 3.7869822485207104e-06,
"loss": 0.562,
"step": 421
},
{
"epoch": 0.6242603550295858,
"grad_norm": 0.359375,
"learning_rate": 3.772189349112426e-06,
"loss": 0.5492,
"step": 422
},
{
"epoch": 0.6257396449704142,
"grad_norm": 0.333984375,
"learning_rate": 3.7573964497041422e-06,
"loss": 0.5806,
"step": 423
},
{
"epoch": 0.6272189349112426,
"grad_norm": 0.306640625,
"learning_rate": 3.7426035502958584e-06,
"loss": 0.5723,
"step": 424
},
{
"epoch": 0.628698224852071,
"grad_norm": 0.314453125,
"learning_rate": 3.7278106508875745e-06,
"loss": 0.5735,
"step": 425
},
{
"epoch": 0.6301775147928994,
"grad_norm": 0.3125,
"learning_rate": 3.71301775147929e-06,
"loss": 0.5726,
"step": 426
},
{
"epoch": 0.6316568047337278,
"grad_norm": 0.361328125,
"learning_rate": 3.6982248520710063e-06,
"loss": 0.5876,
"step": 427
},
{
"epoch": 0.6331360946745562,
"grad_norm": 0.34375,
"learning_rate": 3.683431952662722e-06,
"loss": 0.5697,
"step": 428
},
{
"epoch": 0.6346153846153846,
"grad_norm": 0.3515625,
"learning_rate": 3.668639053254438e-06,
"loss": 0.572,
"step": 429
},
{
"epoch": 0.636094674556213,
"grad_norm": 0.3515625,
"learning_rate": 3.653846153846154e-06,
"loss": 0.5775,
"step": 430
},
{
"epoch": 0.6375739644970414,
"grad_norm": 0.33984375,
"learning_rate": 3.6390532544378704e-06,
"loss": 0.5707,
"step": 431
},
{
"epoch": 0.6390532544378699,
"grad_norm": 0.333984375,
"learning_rate": 3.6242603550295865e-06,
"loss": 0.5729,
"step": 432
},
{
"epoch": 0.6405325443786982,
"grad_norm": 0.330078125,
"learning_rate": 3.609467455621302e-06,
"loss": 0.5782,
"step": 433
},
{
"epoch": 0.6420118343195266,
"grad_norm": 0.34375,
"learning_rate": 3.594674556213018e-06,
"loss": 0.5713,
"step": 434
},
{
"epoch": 0.643491124260355,
"grad_norm": 0.322265625,
"learning_rate": 3.579881656804734e-06,
"loss": 0.5565,
"step": 435
},
{
"epoch": 0.6449704142011834,
"grad_norm": 0.33203125,
"learning_rate": 3.5650887573964497e-06,
"loss": 0.5801,
"step": 436
},
{
"epoch": 0.6464497041420119,
"grad_norm": 0.31640625,
"learning_rate": 3.550295857988166e-06,
"loss": 0.5766,
"step": 437
},
{
"epoch": 0.6479289940828402,
"grad_norm": 0.357421875,
"learning_rate": 3.5355029585798815e-06,
"loss": 0.5859,
"step": 438
},
{
"epoch": 0.6494082840236687,
"grad_norm": 0.333984375,
"learning_rate": 3.520710059171598e-06,
"loss": 0.5631,
"step": 439
},
{
"epoch": 0.650887573964497,
"grad_norm": 0.345703125,
"learning_rate": 3.505917159763314e-06,
"loss": 0.5751,
"step": 440
},
{
"epoch": 0.6523668639053254,
"grad_norm": 0.30078125,
"learning_rate": 3.49112426035503e-06,
"loss": 0.5752,
"step": 441
},
{
"epoch": 0.6538461538461539,
"grad_norm": 0.35546875,
"learning_rate": 3.476331360946746e-06,
"loss": 0.5655,
"step": 442
},
{
"epoch": 0.6553254437869822,
"grad_norm": 0.380859375,
"learning_rate": 3.4615384615384617e-06,
"loss": 0.5449,
"step": 443
},
{
"epoch": 0.6568047337278107,
"grad_norm": 0.3359375,
"learning_rate": 3.446745562130178e-06,
"loss": 0.5768,
"step": 444
},
{
"epoch": 0.658284023668639,
"grad_norm": 0.33203125,
"learning_rate": 3.4319526627218935e-06,
"loss": 0.5549,
"step": 445
},
{
"epoch": 0.6597633136094675,
"grad_norm": 0.3125,
"learning_rate": 3.41715976331361e-06,
"loss": 0.5796,
"step": 446
},
{
"epoch": 0.6612426035502958,
"grad_norm": 0.34765625,
"learning_rate": 3.4023668639053257e-06,
"loss": 0.5644,
"step": 447
},
{
"epoch": 0.6627218934911243,
"grad_norm": 0.29296875,
"learning_rate": 3.387573964497042e-06,
"loss": 0.5836,
"step": 448
},
{
"epoch": 0.6642011834319527,
"grad_norm": 0.380859375,
"learning_rate": 3.3727810650887576e-06,
"loss": 0.5699,
"step": 449
},
{
"epoch": 0.665680473372781,
"grad_norm": 0.3203125,
"learning_rate": 3.3579881656804737e-06,
"loss": 0.5646,
"step": 450
},
{
"epoch": 0.6671597633136095,
"grad_norm": 0.326171875,
"learning_rate": 3.3431952662721894e-06,
"loss": 0.5852,
"step": 451
},
{
"epoch": 0.6686390532544378,
"grad_norm": 0.32421875,
"learning_rate": 3.3284023668639055e-06,
"loss": 0.5762,
"step": 452
},
{
"epoch": 0.6701183431952663,
"grad_norm": 0.3125,
"learning_rate": 3.313609467455622e-06,
"loss": 0.5895,
"step": 453
},
{
"epoch": 0.6715976331360947,
"grad_norm": 0.330078125,
"learning_rate": 3.2988165680473377e-06,
"loss": 0.5716,
"step": 454
},
{
"epoch": 0.6730769230769231,
"grad_norm": 0.359375,
"learning_rate": 3.2840236686390534e-06,
"loss": 0.5847,
"step": 455
},
{
"epoch": 0.6745562130177515,
"grad_norm": 0.33984375,
"learning_rate": 3.2692307692307696e-06,
"loss": 0.5824,
"step": 456
},
{
"epoch": 0.6760355029585798,
"grad_norm": 0.470703125,
"learning_rate": 3.2544378698224853e-06,
"loss": 0.5666,
"step": 457
},
{
"epoch": 0.6775147928994083,
"grad_norm": 0.328125,
"learning_rate": 3.2396449704142014e-06,
"loss": 0.5785,
"step": 458
},
{
"epoch": 0.6789940828402367,
"grad_norm": 0.314453125,
"learning_rate": 3.224852071005917e-06,
"loss": 0.5636,
"step": 459
},
{
"epoch": 0.6804733727810651,
"grad_norm": 0.328125,
"learning_rate": 3.210059171597633e-06,
"loss": 0.5583,
"step": 460
},
{
"epoch": 0.6819526627218935,
"grad_norm": 0.54296875,
"learning_rate": 3.1952662721893497e-06,
"loss": 0.5774,
"step": 461
},
{
"epoch": 0.6834319526627219,
"grad_norm": 0.3203125,
"learning_rate": 3.1804733727810654e-06,
"loss": 0.5655,
"step": 462
},
{
"epoch": 0.6849112426035503,
"grad_norm": 0.30859375,
"learning_rate": 3.1656804733727816e-06,
"loss": 0.571,
"step": 463
},
{
"epoch": 0.6863905325443787,
"grad_norm": 0.373046875,
"learning_rate": 3.1508875739644973e-06,
"loss": 0.587,
"step": 464
},
{
"epoch": 0.6878698224852071,
"grad_norm": 0.380859375,
"learning_rate": 3.1360946745562134e-06,
"loss": 0.5672,
"step": 465
},
{
"epoch": 0.6893491124260355,
"grad_norm": 0.33984375,
"learning_rate": 3.121301775147929e-06,
"loss": 0.5727,
"step": 466
},
{
"epoch": 0.6908284023668639,
"grad_norm": 0.32421875,
"learning_rate": 3.106508875739645e-06,
"loss": 0.5452,
"step": 467
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.34375,
"learning_rate": 3.0917159763313613e-06,
"loss": 0.5746,
"step": 468
},
{
"epoch": 0.6937869822485208,
"grad_norm": 0.326171875,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.5719,
"step": 469
},
{
"epoch": 0.6952662721893491,
"grad_norm": 0.34375,
"learning_rate": 3.062130177514793e-06,
"loss": 0.5875,
"step": 470
},
{
"epoch": 0.6967455621301775,
"grad_norm": 0.369140625,
"learning_rate": 3.0473372781065093e-06,
"loss": 0.5983,
"step": 471
},
{
"epoch": 0.6982248520710059,
"grad_norm": 0.361328125,
"learning_rate": 3.032544378698225e-06,
"loss": 0.5947,
"step": 472
},
{
"epoch": 0.6997041420118343,
"grad_norm": 0.375,
"learning_rate": 3.017751479289941e-06,
"loss": 0.5494,
"step": 473
},
{
"epoch": 0.7011834319526628,
"grad_norm": 0.3203125,
"learning_rate": 3.0029585798816568e-06,
"loss": 0.561,
"step": 474
},
{
"epoch": 0.7026627218934911,
"grad_norm": 0.33203125,
"learning_rate": 2.9881656804733733e-06,
"loss": 0.5795,
"step": 475
},
{
"epoch": 0.7041420118343196,
"grad_norm": 0.423828125,
"learning_rate": 2.973372781065089e-06,
"loss": 0.5694,
"step": 476
},
{
"epoch": 0.7056213017751479,
"grad_norm": 0.359375,
"learning_rate": 2.958579881656805e-06,
"loss": 0.5601,
"step": 477
},
{
"epoch": 0.7071005917159763,
"grad_norm": 0.33203125,
"learning_rate": 2.943786982248521e-06,
"loss": 0.5641,
"step": 478
},
{
"epoch": 0.7085798816568047,
"grad_norm": 0.318359375,
"learning_rate": 2.928994082840237e-06,
"loss": 0.5816,
"step": 479
},
{
"epoch": 0.7100591715976331,
"grad_norm": 0.330078125,
"learning_rate": 2.9142011834319526e-06,
"loss": 0.5707,
"step": 480
},
{
"epoch": 0.7115384615384616,
"grad_norm": 0.5625,
"learning_rate": 2.8994082840236688e-06,
"loss": 0.5626,
"step": 481
},
{
"epoch": 0.7130177514792899,
"grad_norm": 0.318359375,
"learning_rate": 2.8846153846153845e-06,
"loss": 0.5636,
"step": 482
},
{
"epoch": 0.7144970414201184,
"grad_norm": 0.34765625,
"learning_rate": 2.869822485207101e-06,
"loss": 0.5834,
"step": 483
},
{
"epoch": 0.7159763313609467,
"grad_norm": 0.33984375,
"learning_rate": 2.855029585798817e-06,
"loss": 0.5864,
"step": 484
},
{
"epoch": 0.7174556213017751,
"grad_norm": 0.326171875,
"learning_rate": 2.840236686390533e-06,
"loss": 0.5701,
"step": 485
},
{
"epoch": 0.7189349112426036,
"grad_norm": 0.333984375,
"learning_rate": 2.825443786982249e-06,
"loss": 0.559,
"step": 486
},
{
"epoch": 0.7204142011834319,
"grad_norm": 0.333984375,
"learning_rate": 2.8106508875739646e-06,
"loss": 0.5605,
"step": 487
},
{
"epoch": 0.7218934911242604,
"grad_norm": 0.337890625,
"learning_rate": 2.7958579881656803e-06,
"loss": 0.5703,
"step": 488
},
{
"epoch": 0.7233727810650887,
"grad_norm": 0.34765625,
"learning_rate": 2.7810650887573965e-06,
"loss": 0.5692,
"step": 489
},
{
"epoch": 0.7248520710059172,
"grad_norm": 0.314453125,
"learning_rate": 2.766272189349113e-06,
"loss": 0.5771,
"step": 490
},
{
"epoch": 0.7263313609467456,
"grad_norm": 0.3046875,
"learning_rate": 2.7514792899408287e-06,
"loss": 0.5505,
"step": 491
},
{
"epoch": 0.727810650887574,
"grad_norm": 0.328125,
"learning_rate": 2.736686390532545e-06,
"loss": 0.57,
"step": 492
},
{
"epoch": 0.7292899408284024,
"grad_norm": 0.3359375,
"learning_rate": 2.7218934911242605e-06,
"loss": 0.5648,
"step": 493
},
{
"epoch": 0.7307692307692307,
"grad_norm": 0.359375,
"learning_rate": 2.7071005917159766e-06,
"loss": 0.5493,
"step": 494
},
{
"epoch": 0.7322485207100592,
"grad_norm": 0.32421875,
"learning_rate": 2.6923076923076923e-06,
"loss": 0.5865,
"step": 495
},
{
"epoch": 0.7337278106508875,
"grad_norm": 0.333984375,
"learning_rate": 2.6775147928994085e-06,
"loss": 0.5645,
"step": 496
},
{
"epoch": 0.735207100591716,
"grad_norm": 0.32421875,
"learning_rate": 2.6627218934911246e-06,
"loss": 0.5788,
"step": 497
},
{
"epoch": 0.7366863905325444,
"grad_norm": 0.54296875,
"learning_rate": 2.6479289940828407e-06,
"loss": 0.5419,
"step": 498
},
{
"epoch": 0.7381656804733728,
"grad_norm": 0.3203125,
"learning_rate": 2.6331360946745564e-06,
"loss": 0.5738,
"step": 499
},
{
"epoch": 0.7396449704142012,
"grad_norm": 0.31640625,
"learning_rate": 2.6183431952662725e-06,
"loss": 0.5762,
"step": 500
},
{
"epoch": 0.7411242603550295,
"grad_norm": 0.314453125,
"learning_rate": 2.603550295857988e-06,
"loss": 0.565,
"step": 501
},
{
"epoch": 0.742603550295858,
"grad_norm": 0.326171875,
"learning_rate": 2.5887573964497043e-06,
"loss": 0.5741,
"step": 502
},
{
"epoch": 0.7440828402366864,
"grad_norm": 0.322265625,
"learning_rate": 2.57396449704142e-06,
"loss": 0.5627,
"step": 503
},
{
"epoch": 0.7455621301775148,
"grad_norm": 0.330078125,
"learning_rate": 2.559171597633136e-06,
"loss": 0.5762,
"step": 504
},
{
"epoch": 0.7470414201183432,
"grad_norm": 0.328125,
"learning_rate": 2.5443786982248527e-06,
"loss": 0.5589,
"step": 505
},
{
"epoch": 0.7485207100591716,
"grad_norm": 0.328125,
"learning_rate": 2.5295857988165684e-06,
"loss": 0.5686,
"step": 506
},
{
"epoch": 0.75,
"grad_norm": 0.3046875,
"learning_rate": 2.5147928994082845e-06,
"loss": 0.5663,
"step": 507
},
{
"epoch": 0.7514792899408284,
"grad_norm": 0.310546875,
"learning_rate": 2.5e-06,
"loss": 0.5551,
"step": 508
},
{
"epoch": 0.7529585798816568,
"grad_norm": 0.296875,
"learning_rate": 2.485207100591716e-06,
"loss": 0.5739,
"step": 509
},
{
"epoch": 0.7544378698224852,
"grad_norm": 0.3125,
"learning_rate": 2.4704142011834324e-06,
"loss": 0.5589,
"step": 510
},
{
"epoch": 0.7559171597633136,
"grad_norm": 0.35546875,
"learning_rate": 2.455621301775148e-06,
"loss": 0.5828,
"step": 511
},
{
"epoch": 0.757396449704142,
"grad_norm": 0.337890625,
"learning_rate": 2.4408284023668643e-06,
"loss": 0.5721,
"step": 512
},
{
"epoch": 0.7588757396449705,
"grad_norm": 0.36328125,
"learning_rate": 2.42603550295858e-06,
"loss": 0.569,
"step": 513
},
{
"epoch": 0.7603550295857988,
"grad_norm": 0.3671875,
"learning_rate": 2.411242603550296e-06,
"loss": 0.5795,
"step": 514
},
{
"epoch": 0.7618343195266272,
"grad_norm": 0.322265625,
"learning_rate": 2.396449704142012e-06,
"loss": 0.5573,
"step": 515
},
{
"epoch": 0.7633136094674556,
"grad_norm": 0.32421875,
"learning_rate": 2.381656804733728e-06,
"loss": 0.5668,
"step": 516
},
{
"epoch": 0.764792899408284,
"grad_norm": 0.361328125,
"learning_rate": 2.366863905325444e-06,
"loss": 0.5755,
"step": 517
},
{
"epoch": 0.7662721893491125,
"grad_norm": 0.341796875,
"learning_rate": 2.35207100591716e-06,
"loss": 0.5439,
"step": 518
},
{
"epoch": 0.7677514792899408,
"grad_norm": 0.34765625,
"learning_rate": 2.337278106508876e-06,
"loss": 0.6044,
"step": 519
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.349609375,
"learning_rate": 2.322485207100592e-06,
"loss": 0.5703,
"step": 520
},
{
"epoch": 0.7707100591715976,
"grad_norm": 0.3203125,
"learning_rate": 2.307692307692308e-06,
"loss": 0.5855,
"step": 521
},
{
"epoch": 0.772189349112426,
"grad_norm": 0.3125,
"learning_rate": 2.2928994082840238e-06,
"loss": 0.5794,
"step": 522
},
{
"epoch": 0.7736686390532544,
"grad_norm": 0.33203125,
"learning_rate": 2.27810650887574e-06,
"loss": 0.5678,
"step": 523
},
{
"epoch": 0.7751479289940828,
"grad_norm": 0.373046875,
"learning_rate": 2.2633136094674556e-06,
"loss": 0.571,
"step": 524
},
{
"epoch": 0.7766272189349113,
"grad_norm": 0.318359375,
"learning_rate": 2.2485207100591717e-06,
"loss": 0.5697,
"step": 525
},
{
"epoch": 0.7781065088757396,
"grad_norm": 0.318359375,
"learning_rate": 2.233727810650888e-06,
"loss": 0.5643,
"step": 526
},
{
"epoch": 0.7795857988165681,
"grad_norm": 0.33203125,
"learning_rate": 2.2189349112426035e-06,
"loss": 0.5924,
"step": 527
},
{
"epoch": 0.7810650887573964,
"grad_norm": 0.302734375,
"learning_rate": 2.2041420118343196e-06,
"loss": 0.5483,
"step": 528
},
{
"epoch": 0.7825443786982249,
"grad_norm": 0.310546875,
"learning_rate": 2.1893491124260358e-06,
"loss": 0.5689,
"step": 529
},
{
"epoch": 0.7840236686390533,
"grad_norm": 0.333984375,
"learning_rate": 2.1745562130177515e-06,
"loss": 0.5588,
"step": 530
},
{
"epoch": 0.7855029585798816,
"grad_norm": 0.34375,
"learning_rate": 2.1597633136094676e-06,
"loss": 0.5674,
"step": 531
},
{
"epoch": 0.7869822485207101,
"grad_norm": 0.3359375,
"learning_rate": 2.1449704142011837e-06,
"loss": 0.5715,
"step": 532
},
{
"epoch": 0.7884615384615384,
"grad_norm": 0.34765625,
"learning_rate": 2.1301775147929e-06,
"loss": 0.5608,
"step": 533
},
{
"epoch": 0.7899408284023669,
"grad_norm": 0.345703125,
"learning_rate": 2.1153846153846155e-06,
"loss": 0.5712,
"step": 534
},
{
"epoch": 0.7914201183431953,
"grad_norm": 0.314453125,
"learning_rate": 2.1005917159763316e-06,
"loss": 0.5642,
"step": 535
},
{
"epoch": 0.7928994082840237,
"grad_norm": 0.322265625,
"learning_rate": 2.0857988165680478e-06,
"loss": 0.5829,
"step": 536
},
{
"epoch": 0.7943786982248521,
"grad_norm": 0.330078125,
"learning_rate": 2.0710059171597635e-06,
"loss": 0.5663,
"step": 537
},
{
"epoch": 0.7958579881656804,
"grad_norm": 0.400390625,
"learning_rate": 2.0562130177514796e-06,
"loss": 0.5441,
"step": 538
},
{
"epoch": 0.7973372781065089,
"grad_norm": 0.333984375,
"learning_rate": 2.0414201183431953e-06,
"loss": 0.5885,
"step": 539
},
{
"epoch": 0.7988165680473372,
"grad_norm": 0.333984375,
"learning_rate": 2.0266272189349114e-06,
"loss": 0.5578,
"step": 540
},
{
"epoch": 0.8002958579881657,
"grad_norm": 0.365234375,
"learning_rate": 2.0118343195266275e-06,
"loss": 0.5711,
"step": 541
},
{
"epoch": 0.8017751479289941,
"grad_norm": 0.341796875,
"learning_rate": 1.9970414201183432e-06,
"loss": 0.5577,
"step": 542
},
{
"epoch": 0.8032544378698225,
"grad_norm": 0.31640625,
"learning_rate": 1.9822485207100593e-06,
"loss": 0.5473,
"step": 543
},
{
"epoch": 0.8047337278106509,
"grad_norm": 0.341796875,
"learning_rate": 1.9674556213017755e-06,
"loss": 0.5505,
"step": 544
},
{
"epoch": 0.8062130177514792,
"grad_norm": 0.40625,
"learning_rate": 1.952662721893491e-06,
"loss": 0.5484,
"step": 545
},
{
"epoch": 0.8076923076923077,
"grad_norm": 0.33984375,
"learning_rate": 1.9378698224852073e-06,
"loss": 0.5598,
"step": 546
},
{
"epoch": 0.8091715976331361,
"grad_norm": 0.486328125,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.5665,
"step": 547
},
{
"epoch": 0.8106508875739645,
"grad_norm": 0.443359375,
"learning_rate": 1.908284023668639e-06,
"loss": 0.5909,
"step": 548
},
{
"epoch": 0.8121301775147929,
"grad_norm": 0.30859375,
"learning_rate": 1.8934911242603552e-06,
"loss": 0.5637,
"step": 549
},
{
"epoch": 0.8136094674556213,
"grad_norm": 0.310546875,
"learning_rate": 1.8786982248520711e-06,
"loss": 0.5815,
"step": 550
},
{
"epoch": 0.8150887573964497,
"grad_norm": 0.322265625,
"learning_rate": 1.8639053254437872e-06,
"loss": 0.5591,
"step": 551
},
{
"epoch": 0.8165680473372781,
"grad_norm": 0.34375,
"learning_rate": 1.8491124260355032e-06,
"loss": 0.5759,
"step": 552
},
{
"epoch": 0.8180473372781065,
"grad_norm": 0.333984375,
"learning_rate": 1.834319526627219e-06,
"loss": 0.5888,
"step": 553
},
{
"epoch": 0.8195266272189349,
"grad_norm": 0.310546875,
"learning_rate": 1.8195266272189352e-06,
"loss": 0.5647,
"step": 554
},
{
"epoch": 0.8210059171597633,
"grad_norm": 0.3125,
"learning_rate": 1.804733727810651e-06,
"loss": 0.5686,
"step": 555
},
{
"epoch": 0.8224852071005917,
"grad_norm": 0.31640625,
"learning_rate": 1.789940828402367e-06,
"loss": 0.5556,
"step": 556
},
{
"epoch": 0.8239644970414202,
"grad_norm": 0.333984375,
"learning_rate": 1.775147928994083e-06,
"loss": 0.5613,
"step": 557
},
{
"epoch": 0.8254437869822485,
"grad_norm": 0.306640625,
"learning_rate": 1.760355029585799e-06,
"loss": 0.5642,
"step": 558
},
{
"epoch": 0.8269230769230769,
"grad_norm": 0.41015625,
"learning_rate": 1.745562130177515e-06,
"loss": 0.6028,
"step": 559
},
{
"epoch": 0.8284023668639053,
"grad_norm": 0.35546875,
"learning_rate": 1.7307692307692308e-06,
"loss": 0.5737,
"step": 560
},
{
"epoch": 0.8298816568047337,
"grad_norm": 0.322265625,
"learning_rate": 1.7159763313609468e-06,
"loss": 0.5734,
"step": 561
},
{
"epoch": 0.8313609467455622,
"grad_norm": 0.330078125,
"learning_rate": 1.7011834319526629e-06,
"loss": 0.5527,
"step": 562
},
{
"epoch": 0.8328402366863905,
"grad_norm": 0.404296875,
"learning_rate": 1.6863905325443788e-06,
"loss": 0.5748,
"step": 563
},
{
"epoch": 0.834319526627219,
"grad_norm": 0.3203125,
"learning_rate": 1.6715976331360947e-06,
"loss": 0.5798,
"step": 564
},
{
"epoch": 0.8357988165680473,
"grad_norm": 0.3125,
"learning_rate": 1.656804733727811e-06,
"loss": 0.5765,
"step": 565
},
{
"epoch": 0.8372781065088757,
"grad_norm": 0.345703125,
"learning_rate": 1.6420118343195267e-06,
"loss": 0.573,
"step": 566
},
{
"epoch": 0.8387573964497042,
"grad_norm": 0.349609375,
"learning_rate": 1.6272189349112426e-06,
"loss": 0.5562,
"step": 567
},
{
"epoch": 0.8402366863905325,
"grad_norm": 0.33203125,
"learning_rate": 1.6124260355029585e-06,
"loss": 0.5624,
"step": 568
},
{
"epoch": 0.841715976331361,
"grad_norm": 0.341796875,
"learning_rate": 1.5976331360946749e-06,
"loss": 0.5724,
"step": 569
},
{
"epoch": 0.8431952662721893,
"grad_norm": 0.322265625,
"learning_rate": 1.5828402366863908e-06,
"loss": 0.5572,
"step": 570
},
{
"epoch": 0.8446745562130178,
"grad_norm": 0.3359375,
"learning_rate": 1.5680473372781067e-06,
"loss": 0.5413,
"step": 571
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.337890625,
"learning_rate": 1.5532544378698226e-06,
"loss": 0.5648,
"step": 572
},
{
"epoch": 0.8476331360946746,
"grad_norm": 0.3203125,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.5582,
"step": 573
},
{
"epoch": 0.849112426035503,
"grad_norm": 0.314453125,
"learning_rate": 1.5236686390532546e-06,
"loss": 0.5686,
"step": 574
},
{
"epoch": 0.8505917159763313,
"grad_norm": 0.353515625,
"learning_rate": 1.5088757396449705e-06,
"loss": 0.5632,
"step": 575
},
{
"epoch": 0.8520710059171598,
"grad_norm": 0.359375,
"learning_rate": 1.4940828402366867e-06,
"loss": 0.5714,
"step": 576
},
{
"epoch": 0.8535502958579881,
"grad_norm": 0.302734375,
"learning_rate": 1.4792899408284026e-06,
"loss": 0.5606,
"step": 577
},
{
"epoch": 0.8550295857988166,
"grad_norm": 0.3046875,
"learning_rate": 1.4644970414201185e-06,
"loss": 0.556,
"step": 578
},
{
"epoch": 0.856508875739645,
"grad_norm": 0.3359375,
"learning_rate": 1.4497041420118344e-06,
"loss": 0.5748,
"step": 579
},
{
"epoch": 0.8579881656804734,
"grad_norm": 0.345703125,
"learning_rate": 1.4349112426035505e-06,
"loss": 0.5854,
"step": 580
},
{
"epoch": 0.8594674556213018,
"grad_norm": 0.32421875,
"learning_rate": 1.4201183431952664e-06,
"loss": 0.5535,
"step": 581
},
{
"epoch": 0.8609467455621301,
"grad_norm": 0.318359375,
"learning_rate": 1.4053254437869823e-06,
"loss": 0.5738,
"step": 582
},
{
"epoch": 0.8624260355029586,
"grad_norm": 0.330078125,
"learning_rate": 1.3905325443786982e-06,
"loss": 0.5597,
"step": 583
},
{
"epoch": 0.863905325443787,
"grad_norm": 0.53515625,
"learning_rate": 1.3757396449704143e-06,
"loss": 0.5788,
"step": 584
},
{
"epoch": 0.8653846153846154,
"grad_norm": 0.31640625,
"learning_rate": 1.3609467455621303e-06,
"loss": 0.5593,
"step": 585
},
{
"epoch": 0.8668639053254438,
"grad_norm": 0.31640625,
"learning_rate": 1.3461538461538462e-06,
"loss": 0.5647,
"step": 586
},
{
"epoch": 0.8683431952662722,
"grad_norm": 0.30859375,
"learning_rate": 1.3313609467455623e-06,
"loss": 0.5654,
"step": 587
},
{
"epoch": 0.8698224852071006,
"grad_norm": 0.3125,
"learning_rate": 1.3165680473372782e-06,
"loss": 0.5731,
"step": 588
},
{
"epoch": 0.871301775147929,
"grad_norm": 0.322265625,
"learning_rate": 1.301775147928994e-06,
"loss": 0.5657,
"step": 589
},
{
"epoch": 0.8727810650887574,
"grad_norm": 0.314453125,
"learning_rate": 1.28698224852071e-06,
"loss": 0.5705,
"step": 590
},
{
"epoch": 0.8742603550295858,
"grad_norm": 0.318359375,
"learning_rate": 1.2721893491124263e-06,
"loss": 0.5743,
"step": 591
},
{
"epoch": 0.8757396449704142,
"grad_norm": 0.328125,
"learning_rate": 1.2573964497041423e-06,
"loss": 0.5618,
"step": 592
},
{
"epoch": 0.8772189349112426,
"grad_norm": 0.330078125,
"learning_rate": 1.242603550295858e-06,
"loss": 0.5785,
"step": 593
},
{
"epoch": 0.878698224852071,
"grad_norm": 0.3125,
"learning_rate": 1.227810650887574e-06,
"loss": 0.5398,
"step": 594
},
{
"epoch": 0.8801775147928994,
"grad_norm": 0.337890625,
"learning_rate": 1.21301775147929e-06,
"loss": 0.5583,
"step": 595
},
{
"epoch": 0.8816568047337278,
"grad_norm": 0.326171875,
"learning_rate": 1.198224852071006e-06,
"loss": 0.5545,
"step": 596
},
{
"epoch": 0.8831360946745562,
"grad_norm": 0.345703125,
"learning_rate": 1.183431952662722e-06,
"loss": 0.5672,
"step": 597
},
{
"epoch": 0.8846153846153846,
"grad_norm": 0.3125,
"learning_rate": 1.168639053254438e-06,
"loss": 0.5593,
"step": 598
},
{
"epoch": 0.886094674556213,
"grad_norm": 0.31640625,
"learning_rate": 1.153846153846154e-06,
"loss": 0.5657,
"step": 599
},
{
"epoch": 0.8875739644970414,
"grad_norm": 0.4140625,
"learning_rate": 1.13905325443787e-06,
"loss": 0.5733,
"step": 600
},
{
"epoch": 0.8890532544378699,
"grad_norm": 0.3203125,
"learning_rate": 1.1242603550295859e-06,
"loss": 0.5593,
"step": 601
},
{
"epoch": 0.8905325443786982,
"grad_norm": 0.3203125,
"learning_rate": 1.1094674556213018e-06,
"loss": 0.5633,
"step": 602
},
{
"epoch": 0.8920118343195266,
"grad_norm": 0.330078125,
"learning_rate": 1.0946745562130179e-06,
"loss": 0.5903,
"step": 603
},
{
"epoch": 0.893491124260355,
"grad_norm": 0.384765625,
"learning_rate": 1.0798816568047338e-06,
"loss": 0.5375,
"step": 604
},
{
"epoch": 0.8949704142011834,
"grad_norm": 0.3203125,
"learning_rate": 1.06508875739645e-06,
"loss": 0.5813,
"step": 605
},
{
"epoch": 0.8964497041420119,
"grad_norm": 0.3515625,
"learning_rate": 1.0502958579881658e-06,
"loss": 0.5783,
"step": 606
},
{
"epoch": 0.8979289940828402,
"grad_norm": 0.310546875,
"learning_rate": 1.0355029585798817e-06,
"loss": 0.5687,
"step": 607
},
{
"epoch": 0.8994082840236687,
"grad_norm": 0.310546875,
"learning_rate": 1.0207100591715976e-06,
"loss": 0.5787,
"step": 608
},
{
"epoch": 0.900887573964497,
"grad_norm": 0.375,
"learning_rate": 1.0059171597633138e-06,
"loss": 0.5585,
"step": 609
},
{
"epoch": 0.9023668639053254,
"grad_norm": 0.314453125,
"learning_rate": 9.911242603550297e-07,
"loss": 0.5476,
"step": 610
},
{
"epoch": 0.9038461538461539,
"grad_norm": 0.48046875,
"learning_rate": 9.763313609467456e-07,
"loss": 0.5506,
"step": 611
},
{
"epoch": 0.9053254437869822,
"grad_norm": 0.30859375,
"learning_rate": 9.615384615384617e-07,
"loss": 0.5637,
"step": 612
},
{
"epoch": 0.9068047337278107,
"grad_norm": 0.330078125,
"learning_rate": 9.467455621301776e-07,
"loss": 0.5886,
"step": 613
},
{
"epoch": 0.908284023668639,
"grad_norm": 0.31640625,
"learning_rate": 9.319526627218936e-07,
"loss": 0.5442,
"step": 614
},
{
"epoch": 0.9097633136094675,
"grad_norm": 0.42578125,
"learning_rate": 9.171597633136095e-07,
"loss": 0.5558,
"step": 615
},
{
"epoch": 0.9112426035502958,
"grad_norm": 0.30078125,
"learning_rate": 9.023668639053255e-07,
"loss": 0.5677,
"step": 616
},
{
"epoch": 0.9127218934911243,
"grad_norm": 0.30859375,
"learning_rate": 8.875739644970415e-07,
"loss": 0.5703,
"step": 617
},
{
"epoch": 0.9142011834319527,
"grad_norm": 0.30859375,
"learning_rate": 8.727810650887575e-07,
"loss": 0.5571,
"step": 618
},
{
"epoch": 0.915680473372781,
"grad_norm": 0.326171875,
"learning_rate": 8.579881656804734e-07,
"loss": 0.5645,
"step": 619
},
{
"epoch": 0.9171597633136095,
"grad_norm": 0.30078125,
"learning_rate": 8.431952662721894e-07,
"loss": 0.5711,
"step": 620
},
{
"epoch": 0.9186390532544378,
"grad_norm": 0.3125,
"learning_rate": 8.284023668639055e-07,
"loss": 0.5501,
"step": 621
},
{
"epoch": 0.9201183431952663,
"grad_norm": 0.3046875,
"learning_rate": 8.136094674556213e-07,
"loss": 0.5633,
"step": 622
},
{
"epoch": 0.9215976331360947,
"grad_norm": 0.310546875,
"learning_rate": 7.988165680473374e-07,
"loss": 0.5796,
"step": 623
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.30078125,
"learning_rate": 7.840236686390533e-07,
"loss": 0.5626,
"step": 624
},
{
"epoch": 0.9245562130177515,
"grad_norm": 0.296875,
"learning_rate": 7.692307692307694e-07,
"loss": 0.5577,
"step": 625
},
{
"epoch": 0.9260355029585798,
"grad_norm": 0.314453125,
"learning_rate": 7.544378698224853e-07,
"loss": 0.5671,
"step": 626
},
{
"epoch": 0.9275147928994083,
"grad_norm": 0.310546875,
"learning_rate": 7.396449704142013e-07,
"loss": 0.5583,
"step": 627
},
{
"epoch": 0.9289940828402367,
"grad_norm": 0.42578125,
"learning_rate": 7.248520710059172e-07,
"loss": 0.5706,
"step": 628
},
{
"epoch": 0.9304733727810651,
"grad_norm": 0.314453125,
"learning_rate": 7.100591715976332e-07,
"loss": 0.5668,
"step": 629
},
{
"epoch": 0.9319526627218935,
"grad_norm": 0.333984375,
"learning_rate": 6.952662721893491e-07,
"loss": 0.5597,
"step": 630
},
{
"epoch": 0.9334319526627219,
"grad_norm": 0.30859375,
"learning_rate": 6.804733727810651e-07,
"loss": 0.5494,
"step": 631
},
{
"epoch": 0.9349112426035503,
"grad_norm": 0.31640625,
"learning_rate": 6.656804733727811e-07,
"loss": 0.5948,
"step": 632
},
{
"epoch": 0.9363905325443787,
"grad_norm": 0.375,
"learning_rate": 6.50887573964497e-07,
"loss": 0.5771,
"step": 633
},
{
"epoch": 0.9378698224852071,
"grad_norm": 0.30859375,
"learning_rate": 6.360946745562132e-07,
"loss": 0.5567,
"step": 634
},
{
"epoch": 0.9393491124260355,
"grad_norm": 0.298828125,
"learning_rate": 6.21301775147929e-07,
"loss": 0.5566,
"step": 635
},
{
"epoch": 0.9408284023668639,
"grad_norm": 0.359375,
"learning_rate": 6.06508875739645e-07,
"loss": 0.565,
"step": 636
},
{
"epoch": 0.9423076923076923,
"grad_norm": 0.3046875,
"learning_rate": 5.91715976331361e-07,
"loss": 0.5601,
"step": 637
},
{
"epoch": 0.9437869822485208,
"grad_norm": 0.3125,
"learning_rate": 5.76923076923077e-07,
"loss": 0.5559,
"step": 638
},
{
"epoch": 0.9452662721893491,
"grad_norm": 0.48046875,
"learning_rate": 5.621301775147929e-07,
"loss": 0.571,
"step": 639
},
{
"epoch": 0.9467455621301775,
"grad_norm": 0.302734375,
"learning_rate": 5.473372781065089e-07,
"loss": 0.5667,
"step": 640
},
{
"epoch": 0.9482248520710059,
"grad_norm": 0.314453125,
"learning_rate": 5.32544378698225e-07,
"loss": 0.5524,
"step": 641
},
{
"epoch": 0.9497041420118343,
"grad_norm": 0.3125,
"learning_rate": 5.177514792899409e-07,
"loss": 0.5736,
"step": 642
},
{
"epoch": 0.9511834319526628,
"grad_norm": 0.3203125,
"learning_rate": 5.029585798816569e-07,
"loss": 0.5692,
"step": 643
},
{
"epoch": 0.9526627218934911,
"grad_norm": 0.3671875,
"learning_rate": 4.881656804733728e-07,
"loss": 0.5535,
"step": 644
},
{
"epoch": 0.9541420118343196,
"grad_norm": 0.30078125,
"learning_rate": 4.733727810650888e-07,
"loss": 0.5674,
"step": 645
},
{
"epoch": 0.9556213017751479,
"grad_norm": 0.326171875,
"learning_rate": 4.5857988165680477e-07,
"loss": 0.5621,
"step": 646
},
{
"epoch": 0.9571005917159763,
"grad_norm": 0.306640625,
"learning_rate": 4.4378698224852073e-07,
"loss": 0.5684,
"step": 647
},
{
"epoch": 0.9585798816568047,
"grad_norm": 0.314453125,
"learning_rate": 4.289940828402367e-07,
"loss": 0.5605,
"step": 648
},
{
"epoch": 0.9600591715976331,
"grad_norm": 0.435546875,
"learning_rate": 4.1420118343195276e-07,
"loss": 0.55,
"step": 649
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.302734375,
"learning_rate": 3.994082840236687e-07,
"loss": 0.5747,
"step": 650
},
{
"epoch": 0.9630177514792899,
"grad_norm": 0.35546875,
"learning_rate": 3.846153846153847e-07,
"loss": 0.5614,
"step": 651
},
{
"epoch": 0.9644970414201184,
"grad_norm": 0.326171875,
"learning_rate": 3.6982248520710064e-07,
"loss": 0.5627,
"step": 652
},
{
"epoch": 0.9659763313609467,
"grad_norm": 0.322265625,
"learning_rate": 3.550295857988166e-07,
"loss": 0.5524,
"step": 653
},
{
"epoch": 0.9674556213017751,
"grad_norm": 0.30859375,
"learning_rate": 3.4023668639053256e-07,
"loss": 0.5469,
"step": 654
},
{
"epoch": 0.9689349112426036,
"grad_norm": 0.30078125,
"learning_rate": 3.254437869822485e-07,
"loss": 0.5692,
"step": 655
},
{
"epoch": 0.9704142011834319,
"grad_norm": 0.314453125,
"learning_rate": 3.106508875739645e-07,
"loss": 0.5611,
"step": 656
},
{
"epoch": 0.9718934911242604,
"grad_norm": 0.318359375,
"learning_rate": 2.958579881656805e-07,
"loss": 0.5744,
"step": 657
},
{
"epoch": 0.9733727810650887,
"grad_norm": 0.3125,
"learning_rate": 2.8106508875739646e-07,
"loss": 0.5563,
"step": 658
},
{
"epoch": 0.9748520710059172,
"grad_norm": 0.306640625,
"learning_rate": 2.662721893491125e-07,
"loss": 0.5503,
"step": 659
},
{
"epoch": 0.9763313609467456,
"grad_norm": 0.298828125,
"learning_rate": 2.5147928994082844e-07,
"loss": 0.5669,
"step": 660
},
{
"epoch": 0.977810650887574,
"grad_norm": 0.333984375,
"learning_rate": 2.366863905325444e-07,
"loss": 0.5935,
"step": 661
},
{
"epoch": 0.9792899408284024,
"grad_norm": 0.333984375,
"learning_rate": 2.2189349112426036e-07,
"loss": 0.5609,
"step": 662
},
{
"epoch": 0.9807692307692307,
"grad_norm": 0.314453125,
"learning_rate": 2.0710059171597638e-07,
"loss": 0.5684,
"step": 663
},
{
"epoch": 0.9822485207100592,
"grad_norm": 0.314453125,
"learning_rate": 1.9230769230769234e-07,
"loss": 0.5626,
"step": 664
},
{
"epoch": 0.9837278106508875,
"grad_norm": 0.478515625,
"learning_rate": 1.775147928994083e-07,
"loss": 0.5485,
"step": 665
},
{
"epoch": 0.985207100591716,
"grad_norm": 0.330078125,
"learning_rate": 1.6272189349112426e-07,
"loss": 0.5812,
"step": 666
},
{
"epoch": 0.9866863905325444,
"grad_norm": 0.333984375,
"learning_rate": 1.4792899408284025e-07,
"loss": 0.5705,
"step": 667
},
{
"epoch": 0.9881656804733728,
"grad_norm": 0.328125,
"learning_rate": 1.3313609467455624e-07,
"loss": 0.5608,
"step": 668
},
{
"epoch": 0.9896449704142012,
"grad_norm": 0.30859375,
"learning_rate": 1.183431952662722e-07,
"loss": 0.5619,
"step": 669
},
{
"epoch": 0.9911242603550295,
"grad_norm": 0.314453125,
"learning_rate": 1.0355029585798819e-07,
"loss": 0.5583,
"step": 670
},
{
"epoch": 0.992603550295858,
"grad_norm": 0.30078125,
"learning_rate": 8.875739644970415e-08,
"loss": 0.5623,
"step": 671
},
{
"epoch": 0.9940828402366864,
"grad_norm": 0.333984375,
"learning_rate": 7.396449704142013e-08,
"loss": 0.5511,
"step": 672
},
{
"epoch": 0.9955621301775148,
"grad_norm": 0.419921875,
"learning_rate": 5.91715976331361e-08,
"loss": 0.5555,
"step": 673
},
{
"epoch": 0.9970414201183432,
"grad_norm": 0.32421875,
"learning_rate": 4.4378698224852075e-08,
"loss": 0.5655,
"step": 674
},
{
"epoch": 0.9985207100591716,
"grad_norm": 0.326171875,
"learning_rate": 2.958579881656805e-08,
"loss": 0.5678,
"step": 675
},
{
"epoch": 1.0,
"grad_norm": 0.439453125,
"learning_rate": 1.4792899408284025e-08,
"loss": 0.5862,
"step": 676
}
],
"logging_steps": 1.0,
"max_steps": 676,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.2142270255839314e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}