{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 415,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024096385542168677,
"grad_norm": 0.443359375,
"learning_rate": 9.975903614457833e-06,
"loss": 1.8971,
"step": 1
},
{
"epoch": 0.004819277108433735,
"grad_norm": 0.42578125,
"learning_rate": 9.951807228915663e-06,
"loss": 1.8183,
"step": 2
},
{
"epoch": 0.007228915662650603,
"grad_norm": 0.431640625,
"learning_rate": 9.927710843373494e-06,
"loss": 1.7341,
"step": 3
},
{
"epoch": 0.00963855421686747,
"grad_norm": 0.412109375,
"learning_rate": 9.903614457831326e-06,
"loss": 1.8587,
"step": 4
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.400390625,
"learning_rate": 9.879518072289156e-06,
"loss": 1.8129,
"step": 5
},
{
"epoch": 0.014457831325301205,
"grad_norm": 0.392578125,
"learning_rate": 9.855421686746988e-06,
"loss": 1.7437,
"step": 6
},
{
"epoch": 0.016867469879518072,
"grad_norm": 0.359375,
"learning_rate": 9.83132530120482e-06,
"loss": 1.8127,
"step": 7
},
{
"epoch": 0.01927710843373494,
"grad_norm": 1.09375,
"learning_rate": 9.807228915662652e-06,
"loss": 1.7577,
"step": 8
},
{
"epoch": 0.021686746987951807,
"grad_norm": 0.34375,
"learning_rate": 9.783132530120483e-06,
"loss": 1.7672,
"step": 9
},
{
"epoch": 0.024096385542168676,
"grad_norm": 0.328125,
"learning_rate": 9.759036144578315e-06,
"loss": 1.6981,
"step": 10
},
{
"epoch": 0.02650602409638554,
"grad_norm": 0.322265625,
"learning_rate": 9.734939759036145e-06,
"loss": 1.7759,
"step": 11
},
{
"epoch": 0.02891566265060241,
"grad_norm": 0.29296875,
"learning_rate": 9.710843373493977e-06,
"loss": 1.6405,
"step": 12
},
{
"epoch": 0.03132530120481928,
"grad_norm": 0.291015625,
"learning_rate": 9.686746987951809e-06,
"loss": 1.7534,
"step": 13
},
{
"epoch": 0.033734939759036145,
"grad_norm": 0.267578125,
"learning_rate": 9.662650602409639e-06,
"loss": 1.6553,
"step": 14
},
{
"epoch": 0.03614457831325301,
"grad_norm": 0.296875,
"learning_rate": 9.63855421686747e-06,
"loss": 1.7058,
"step": 15
},
{
"epoch": 0.03855421686746988,
"grad_norm": 0.283203125,
"learning_rate": 9.614457831325302e-06,
"loss": 1.7272,
"step": 16
},
{
"epoch": 0.04096385542168675,
"grad_norm": 0.291015625,
"learning_rate": 9.590361445783132e-06,
"loss": 1.6819,
"step": 17
},
{
"epoch": 0.043373493975903614,
"grad_norm": 0.30078125,
"learning_rate": 9.566265060240964e-06,
"loss": 1.7019,
"step": 18
},
{
"epoch": 0.04578313253012048,
"grad_norm": 0.287109375,
"learning_rate": 9.542168674698796e-06,
"loss": 1.6333,
"step": 19
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.25390625,
"learning_rate": 9.518072289156628e-06,
"loss": 1.5319,
"step": 20
},
{
"epoch": 0.05060240963855422,
"grad_norm": 0.259765625,
"learning_rate": 9.49397590361446e-06,
"loss": 1.6696,
"step": 21
},
{
"epoch": 0.05301204819277108,
"grad_norm": 0.271484375,
"learning_rate": 9.46987951807229e-06,
"loss": 1.6739,
"step": 22
},
{
"epoch": 0.05542168674698795,
"grad_norm": 0.251953125,
"learning_rate": 9.445783132530121e-06,
"loss": 1.6864,
"step": 23
},
{
"epoch": 0.05783132530120482,
"grad_norm": 0.2294921875,
"learning_rate": 9.421686746987953e-06,
"loss": 1.5915,
"step": 24
},
{
"epoch": 0.060240963855421686,
"grad_norm": 0.2236328125,
"learning_rate": 9.397590361445785e-06,
"loss": 1.562,
"step": 25
},
{
"epoch": 0.06265060240963856,
"grad_norm": 0.2294921875,
"learning_rate": 9.373493975903615e-06,
"loss": 1.5647,
"step": 26
},
{
"epoch": 0.06506024096385542,
"grad_norm": 0.21875,
"learning_rate": 9.349397590361446e-06,
"loss": 1.6104,
"step": 27
},
{
"epoch": 0.06746987951807229,
"grad_norm": 0.216796875,
"learning_rate": 9.325301204819278e-06,
"loss": 1.5392,
"step": 28
},
{
"epoch": 0.06987951807228916,
"grad_norm": 0.205078125,
"learning_rate": 9.301204819277108e-06,
"loss": 1.5591,
"step": 29
},
{
"epoch": 0.07228915662650602,
"grad_norm": 0.20703125,
"learning_rate": 9.27710843373494e-06,
"loss": 1.5547,
"step": 30
},
{
"epoch": 0.0746987951807229,
"grad_norm": 0.197265625,
"learning_rate": 9.253012048192772e-06,
"loss": 1.513,
"step": 31
},
{
"epoch": 0.07710843373493977,
"grad_norm": 0.1904296875,
"learning_rate": 9.228915662650602e-06,
"loss": 1.5069,
"step": 32
},
{
"epoch": 0.07951807228915662,
"grad_norm": 0.1796875,
"learning_rate": 9.204819277108434e-06,
"loss": 1.5193,
"step": 33
},
{
"epoch": 0.0819277108433735,
"grad_norm": 0.185546875,
"learning_rate": 9.180722891566265e-06,
"loss": 1.4837,
"step": 34
},
{
"epoch": 0.08433734939759036,
"grad_norm": 0.1904296875,
"learning_rate": 9.156626506024097e-06,
"loss": 1.4584,
"step": 35
},
{
"epoch": 0.08674698795180723,
"grad_norm": 0.208984375,
"learning_rate": 9.132530120481929e-06,
"loss": 1.4999,
"step": 36
},
{
"epoch": 0.0891566265060241,
"grad_norm": 0.181640625,
"learning_rate": 9.10843373493976e-06,
"loss": 1.4903,
"step": 37
},
{
"epoch": 0.09156626506024096,
"grad_norm": 0.1767578125,
"learning_rate": 9.08433734939759e-06,
"loss": 1.4731,
"step": 38
},
{
"epoch": 0.09397590361445783,
"grad_norm": 0.1904296875,
"learning_rate": 9.060240963855423e-06,
"loss": 1.4711,
"step": 39
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.18359375,
"learning_rate": 9.036144578313254e-06,
"loss": 1.5278,
"step": 40
},
{
"epoch": 0.09879518072289156,
"grad_norm": 0.19140625,
"learning_rate": 9.012048192771084e-06,
"loss": 1.4631,
"step": 41
},
{
"epoch": 0.10120481927710843,
"grad_norm": 0.1728515625,
"learning_rate": 8.987951807228916e-06,
"loss": 1.4589,
"step": 42
},
{
"epoch": 0.10361445783132531,
"grad_norm": 0.154296875,
"learning_rate": 8.963855421686748e-06,
"loss": 1.3893,
"step": 43
},
{
"epoch": 0.10602409638554217,
"grad_norm": 0.162109375,
"learning_rate": 8.939759036144578e-06,
"loss": 1.4173,
"step": 44
},
{
"epoch": 0.10843373493975904,
"grad_norm": 0.16796875,
"learning_rate": 8.91566265060241e-06,
"loss": 1.4701,
"step": 45
},
{
"epoch": 0.1108433734939759,
"grad_norm": 0.1748046875,
"learning_rate": 8.891566265060241e-06,
"loss": 1.4885,
"step": 46
},
{
"epoch": 0.11325301204819277,
"grad_norm": 0.197265625,
"learning_rate": 8.867469879518073e-06,
"loss": 1.4325,
"step": 47
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.162109375,
"learning_rate": 8.843373493975905e-06,
"loss": 1.4365,
"step": 48
},
{
"epoch": 0.1180722891566265,
"grad_norm": 0.1669921875,
"learning_rate": 8.819277108433735e-06,
"loss": 1.3758,
"step": 49
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.16796875,
"learning_rate": 8.795180722891567e-06,
"loss": 1.4265,
"step": 50
},
{
"epoch": 0.12289156626506025,
"grad_norm": 0.154296875,
"learning_rate": 8.771084337349399e-06,
"loss": 1.3894,
"step": 51
},
{
"epoch": 0.12530120481927712,
"grad_norm": 0.16015625,
"learning_rate": 8.74698795180723e-06,
"loss": 1.4533,
"step": 52
},
{
"epoch": 0.12771084337349398,
"grad_norm": 0.16796875,
"learning_rate": 8.722891566265062e-06,
"loss": 1.3708,
"step": 53
},
{
"epoch": 0.13012048192771083,
"grad_norm": 0.1630859375,
"learning_rate": 8.698795180722892e-06,
"loss": 1.4268,
"step": 54
},
{
"epoch": 0.13253012048192772,
"grad_norm": 0.15625,
"learning_rate": 8.674698795180724e-06,
"loss": 1.4155,
"step": 55
},
{
"epoch": 0.13493975903614458,
"grad_norm": 0.1630859375,
"learning_rate": 8.650602409638556e-06,
"loss": 1.4563,
"step": 56
},
{
"epoch": 0.13734939759036144,
"grad_norm": 0.158203125,
"learning_rate": 8.626506024096386e-06,
"loss": 1.4198,
"step": 57
},
{
"epoch": 0.13975903614457832,
"grad_norm": 0.1591796875,
"learning_rate": 8.602409638554217e-06,
"loss": 1.3974,
"step": 58
},
{
"epoch": 0.14216867469879518,
"grad_norm": 0.16015625,
"learning_rate": 8.57831325301205e-06,
"loss": 1.4379,
"step": 59
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.1689453125,
"learning_rate": 8.55421686746988e-06,
"loss": 1.4232,
"step": 60
},
{
"epoch": 0.14698795180722893,
"grad_norm": 0.1572265625,
"learning_rate": 8.530120481927711e-06,
"loss": 1.4119,
"step": 61
},
{
"epoch": 0.1493975903614458,
"grad_norm": 0.16015625,
"learning_rate": 8.506024096385543e-06,
"loss": 1.4066,
"step": 62
},
{
"epoch": 0.15180722891566265,
"grad_norm": 0.1591796875,
"learning_rate": 8.481927710843375e-06,
"loss": 1.3308,
"step": 63
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.1591796875,
"learning_rate": 8.457831325301206e-06,
"loss": 1.323,
"step": 64
},
{
"epoch": 0.1566265060240964,
"grad_norm": 0.16015625,
"learning_rate": 8.433734939759038e-06,
"loss": 1.4078,
"step": 65
},
{
"epoch": 0.15903614457831325,
"grad_norm": 0.16015625,
"learning_rate": 8.409638554216868e-06,
"loss": 1.3577,
"step": 66
},
{
"epoch": 0.1614457831325301,
"grad_norm": 0.1591796875,
"learning_rate": 8.3855421686747e-06,
"loss": 1.3174,
"step": 67
},
{
"epoch": 0.163855421686747,
"grad_norm": 0.158203125,
"learning_rate": 8.361445783132532e-06,
"loss": 1.3815,
"step": 68
},
{
"epoch": 0.16626506024096385,
"grad_norm": 0.1923828125,
"learning_rate": 8.337349397590362e-06,
"loss": 1.3563,
"step": 69
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.162109375,
"learning_rate": 8.313253012048194e-06,
"loss": 1.4056,
"step": 70
},
{
"epoch": 0.1710843373493976,
"grad_norm": 0.185546875,
"learning_rate": 8.289156626506025e-06,
"loss": 1.3572,
"step": 71
},
{
"epoch": 0.17349397590361446,
"grad_norm": 0.1669921875,
"learning_rate": 8.265060240963855e-06,
"loss": 1.3202,
"step": 72
},
{
"epoch": 0.17590361445783131,
"grad_norm": 0.169921875,
"learning_rate": 8.240963855421687e-06,
"loss": 1.3166,
"step": 73
},
{
"epoch": 0.1783132530120482,
"grad_norm": 0.154296875,
"learning_rate": 8.216867469879519e-06,
"loss": 1.4073,
"step": 74
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.169921875,
"learning_rate": 8.19277108433735e-06,
"loss": 1.3445,
"step": 75
},
{
"epoch": 0.18313253012048192,
"grad_norm": 0.1533203125,
"learning_rate": 8.16867469879518e-06,
"loss": 1.3258,
"step": 76
},
{
"epoch": 0.1855421686746988,
"grad_norm": 0.1533203125,
"learning_rate": 8.144578313253012e-06,
"loss": 1.3285,
"step": 77
},
{
"epoch": 0.18795180722891566,
"grad_norm": 0.15625,
"learning_rate": 8.120481927710844e-06,
"loss": 1.3979,
"step": 78
},
{
"epoch": 0.19036144578313252,
"grad_norm": 0.1611328125,
"learning_rate": 8.096385542168676e-06,
"loss": 1.3263,
"step": 79
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.171875,
"learning_rate": 8.072289156626508e-06,
"loss": 1.3622,
"step": 80
},
{
"epoch": 0.19518072289156627,
"grad_norm": 0.1552734375,
"learning_rate": 8.048192771084338e-06,
"loss": 1.3178,
"step": 81
},
{
"epoch": 0.19759036144578312,
"grad_norm": 0.1494140625,
"learning_rate": 8.02409638554217e-06,
"loss": 1.3474,
"step": 82
},
{
"epoch": 0.2,
"grad_norm": 0.1533203125,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3102,
"step": 83
},
{
"epoch": 0.20240963855421687,
"grad_norm": 0.158203125,
"learning_rate": 7.975903614457831e-06,
"loss": 1.2936,
"step": 84
},
{
"epoch": 0.20481927710843373,
"grad_norm": 0.1845703125,
"learning_rate": 7.951807228915663e-06,
"loss": 1.2996,
"step": 85
},
{
"epoch": 0.20722891566265061,
"grad_norm": 0.1572265625,
"learning_rate": 7.927710843373495e-06,
"loss": 1.3398,
"step": 86
},
{
"epoch": 0.20963855421686747,
"grad_norm": 0.15234375,
"learning_rate": 7.903614457831325e-06,
"loss": 1.3283,
"step": 87
},
{
"epoch": 0.21204819277108433,
"grad_norm": 0.189453125,
"learning_rate": 7.879518072289157e-06,
"loss": 1.3627,
"step": 88
},
{
"epoch": 0.21445783132530122,
"grad_norm": 0.150390625,
"learning_rate": 7.855421686746989e-06,
"loss": 1.3591,
"step": 89
},
{
"epoch": 0.21686746987951808,
"grad_norm": 0.169921875,
"learning_rate": 7.83132530120482e-06,
"loss": 1.2912,
"step": 90
},
{
"epoch": 0.21927710843373494,
"grad_norm": 0.1689453125,
"learning_rate": 7.807228915662652e-06,
"loss": 1.2841,
"step": 91
},
{
"epoch": 0.2216867469879518,
"grad_norm": 0.1533203125,
"learning_rate": 7.783132530120484e-06,
"loss": 1.3041,
"step": 92
},
{
"epoch": 0.22409638554216868,
"grad_norm": 0.1630859375,
"learning_rate": 7.759036144578314e-06,
"loss": 1.3713,
"step": 93
},
{
"epoch": 0.22650602409638554,
"grad_norm": 0.173828125,
"learning_rate": 7.734939759036146e-06,
"loss": 1.2214,
"step": 94
},
{
"epoch": 0.2289156626506024,
"grad_norm": 0.162109375,
"learning_rate": 7.710843373493977e-06,
"loss": 1.329,
"step": 95
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.1611328125,
"learning_rate": 7.686746987951807e-06,
"loss": 1.3779,
"step": 96
},
{
"epoch": 0.23373493975903614,
"grad_norm": 0.1796875,
"learning_rate": 7.66265060240964e-06,
"loss": 1.3602,
"step": 97
},
{
"epoch": 0.236144578313253,
"grad_norm": 0.1533203125,
"learning_rate": 7.638554216867471e-06,
"loss": 1.2542,
"step": 98
},
{
"epoch": 0.2385542168674699,
"grad_norm": 0.169921875,
"learning_rate": 7.614457831325302e-06,
"loss": 1.3328,
"step": 99
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.1640625,
"learning_rate": 7.590361445783133e-06,
"loss": 1.2587,
"step": 100
},
{
"epoch": 0.2433734939759036,
"grad_norm": 0.173828125,
"learning_rate": 7.5662650602409645e-06,
"loss": 1.2801,
"step": 101
},
{
"epoch": 0.2457831325301205,
"grad_norm": 0.1767578125,
"learning_rate": 7.5421686746987955e-06,
"loss": 1.3247,
"step": 102
},
{
"epoch": 0.24819277108433735,
"grad_norm": 0.220703125,
"learning_rate": 7.518072289156627e-06,
"loss": 1.24,
"step": 103
},
{
"epoch": 0.25060240963855424,
"grad_norm": 0.1787109375,
"learning_rate": 7.493975903614459e-06,
"loss": 1.2623,
"step": 104
},
{
"epoch": 0.25301204819277107,
"grad_norm": 0.171875,
"learning_rate": 7.469879518072289e-06,
"loss": 1.2901,
"step": 105
},
{
"epoch": 0.25542168674698795,
"grad_norm": 0.162109375,
"learning_rate": 7.445783132530121e-06,
"loss": 1.2738,
"step": 106
},
{
"epoch": 0.25783132530120484,
"grad_norm": 0.224609375,
"learning_rate": 7.4216867469879526e-06,
"loss": 1.3002,
"step": 107
},
{
"epoch": 0.26024096385542167,
"grad_norm": 0.1611328125,
"learning_rate": 7.3975903614457835e-06,
"loss": 1.2721,
"step": 108
},
{
"epoch": 0.26265060240963856,
"grad_norm": 0.150390625,
"learning_rate": 7.373493975903615e-06,
"loss": 1.2388,
"step": 109
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.326171875,
"learning_rate": 7.349397590361447e-06,
"loss": 1.2336,
"step": 110
},
{
"epoch": 0.2674698795180723,
"grad_norm": 0.2099609375,
"learning_rate": 7.325301204819277e-06,
"loss": 1.2375,
"step": 111
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.154296875,
"learning_rate": 7.301204819277109e-06,
"loss": 1.3051,
"step": 112
},
{
"epoch": 0.27228915662650605,
"grad_norm": 0.1572265625,
"learning_rate": 7.277108433734941e-06,
"loss": 1.263,
"step": 113
},
{
"epoch": 0.2746987951807229,
"grad_norm": 0.1533203125,
"learning_rate": 7.2530120481927715e-06,
"loss": 1.3006,
"step": 114
},
{
"epoch": 0.27710843373493976,
"grad_norm": 0.1796875,
"learning_rate": 7.228915662650603e-06,
"loss": 1.2649,
"step": 115
},
{
"epoch": 0.27951807228915665,
"grad_norm": 0.263671875,
"learning_rate": 7.204819277108435e-06,
"loss": 1.3185,
"step": 116
},
{
"epoch": 0.2819277108433735,
"grad_norm": 0.1533203125,
"learning_rate": 7.180722891566265e-06,
"loss": 1.308,
"step": 117
},
{
"epoch": 0.28433734939759037,
"grad_norm": 0.2099609375,
"learning_rate": 7.156626506024097e-06,
"loss": 1.2969,
"step": 118
},
{
"epoch": 0.28674698795180725,
"grad_norm": 0.1572265625,
"learning_rate": 7.132530120481929e-06,
"loss": 1.2991,
"step": 119
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.1533203125,
"learning_rate": 7.1084337349397595e-06,
"loss": 1.3087,
"step": 120
},
{
"epoch": 0.29156626506024097,
"grad_norm": 0.15234375,
"learning_rate": 7.084337349397591e-06,
"loss": 1.2659,
"step": 121
},
{
"epoch": 0.29397590361445786,
"grad_norm": 0.162109375,
"learning_rate": 7.060240963855422e-06,
"loss": 1.3115,
"step": 122
},
{
"epoch": 0.2963855421686747,
"grad_norm": 0.1611328125,
"learning_rate": 7.036144578313253e-06,
"loss": 1.263,
"step": 123
},
{
"epoch": 0.2987951807228916,
"grad_norm": 0.1630859375,
"learning_rate": 7.012048192771085e-06,
"loss": 1.3126,
"step": 124
},
{
"epoch": 0.30120481927710846,
"grad_norm": 0.177734375,
"learning_rate": 6.987951807228917e-06,
"loss": 1.2533,
"step": 125
},
{
"epoch": 0.3036144578313253,
"grad_norm": 0.154296875,
"learning_rate": 6.963855421686747e-06,
"loss": 1.2449,
"step": 126
},
{
"epoch": 0.3060240963855422,
"grad_norm": 0.1650390625,
"learning_rate": 6.9397590361445784e-06,
"loss": 1.3121,
"step": 127
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.1611328125,
"learning_rate": 6.91566265060241e-06,
"loss": 1.2479,
"step": 128
},
{
"epoch": 0.3108433734939759,
"grad_norm": 0.16796875,
"learning_rate": 6.891566265060241e-06,
"loss": 1.237,
"step": 129
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.1552734375,
"learning_rate": 6.867469879518073e-06,
"loss": 1.2766,
"step": 130
},
{
"epoch": 0.3156626506024096,
"grad_norm": 0.181640625,
"learning_rate": 6.843373493975905e-06,
"loss": 1.2575,
"step": 131
},
{
"epoch": 0.3180722891566265,
"grad_norm": 0.1591796875,
"learning_rate": 6.819277108433735e-06,
"loss": 1.3013,
"step": 132
},
{
"epoch": 0.3204819277108434,
"grad_norm": 0.1630859375,
"learning_rate": 6.7951807228915665e-06,
"loss": 1.2832,
"step": 133
},
{
"epoch": 0.3228915662650602,
"grad_norm": 0.154296875,
"learning_rate": 6.771084337349398e-06,
"loss": 1.2646,
"step": 134
},
{
"epoch": 0.3253012048192771,
"grad_norm": 0.1796875,
"learning_rate": 6.746987951807229e-06,
"loss": 1.3012,
"step": 135
},
{
"epoch": 0.327710843373494,
"grad_norm": 0.1533203125,
"learning_rate": 6.722891566265061e-06,
"loss": 1.2523,
"step": 136
},
{
"epoch": 0.3301204819277108,
"grad_norm": 0.1611328125,
"learning_rate": 6.698795180722893e-06,
"loss": 1.2599,
"step": 137
},
{
"epoch": 0.3325301204819277,
"grad_norm": 0.205078125,
"learning_rate": 6.674698795180723e-06,
"loss": 1.2305,
"step": 138
},
{
"epoch": 0.3349397590361446,
"grad_norm": 3.546875,
"learning_rate": 6.6506024096385545e-06,
"loss": 1.2838,
"step": 139
},
{
"epoch": 0.3373493975903614,
"grad_norm": 0.201171875,
"learning_rate": 6.626506024096386e-06,
"loss": 1.2453,
"step": 140
},
{
"epoch": 0.3397590361445783,
"grad_norm": 0.169921875,
"learning_rate": 6.602409638554217e-06,
"loss": 1.2577,
"step": 141
},
{
"epoch": 0.3421686746987952,
"grad_norm": 0.19140625,
"learning_rate": 6.578313253012049e-06,
"loss": 1.2172,
"step": 142
},
{
"epoch": 0.344578313253012,
"grad_norm": 0.197265625,
"learning_rate": 6.554216867469881e-06,
"loss": 1.2571,
"step": 143
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.66796875,
"learning_rate": 6.530120481927711e-06,
"loss": 1.232,
"step": 144
},
{
"epoch": 0.3493975903614458,
"grad_norm": 0.162109375,
"learning_rate": 6.5060240963855425e-06,
"loss": 1.2423,
"step": 145
},
{
"epoch": 0.35180722891566263,
"grad_norm": 0.28515625,
"learning_rate": 6.481927710843374e-06,
"loss": 1.2497,
"step": 146
},
{
"epoch": 0.3542168674698795,
"grad_norm": 0.1689453125,
"learning_rate": 6.457831325301205e-06,
"loss": 1.2085,
"step": 147
},
{
"epoch": 0.3566265060240964,
"grad_norm": 0.1630859375,
"learning_rate": 6.433734939759036e-06,
"loss": 1.2676,
"step": 148
},
{
"epoch": 0.35903614457831323,
"grad_norm": 0.1630859375,
"learning_rate": 6.409638554216868e-06,
"loss": 1.234,
"step": 149
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.1630859375,
"learning_rate": 6.385542168674699e-06,
"loss": 1.2128,
"step": 150
},
{
"epoch": 0.363855421686747,
"grad_norm": 0.1591796875,
"learning_rate": 6.3614457831325305e-06,
"loss": 1.2732,
"step": 151
},
{
"epoch": 0.36626506024096384,
"grad_norm": 0.216796875,
"learning_rate": 6.337349397590362e-06,
"loss": 1.2756,
"step": 152
},
{
"epoch": 0.3686746987951807,
"grad_norm": 0.1640625,
"learning_rate": 6.313253012048192e-06,
"loss": 1.2577,
"step": 153
},
{
"epoch": 0.3710843373493976,
"grad_norm": 0.1826171875,
"learning_rate": 6.289156626506024e-06,
"loss": 1.2496,
"step": 154
},
{
"epoch": 0.37349397590361444,
"grad_norm": 0.16796875,
"learning_rate": 6.265060240963856e-06,
"loss": 1.2258,
"step": 155
},
{
"epoch": 0.3759036144578313,
"grad_norm": 0.154296875,
"learning_rate": 6.240963855421688e-06,
"loss": 1.1938,
"step": 156
},
{
"epoch": 0.3783132530120482,
"grad_norm": 0.16015625,
"learning_rate": 6.2168674698795185e-06,
"loss": 1.2642,
"step": 157
},
{
"epoch": 0.38072289156626504,
"grad_norm": 0.197265625,
"learning_rate": 6.19277108433735e-06,
"loss": 1.2423,
"step": 158
},
{
"epoch": 0.38313253012048193,
"grad_norm": 0.2353515625,
"learning_rate": 6.168674698795182e-06,
"loss": 1.2773,
"step": 159
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.1787109375,
"learning_rate": 6.144578313253012e-06,
"loss": 1.2747,
"step": 160
},
{
"epoch": 0.38795180722891565,
"grad_norm": 0.166015625,
"learning_rate": 6.120481927710844e-06,
"loss": 1.2425,
"step": 161
},
{
"epoch": 0.39036144578313253,
"grad_norm": 0.1728515625,
"learning_rate": 6.096385542168676e-06,
"loss": 1.2592,
"step": 162
},
{
"epoch": 0.3927710843373494,
"grad_norm": 0.193359375,
"learning_rate": 6.0722891566265066e-06,
"loss": 1.2554,
"step": 163
},
{
"epoch": 0.39518072289156625,
"grad_norm": 0.18359375,
"learning_rate": 6.048192771084338e-06,
"loss": 1.2929,
"step": 164
},
{
"epoch": 0.39759036144578314,
"grad_norm": 0.18359375,
"learning_rate": 6.02409638554217e-06,
"loss": 1.1996,
"step": 165
},
{
"epoch": 0.4,
"grad_norm": 0.1591796875,
"learning_rate": 6e-06,
"loss": 1.2782,
"step": 166
},
{
"epoch": 0.40240963855421685,
"grad_norm": 0.18359375,
"learning_rate": 5.975903614457832e-06,
"loss": 1.1855,
"step": 167
},
{
"epoch": 0.40481927710843374,
"grad_norm": 0.1669921875,
"learning_rate": 5.951807228915664e-06,
"loss": 1.1886,
"step": 168
},
{
"epoch": 0.4072289156626506,
"grad_norm": 0.166015625,
"learning_rate": 5.927710843373495e-06,
"loss": 1.1398,
"step": 169
},
{
"epoch": 0.40963855421686746,
"grad_norm": 5.6875,
"learning_rate": 5.9036144578313255e-06,
"loss": 1.1729,
"step": 170
},
{
"epoch": 0.41204819277108434,
"grad_norm": 0.2294921875,
"learning_rate": 5.879518072289157e-06,
"loss": 1.2716,
"step": 171
},
{
"epoch": 0.41445783132530123,
"grad_norm": 0.1689453125,
"learning_rate": 5.855421686746988e-06,
"loss": 1.2669,
"step": 172
},
{
"epoch": 0.41686746987951806,
"grad_norm": 0.173828125,
"learning_rate": 5.83132530120482e-06,
"loss": 1.2207,
"step": 173
},
{
"epoch": 0.41927710843373495,
"grad_norm": 0.1806640625,
"learning_rate": 5.807228915662652e-06,
"loss": 1.2459,
"step": 174
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.1669921875,
"learning_rate": 5.783132530120482e-06,
"loss": 1.2252,
"step": 175
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.173828125,
"learning_rate": 5.7590361445783135e-06,
"loss": 1.2537,
"step": 176
},
{
"epoch": 0.42650602409638555,
"grad_norm": 0.1748046875,
"learning_rate": 5.734939759036145e-06,
"loss": 1.2581,
"step": 177
},
{
"epoch": 0.42891566265060244,
"grad_norm": 0.1904296875,
"learning_rate": 5.710843373493976e-06,
"loss": 1.2493,
"step": 178
},
{
"epoch": 0.43132530120481927,
"grad_norm": 0.169921875,
"learning_rate": 5.686746987951808e-06,
"loss": 1.188,
"step": 179
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.171875,
"learning_rate": 5.66265060240964e-06,
"loss": 1.2578,
"step": 180
},
{
"epoch": 0.43614457831325304,
"grad_norm": 0.1689453125,
"learning_rate": 5.63855421686747e-06,
"loss": 1.2119,
"step": 181
},
{
"epoch": 0.43855421686746987,
"grad_norm": 0.1767578125,
"learning_rate": 5.6144578313253015e-06,
"loss": 1.2372,
"step": 182
},
{
"epoch": 0.44096385542168676,
"grad_norm": 0.169921875,
"learning_rate": 5.590361445783133e-06,
"loss": 1.2965,
"step": 183
},
{
"epoch": 0.4433734939759036,
"grad_norm": 0.1689453125,
"learning_rate": 5.566265060240964e-06,
"loss": 1.1978,
"step": 184
},
{
"epoch": 0.4457831325301205,
"grad_norm": 0.1650390625,
"learning_rate": 5.542168674698796e-06,
"loss": 1.2581,
"step": 185
},
{
"epoch": 0.44819277108433736,
"grad_norm": 0.26953125,
"learning_rate": 5.518072289156628e-06,
"loss": 1.2148,
"step": 186
},
{
"epoch": 0.4506024096385542,
"grad_norm": 0.1689453125,
"learning_rate": 5.493975903614458e-06,
"loss": 1.2362,
"step": 187
},
{
"epoch": 0.4530120481927711,
"grad_norm": 0.1669921875,
"learning_rate": 5.4698795180722896e-06,
"loss": 1.1945,
"step": 188
},
{
"epoch": 0.45542168674698796,
"grad_norm": 0.1923828125,
"learning_rate": 5.445783132530121e-06,
"loss": 1.2047,
"step": 189
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.1669921875,
"learning_rate": 5.421686746987952e-06,
"loss": 1.1945,
"step": 190
},
{
"epoch": 0.4602409638554217,
"grad_norm": 0.189453125,
"learning_rate": 5.397590361445784e-06,
"loss": 1.2674,
"step": 191
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.2216796875,
"learning_rate": 5.373493975903615e-06,
"loss": 1.2381,
"step": 192
},
{
"epoch": 0.4650602409638554,
"grad_norm": 0.166015625,
"learning_rate": 5.349397590361446e-06,
"loss": 1.236,
"step": 193
},
{
"epoch": 0.4674698795180723,
"grad_norm": 0.185546875,
"learning_rate": 5.325301204819278e-06,
"loss": 1.2428,
"step": 194
},
{
"epoch": 0.46987951807228917,
"grad_norm": 0.1796875,
"learning_rate": 5.301204819277109e-06,
"loss": 1.1794,
"step": 195
},
{
"epoch": 0.472289156626506,
"grad_norm": 0.345703125,
"learning_rate": 5.27710843373494e-06,
"loss": 1.1801,
"step": 196
},
{
"epoch": 0.4746987951807229,
"grad_norm": 0.1865234375,
"learning_rate": 5.253012048192771e-06,
"loss": 1.2689,
"step": 197
},
{
"epoch": 0.4771084337349398,
"grad_norm": 0.169921875,
"learning_rate": 5.228915662650603e-06,
"loss": 1.2742,
"step": 198
},
{
"epoch": 0.4795180722891566,
"grad_norm": 0.244140625,
"learning_rate": 5.204819277108434e-06,
"loss": 1.1817,
"step": 199
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.1689453125,
"learning_rate": 5.180722891566266e-06,
"loss": 1.2414,
"step": 200
},
{
"epoch": 0.4843373493975904,
"grad_norm": 0.22265625,
"learning_rate": 5.156626506024097e-06,
"loss": 1.1819,
"step": 201
},
{
"epoch": 0.4867469879518072,
"grad_norm": 0.1669921875,
"learning_rate": 5.132530120481927e-06,
"loss": 1.2304,
"step": 202
},
{
"epoch": 0.4891566265060241,
"grad_norm": 0.169921875,
"learning_rate": 5.108433734939759e-06,
"loss": 1.2907,
"step": 203
},
{
"epoch": 0.491566265060241,
"grad_norm": 0.166015625,
"learning_rate": 5.084337349397591e-06,
"loss": 1.2876,
"step": 204
},
{
"epoch": 0.4939759036144578,
"grad_norm": 0.1845703125,
"learning_rate": 5.060240963855422e-06,
"loss": 1.2352,
"step": 205
},
{
"epoch": 0.4963855421686747,
"grad_norm": 0.162109375,
"learning_rate": 5.036144578313254e-06,
"loss": 1.2529,
"step": 206
},
{
"epoch": 0.4987951807228916,
"grad_norm": 0.1748046875,
"learning_rate": 5.012048192771085e-06,
"loss": 1.2677,
"step": 207
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.1669921875,
"learning_rate": 4.987951807228916e-06,
"loss": 1.227,
"step": 208
},
{
"epoch": 0.5036144578313253,
"grad_norm": 0.1796875,
"learning_rate": 4.963855421686747e-06,
"loss": 1.2165,
"step": 209
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.1767578125,
"learning_rate": 4.939759036144578e-06,
"loss": 1.2693,
"step": 210
},
{
"epoch": 0.5084337349397591,
"grad_norm": 0.1689453125,
"learning_rate": 4.91566265060241e-06,
"loss": 1.2258,
"step": 211
},
{
"epoch": 0.5108433734939759,
"grad_norm": 0.17578125,
"learning_rate": 4.891566265060242e-06,
"loss": 1.2487,
"step": 212
},
{
"epoch": 0.5132530120481927,
"grad_norm": 0.177734375,
"learning_rate": 4.8674698795180725e-06,
"loss": 1.2432,
"step": 213
},
{
"epoch": 0.5156626506024097,
"grad_norm": 0.1650390625,
"learning_rate": 4.843373493975904e-06,
"loss": 1.2707,
"step": 214
},
{
"epoch": 0.5180722891566265,
"grad_norm": 0.16796875,
"learning_rate": 4.819277108433735e-06,
"loss": 1.2248,
"step": 215
},
{
"epoch": 0.5204819277108433,
"grad_norm": 0.1630859375,
"learning_rate": 4.795180722891566e-06,
"loss": 1.2202,
"step": 216
},
{
"epoch": 0.5228915662650603,
"grad_norm": 0.1611328125,
"learning_rate": 4.771084337349398e-06,
"loss": 1.2058,
"step": 217
},
{
"epoch": 0.5253012048192771,
"grad_norm": 0.1748046875,
"learning_rate": 4.74698795180723e-06,
"loss": 1.2477,
"step": 218
},
{
"epoch": 0.5277108433734939,
"grad_norm": 0.1630859375,
"learning_rate": 4.7228915662650606e-06,
"loss": 1.1872,
"step": 219
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.1611328125,
"learning_rate": 4.698795180722892e-06,
"loss": 1.1966,
"step": 220
},
{
"epoch": 0.5325301204819277,
"grad_norm": 0.1689453125,
"learning_rate": 4.674698795180723e-06,
"loss": 1.188,
"step": 221
},
{
"epoch": 0.5349397590361445,
"grad_norm": 0.162109375,
"learning_rate": 4.650602409638554e-06,
"loss": 1.2359,
"step": 222
},
{
"epoch": 0.5373493975903615,
"grad_norm": 0.1591796875,
"learning_rate": 4.626506024096386e-06,
"loss": 1.2064,
"step": 223
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.171875,
"learning_rate": 4.602409638554217e-06,
"loss": 1.2431,
"step": 224
},
{
"epoch": 0.5421686746987951,
"grad_norm": 0.16796875,
"learning_rate": 4.578313253012049e-06,
"loss": 1.2543,
"step": 225
},
{
"epoch": 0.5445783132530121,
"grad_norm": 0.162109375,
"learning_rate": 4.55421686746988e-06,
"loss": 1.2109,
"step": 226
},
{
"epoch": 0.5469879518072289,
"grad_norm": 0.251953125,
"learning_rate": 4.530120481927711e-06,
"loss": 1.1845,
"step": 227
},
{
"epoch": 0.5493975903614458,
"grad_norm": 0.16796875,
"learning_rate": 4.506024096385542e-06,
"loss": 1.1951,
"step": 228
},
{
"epoch": 0.5518072289156627,
"grad_norm": 0.1865234375,
"learning_rate": 4.481927710843374e-06,
"loss": 1.1969,
"step": 229
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.1669921875,
"learning_rate": 4.457831325301205e-06,
"loss": 1.2563,
"step": 230
},
{
"epoch": 0.5566265060240964,
"grad_norm": 0.201171875,
"learning_rate": 4.433734939759037e-06,
"loss": 1.2213,
"step": 231
},
{
"epoch": 0.5590361445783133,
"grad_norm": 0.1669921875,
"learning_rate": 4.4096385542168675e-06,
"loss": 1.2261,
"step": 232
},
{
"epoch": 0.5614457831325301,
"grad_norm": 0.171875,
"learning_rate": 4.385542168674699e-06,
"loss": 1.21,
"step": 233
},
{
"epoch": 0.563855421686747,
"grad_norm": 0.181640625,
"learning_rate": 4.361445783132531e-06,
"loss": 1.197,
"step": 234
},
{
"epoch": 0.5662650602409639,
"grad_norm": 0.177734375,
"learning_rate": 4.337349397590362e-06,
"loss": 1.2003,
"step": 235
},
{
"epoch": 0.5686746987951807,
"grad_norm": 0.177734375,
"learning_rate": 4.313253012048193e-06,
"loss": 1.2401,
"step": 236
},
{
"epoch": 0.5710843373493976,
"grad_norm": 0.166015625,
"learning_rate": 4.289156626506025e-06,
"loss": 1.2506,
"step": 237
},
{
"epoch": 0.5734939759036145,
"grad_norm": 0.1953125,
"learning_rate": 4.2650602409638555e-06,
"loss": 1.2157,
"step": 238
},
{
"epoch": 0.5759036144578313,
"grad_norm": 0.16796875,
"learning_rate": 4.240963855421687e-06,
"loss": 1.2192,
"step": 239
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.1689453125,
"learning_rate": 4.216867469879519e-06,
"loss": 1.1676,
"step": 240
},
{
"epoch": 0.5807228915662651,
"grad_norm": 0.18359375,
"learning_rate": 4.19277108433735e-06,
"loss": 1.2432,
"step": 241
},
{
"epoch": 0.5831325301204819,
"grad_norm": 0.326171875,
"learning_rate": 4.168674698795181e-06,
"loss": 1.2353,
"step": 242
},
{
"epoch": 0.5855421686746988,
"grad_norm": 0.181640625,
"learning_rate": 4.144578313253013e-06,
"loss": 1.187,
"step": 243
},
{
"epoch": 0.5879518072289157,
"grad_norm": 0.169921875,
"learning_rate": 4.1204819277108436e-06,
"loss": 1.1805,
"step": 244
},
{
"epoch": 0.5903614457831325,
"grad_norm": 0.193359375,
"learning_rate": 4.096385542168675e-06,
"loss": 1.2429,
"step": 245
},
{
"epoch": 0.5927710843373494,
"grad_norm": 0.259765625,
"learning_rate": 4.072289156626506e-06,
"loss": 1.0815,
"step": 246
},
{
"epoch": 0.5951807228915663,
"grad_norm": 0.2099609375,
"learning_rate": 4.048192771084338e-06,
"loss": 1.2279,
"step": 247
},
{
"epoch": 0.5975903614457831,
"grad_norm": 0.1708984375,
"learning_rate": 4.024096385542169e-06,
"loss": 1.1839,
"step": 248
},
{
"epoch": 0.6,
"grad_norm": 0.177734375,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2672,
"step": 249
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.1748046875,
"learning_rate": 3.975903614457832e-06,
"loss": 1.2033,
"step": 250
},
{
"epoch": 0.6048192771084338,
"grad_norm": 0.171875,
"learning_rate": 3.9518072289156625e-06,
"loss": 1.25,
"step": 251
},
{
"epoch": 0.6072289156626506,
"grad_norm": 0.18359375,
"learning_rate": 3.927710843373494e-06,
"loss": 1.2099,
"step": 252
},
{
"epoch": 0.6096385542168675,
"grad_norm": 0.1787109375,
"learning_rate": 3.903614457831326e-06,
"loss": 1.157,
"step": 253
},
{
"epoch": 0.6120481927710844,
"grad_norm": 0.1708984375,
"learning_rate": 3.879518072289157e-06,
"loss": 1.1643,
"step": 254
},
{
"epoch": 0.6144578313253012,
"grad_norm": 0.173828125,
"learning_rate": 3.855421686746989e-06,
"loss": 1.2308,
"step": 255
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.2490234375,
"learning_rate": 3.83132530120482e-06,
"loss": 1.1898,
"step": 256
},
{
"epoch": 0.619277108433735,
"grad_norm": 0.205078125,
"learning_rate": 3.807228915662651e-06,
"loss": 1.1833,
"step": 257
},
{
"epoch": 0.6216867469879518,
"grad_norm": 0.1943359375,
"learning_rate": 3.7831325301204823e-06,
"loss": 1.1651,
"step": 258
},
{
"epoch": 0.6240963855421687,
"grad_norm": 0.1875,
"learning_rate": 3.7590361445783136e-06,
"loss": 1.2409,
"step": 259
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.1669921875,
"learning_rate": 3.7349397590361445e-06,
"loss": 1.2054,
"step": 260
},
{
"epoch": 0.6289156626506024,
"grad_norm": 0.1640625,
"learning_rate": 3.7108433734939763e-06,
"loss": 1.1575,
"step": 261
},
{
"epoch": 0.6313253012048192,
"grad_norm": 0.1796875,
"learning_rate": 3.6867469879518076e-06,
"loss": 1.2138,
"step": 262
},
{
"epoch": 0.6337349397590362,
"grad_norm": 0.173828125,
"learning_rate": 3.6626506024096385e-06,
"loss": 1.1803,
"step": 263
},
{
"epoch": 0.636144578313253,
"grad_norm": 0.1787109375,
"learning_rate": 3.6385542168674703e-06,
"loss": 1.1605,
"step": 264
},
{
"epoch": 0.6385542168674698,
"grad_norm": 0.1728515625,
"learning_rate": 3.6144578313253016e-06,
"loss": 1.1491,
"step": 265
},
{
"epoch": 0.6409638554216868,
"grad_norm": 0.171875,
"learning_rate": 3.5903614457831325e-06,
"loss": 1.1631,
"step": 266
},
{
"epoch": 0.6433734939759036,
"grad_norm": 0.22265625,
"learning_rate": 3.5662650602409643e-06,
"loss": 1.2348,
"step": 267
},
{
"epoch": 0.6457831325301204,
"grad_norm": 0.2158203125,
"learning_rate": 3.5421686746987956e-06,
"loss": 1.308,
"step": 268
},
{
"epoch": 0.6481927710843374,
"grad_norm": 0.1669921875,
"learning_rate": 3.5180722891566266e-06,
"loss": 1.19,
"step": 269
},
{
"epoch": 0.6506024096385542,
"grad_norm": 0.20703125,
"learning_rate": 3.4939759036144583e-06,
"loss": 1.2012,
"step": 270
},
{
"epoch": 0.653012048192771,
"grad_norm": 0.17578125,
"learning_rate": 3.4698795180722892e-06,
"loss": 1.2212,
"step": 271
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.2099609375,
"learning_rate": 3.4457831325301206e-06,
"loss": 1.2351,
"step": 272
},
{
"epoch": 0.6578313253012048,
"grad_norm": 0.1806640625,
"learning_rate": 3.4216867469879523e-06,
"loss": 1.2123,
"step": 273
},
{
"epoch": 0.6602409638554216,
"grad_norm": 0.1787109375,
"learning_rate": 3.3975903614457832e-06,
"loss": 1.2499,
"step": 274
},
{
"epoch": 0.6626506024096386,
"grad_norm": 0.181640625,
"learning_rate": 3.3734939759036146e-06,
"loss": 1.1171,
"step": 275
},
{
"epoch": 0.6650602409638554,
"grad_norm": 0.177734375,
"learning_rate": 3.3493975903614463e-06,
"loss": 1.2002,
"step": 276
},
{
"epoch": 0.6674698795180722,
"grad_norm": 0.1875,
"learning_rate": 3.3253012048192772e-06,
"loss": 1.2602,
"step": 277
},
{
"epoch": 0.6698795180722892,
"grad_norm": 0.171875,
"learning_rate": 3.3012048192771086e-06,
"loss": 1.214,
"step": 278
},
{
"epoch": 0.672289156626506,
"grad_norm": 0.1611328125,
"learning_rate": 3.2771084337349403e-06,
"loss": 1.1642,
"step": 279
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.181640625,
"learning_rate": 3.2530120481927713e-06,
"loss": 1.2339,
"step": 280
},
{
"epoch": 0.6771084337349398,
"grad_norm": 0.2490234375,
"learning_rate": 3.2289156626506026e-06,
"loss": 1.2056,
"step": 281
},
{
"epoch": 0.6795180722891566,
"grad_norm": 0.17578125,
"learning_rate": 3.204819277108434e-06,
"loss": 1.223,
"step": 282
},
{
"epoch": 0.6819277108433734,
"grad_norm": 0.1806640625,
"learning_rate": 3.1807228915662653e-06,
"loss": 1.1978,
"step": 283
},
{
"epoch": 0.6843373493975904,
"grad_norm": 0.1767578125,
"learning_rate": 3.156626506024096e-06,
"loss": 1.1659,
"step": 284
},
{
"epoch": 0.6867469879518072,
"grad_norm": 0.1669921875,
"learning_rate": 3.132530120481928e-06,
"loss": 1.1631,
"step": 285
},
{
"epoch": 0.689156626506024,
"grad_norm": 0.1875,
"learning_rate": 3.1084337349397593e-06,
"loss": 1.2063,
"step": 286
},
{
"epoch": 0.691566265060241,
"grad_norm": 0.169921875,
"learning_rate": 3.084337349397591e-06,
"loss": 1.1784,
"step": 287
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.2001953125,
"learning_rate": 3.060240963855422e-06,
"loss": 1.2651,
"step": 288
},
{
"epoch": 0.6963855421686747,
"grad_norm": 0.1767578125,
"learning_rate": 3.0361445783132533e-06,
"loss": 1.1946,
"step": 289
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.1689453125,
"learning_rate": 3.012048192771085e-06,
"loss": 1.19,
"step": 290
},
{
"epoch": 0.7012048192771084,
"grad_norm": 0.171875,
"learning_rate": 2.987951807228916e-06,
"loss": 1.1271,
"step": 291
},
{
"epoch": 0.7036144578313253,
"grad_norm": 0.1748046875,
"learning_rate": 2.9638554216867473e-06,
"loss": 1.186,
"step": 292
},
{
"epoch": 0.7060240963855422,
"grad_norm": 0.1728515625,
"learning_rate": 2.9397590361445786e-06,
"loss": 1.2006,
"step": 293
},
{
"epoch": 0.708433734939759,
"grad_norm": 0.185546875,
"learning_rate": 2.91566265060241e-06,
"loss": 1.1859,
"step": 294
},
{
"epoch": 0.7108433734939759,
"grad_norm": 0.19921875,
"learning_rate": 2.891566265060241e-06,
"loss": 1.2297,
"step": 295
},
{
"epoch": 0.7132530120481928,
"grad_norm": 0.17578125,
"learning_rate": 2.8674698795180726e-06,
"loss": 1.1706,
"step": 296
},
{
"epoch": 0.7156626506024096,
"grad_norm": 0.1787109375,
"learning_rate": 2.843373493975904e-06,
"loss": 1.2615,
"step": 297
},
{
"epoch": 0.7180722891566265,
"grad_norm": 0.1845703125,
"learning_rate": 2.819277108433735e-06,
"loss": 1.2033,
"step": 298
},
{
"epoch": 0.7204819277108434,
"grad_norm": 0.1767578125,
"learning_rate": 2.7951807228915666e-06,
"loss": 1.1958,
"step": 299
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.1787109375,
"learning_rate": 2.771084337349398e-06,
"loss": 1.1919,
"step": 300
},
{
"epoch": 0.7253012048192771,
"grad_norm": 0.1875,
"learning_rate": 2.746987951807229e-06,
"loss": 1.2025,
"step": 301
},
{
"epoch": 0.727710843373494,
"grad_norm": 0.205078125,
"learning_rate": 2.7228915662650607e-06,
"loss": 1.1796,
"step": 302
},
{
"epoch": 0.7301204819277108,
"grad_norm": 0.1708984375,
"learning_rate": 2.698795180722892e-06,
"loss": 1.1958,
"step": 303
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.171875,
"learning_rate": 2.674698795180723e-06,
"loss": 1.123,
"step": 304
},
{
"epoch": 0.7349397590361446,
"grad_norm": 0.1806640625,
"learning_rate": 2.6506024096385547e-06,
"loss": 1.2265,
"step": 305
},
{
"epoch": 0.7373493975903614,
"grad_norm": 0.1708984375,
"learning_rate": 2.6265060240963856e-06,
"loss": 1.2097,
"step": 306
},
{
"epoch": 0.7397590361445783,
"grad_norm": 0.18359375,
"learning_rate": 2.602409638554217e-06,
"loss": 1.1958,
"step": 307
},
{
"epoch": 0.7421686746987952,
"grad_norm": 0.17578125,
"learning_rate": 2.5783132530120487e-06,
"loss": 1.1628,
"step": 308
},
{
"epoch": 0.744578313253012,
"grad_norm": 0.1845703125,
"learning_rate": 2.5542168674698796e-06,
"loss": 1.1568,
"step": 309
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.208984375,
"learning_rate": 2.530120481927711e-06,
"loss": 1.2034,
"step": 310
},
{
"epoch": 0.7493975903614458,
"grad_norm": 0.1650390625,
"learning_rate": 2.5060240963855427e-06,
"loss": 1.1065,
"step": 311
},
{
"epoch": 0.7518072289156627,
"grad_norm": 0.1904296875,
"learning_rate": 2.4819277108433736e-06,
"loss": 1.1767,
"step": 312
},
{
"epoch": 0.7542168674698795,
"grad_norm": 0.2041015625,
"learning_rate": 2.457831325301205e-06,
"loss": 1.1741,
"step": 313
},
{
"epoch": 0.7566265060240964,
"grad_norm": 0.1826171875,
"learning_rate": 2.4337349397590363e-06,
"loss": 1.177,
"step": 314
},
{
"epoch": 0.7590361445783133,
"grad_norm": 0.2158203125,
"learning_rate": 2.4096385542168676e-06,
"loss": 1.1835,
"step": 315
},
{
"epoch": 0.7614457831325301,
"grad_norm": 0.23828125,
"learning_rate": 2.385542168674699e-06,
"loss": 1.1393,
"step": 316
},
{
"epoch": 0.763855421686747,
"grad_norm": 0.1708984375,
"learning_rate": 2.3614457831325303e-06,
"loss": 1.1554,
"step": 317
},
{
"epoch": 0.7662650602409639,
"grad_norm": 0.1845703125,
"learning_rate": 2.3373493975903616e-06,
"loss": 1.2462,
"step": 318
},
{
"epoch": 0.7686746987951807,
"grad_norm": 0.205078125,
"learning_rate": 2.313253012048193e-06,
"loss": 1.1781,
"step": 319
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.1708984375,
"learning_rate": 2.2891566265060243e-06,
"loss": 1.1799,
"step": 320
},
{
"epoch": 0.7734939759036145,
"grad_norm": 0.1630859375,
"learning_rate": 2.2650602409638556e-06,
"loss": 1.1082,
"step": 321
},
{
"epoch": 0.7759036144578313,
"grad_norm": 0.19140625,
"learning_rate": 2.240963855421687e-06,
"loss": 1.259,
"step": 322
},
{
"epoch": 0.7783132530120482,
"grad_norm": 0.1787109375,
"learning_rate": 2.2168674698795183e-06,
"loss": 1.158,
"step": 323
},
{
"epoch": 0.7807228915662651,
"grad_norm": 0.19140625,
"learning_rate": 2.1927710843373496e-06,
"loss": 1.2057,
"step": 324
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.181640625,
"learning_rate": 2.168674698795181e-06,
"loss": 1.1866,
"step": 325
},
{
"epoch": 0.7855421686746988,
"grad_norm": 0.17578125,
"learning_rate": 2.1445783132530123e-06,
"loss": 1.222,
"step": 326
},
{
"epoch": 0.7879518072289157,
"grad_norm": 0.1806640625,
"learning_rate": 2.1204819277108437e-06,
"loss": 1.2492,
"step": 327
},
{
"epoch": 0.7903614457831325,
"grad_norm": 0.1796875,
"learning_rate": 2.096385542168675e-06,
"loss": 1.2351,
"step": 328
},
{
"epoch": 0.7927710843373494,
"grad_norm": 0.2099609375,
"learning_rate": 2.0722891566265063e-06,
"loss": 1.1282,
"step": 329
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.177734375,
"learning_rate": 2.0481927710843377e-06,
"loss": 1.1673,
"step": 330
},
{
"epoch": 0.7975903614457831,
"grad_norm": 0.19140625,
"learning_rate": 2.024096385542169e-06,
"loss": 1.238,
"step": 331
},
{
"epoch": 0.8,
"grad_norm": 0.1728515625,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1519,
"step": 332
},
{
"epoch": 0.8024096385542169,
"grad_norm": 0.1748046875,
"learning_rate": 1.9759036144578312e-06,
"loss": 1.1692,
"step": 333
},
{
"epoch": 0.8048192771084337,
"grad_norm": 0.2294921875,
"learning_rate": 1.951807228915663e-06,
"loss": 1.1471,
"step": 334
},
{
"epoch": 0.8072289156626506,
"grad_norm": 0.248046875,
"learning_rate": 1.9277108433734943e-06,
"loss": 1.1541,
"step": 335
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.1787109375,
"learning_rate": 1.9036144578313255e-06,
"loss": 1.2038,
"step": 336
},
{
"epoch": 0.8120481927710843,
"grad_norm": 0.173828125,
"learning_rate": 1.8795180722891568e-06,
"loss": 1.1794,
"step": 337
},
{
"epoch": 0.8144578313253013,
"grad_norm": 0.1962890625,
"learning_rate": 1.8554216867469881e-06,
"loss": 1.2553,
"step": 338
},
{
"epoch": 0.8168674698795181,
"grad_norm": 0.2041015625,
"learning_rate": 1.8313253012048193e-06,
"loss": 1.185,
"step": 339
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.1787109375,
"learning_rate": 1.8072289156626508e-06,
"loss": 1.2193,
"step": 340
},
{
"epoch": 0.8216867469879519,
"grad_norm": 0.169921875,
"learning_rate": 1.7831325301204822e-06,
"loss": 1.1436,
"step": 341
},
{
"epoch": 0.8240963855421687,
"grad_norm": 0.2021484375,
"learning_rate": 1.7590361445783133e-06,
"loss": 1.1608,
"step": 342
},
{
"epoch": 0.8265060240963855,
"grad_norm": 0.1728515625,
"learning_rate": 1.7349397590361446e-06,
"loss": 1.2343,
"step": 343
},
{
"epoch": 0.8289156626506025,
"grad_norm": 0.212890625,
"learning_rate": 1.7108433734939762e-06,
"loss": 1.1779,
"step": 344
},
{
"epoch": 0.8313253012048193,
"grad_norm": 0.203125,
"learning_rate": 1.6867469879518073e-06,
"loss": 1.2414,
"step": 345
},
{
"epoch": 0.8337349397590361,
"grad_norm": 0.1728515625,
"learning_rate": 1.6626506024096386e-06,
"loss": 1.2011,
"step": 346
},
{
"epoch": 0.8361445783132531,
"grad_norm": 0.1748046875,
"learning_rate": 1.6385542168674702e-06,
"loss": 1.1586,
"step": 347
},
{
"epoch": 0.8385542168674699,
"grad_norm": 0.205078125,
"learning_rate": 1.6144578313253013e-06,
"loss": 1.1782,
"step": 348
},
{
"epoch": 0.8409638554216867,
"grad_norm": 0.171875,
"learning_rate": 1.5903614457831326e-06,
"loss": 1.1461,
"step": 349
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.185546875,
"learning_rate": 1.566265060240964e-06,
"loss": 1.2264,
"step": 350
},
{
"epoch": 0.8457831325301205,
"grad_norm": 0.16796875,
"learning_rate": 1.5421686746987955e-06,
"loss": 1.1675,
"step": 351
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.17578125,
"learning_rate": 1.5180722891566266e-06,
"loss": 1.1975,
"step": 352
},
{
"epoch": 0.8506024096385543,
"grad_norm": 0.1748046875,
"learning_rate": 1.493975903614458e-06,
"loss": 1.1335,
"step": 353
},
{
"epoch": 0.8530120481927711,
"grad_norm": 0.1806640625,
"learning_rate": 1.4698795180722893e-06,
"loss": 1.2112,
"step": 354
},
{
"epoch": 0.8554216867469879,
"grad_norm": 0.2197265625,
"learning_rate": 1.4457831325301204e-06,
"loss": 1.2036,
"step": 355
},
{
"epoch": 0.8578313253012049,
"grad_norm": 0.173828125,
"learning_rate": 1.421686746987952e-06,
"loss": 1.2405,
"step": 356
},
{
"epoch": 0.8602409638554217,
"grad_norm": 0.353515625,
"learning_rate": 1.3975903614457833e-06,
"loss": 1.1708,
"step": 357
},
{
"epoch": 0.8626506024096385,
"grad_norm": 0.546875,
"learning_rate": 1.3734939759036144e-06,
"loss": 1.1913,
"step": 358
},
{
"epoch": 0.8650602409638555,
"grad_norm": 0.1826171875,
"learning_rate": 1.349397590361446e-06,
"loss": 1.207,
"step": 359
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.1875,
"learning_rate": 1.3253012048192773e-06,
"loss": 1.1908,
"step": 360
},
{
"epoch": 0.8698795180722891,
"grad_norm": 0.1845703125,
"learning_rate": 1.3012048192771085e-06,
"loss": 1.1619,
"step": 361
},
{
"epoch": 0.8722891566265061,
"grad_norm": 0.1845703125,
"learning_rate": 1.2771084337349398e-06,
"loss": 1.2102,
"step": 362
},
{
"epoch": 0.8746987951807229,
"grad_norm": 0.173828125,
"learning_rate": 1.2530120481927713e-06,
"loss": 1.1496,
"step": 363
},
{
"epoch": 0.8771084337349397,
"grad_norm": 0.17578125,
"learning_rate": 1.2289156626506025e-06,
"loss": 1.226,
"step": 364
},
{
"epoch": 0.8795180722891566,
"grad_norm": 0.21875,
"learning_rate": 1.2048192771084338e-06,
"loss": 1.2046,
"step": 365
},
{
"epoch": 0.8819277108433735,
"grad_norm": 0.177734375,
"learning_rate": 1.1807228915662651e-06,
"loss": 1.2166,
"step": 366
},
{
"epoch": 0.8843373493975903,
"grad_norm": 0.1708984375,
"learning_rate": 1.1566265060240965e-06,
"loss": 1.1674,
"step": 367
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.169921875,
"learning_rate": 1.1325301204819278e-06,
"loss": 1.2386,
"step": 368
},
{
"epoch": 0.8891566265060241,
"grad_norm": 0.17578125,
"learning_rate": 1.1084337349397592e-06,
"loss": 1.1736,
"step": 369
},
{
"epoch": 0.891566265060241,
"grad_norm": 0.1728515625,
"learning_rate": 1.0843373493975905e-06,
"loss": 1.188,
"step": 370
},
{
"epoch": 0.8939759036144578,
"grad_norm": 0.173828125,
"learning_rate": 1.0602409638554218e-06,
"loss": 1.1511,
"step": 371
},
{
"epoch": 0.8963855421686747,
"grad_norm": 0.1845703125,
"learning_rate": 1.0361445783132532e-06,
"loss": 1.156,
"step": 372
},
{
"epoch": 0.8987951807228916,
"grad_norm": 0.306640625,
"learning_rate": 1.0120481927710845e-06,
"loss": 1.2058,
"step": 373
},
{
"epoch": 0.9012048192771084,
"grad_norm": 0.20703125,
"learning_rate": 9.879518072289156e-07,
"loss": 1.1479,
"step": 374
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.2265625,
"learning_rate": 9.638554216867472e-07,
"loss": 1.1658,
"step": 375
},
{
"epoch": 0.9060240963855422,
"grad_norm": 0.203125,
"learning_rate": 9.397590361445784e-07,
"loss": 1.246,
"step": 376
},
{
"epoch": 0.908433734939759,
"grad_norm": 0.1708984375,
"learning_rate": 9.156626506024096e-07,
"loss": 1.2199,
"step": 377
},
{
"epoch": 0.9108433734939759,
"grad_norm": 0.1708984375,
"learning_rate": 8.915662650602411e-07,
"loss": 1.157,
"step": 378
},
{
"epoch": 0.9132530120481928,
"grad_norm": 0.193359375,
"learning_rate": 8.674698795180723e-07,
"loss": 1.1644,
"step": 379
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.181640625,
"learning_rate": 8.433734939759036e-07,
"loss": 1.2226,
"step": 380
},
{
"epoch": 0.9180722891566265,
"grad_norm": 0.169921875,
"learning_rate": 8.192771084337351e-07,
"loss": 1.1723,
"step": 381
},
{
"epoch": 0.9204819277108434,
"grad_norm": 0.1767578125,
"learning_rate": 7.951807228915663e-07,
"loss": 1.205,
"step": 382
},
{
"epoch": 0.9228915662650602,
"grad_norm": 0.17578125,
"learning_rate": 7.710843373493978e-07,
"loss": 1.1976,
"step": 383
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.1748046875,
"learning_rate": 7.46987951807229e-07,
"loss": 1.1185,
"step": 384
},
{
"epoch": 0.927710843373494,
"grad_norm": 0.17578125,
"learning_rate": 7.228915662650602e-07,
"loss": 1.128,
"step": 385
},
{
"epoch": 0.9301204819277108,
"grad_norm": 0.1689453125,
"learning_rate": 6.987951807228917e-07,
"loss": 1.1403,
"step": 386
},
{
"epoch": 0.9325301204819277,
"grad_norm": 0.2236328125,
"learning_rate": 6.74698795180723e-07,
"loss": 1.2571,
"step": 387
},
{
"epoch": 0.9349397590361446,
"grad_norm": 0.1806640625,
"learning_rate": 6.506024096385542e-07,
"loss": 1.222,
"step": 388
},
{
"epoch": 0.9373493975903614,
"grad_norm": 0.2041015625,
"learning_rate": 6.265060240963857e-07,
"loss": 1.2048,
"step": 389
},
{
"epoch": 0.9397590361445783,
"grad_norm": 0.265625,
"learning_rate": 6.024096385542169e-07,
"loss": 1.2411,
"step": 390
},
{
"epoch": 0.9421686746987952,
"grad_norm": 0.177734375,
"learning_rate": 5.783132530120482e-07,
"loss": 1.1832,
"step": 391
},
{
"epoch": 0.944578313253012,
"grad_norm": 0.1845703125,
"learning_rate": 5.542168674698796e-07,
"loss": 1.2639,
"step": 392
},
{
"epoch": 0.946987951807229,
"grad_norm": 0.173828125,
"learning_rate": 5.301204819277109e-07,
"loss": 1.2135,
"step": 393
},
{
"epoch": 0.9493975903614458,
"grad_norm": 0.41015625,
"learning_rate": 5.060240963855422e-07,
"loss": 1.1594,
"step": 394
},
{
"epoch": 0.9518072289156626,
"grad_norm": 0.177734375,
"learning_rate": 4.819277108433736e-07,
"loss": 1.1236,
"step": 395
},
{
"epoch": 0.9542168674698795,
"grad_norm": 0.1953125,
"learning_rate": 4.578313253012048e-07,
"loss": 1.1344,
"step": 396
},
{
"epoch": 0.9566265060240964,
"grad_norm": 0.1728515625,
"learning_rate": 4.3373493975903615e-07,
"loss": 1.1536,
"step": 397
},
{
"epoch": 0.9590361445783132,
"grad_norm": 0.1787109375,
"learning_rate": 4.0963855421686754e-07,
"loss": 1.1962,
"step": 398
},
{
"epoch": 0.9614457831325302,
"grad_norm": 0.1767578125,
"learning_rate": 3.855421686746989e-07,
"loss": 1.1711,
"step": 399
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.173828125,
"learning_rate": 3.614457831325301e-07,
"loss": 1.2135,
"step": 400
},
{
"epoch": 0.9662650602409638,
"grad_norm": 0.40625,
"learning_rate": 3.373493975903615e-07,
"loss": 1.1843,
"step": 401
},
{
"epoch": 0.9686746987951808,
"grad_norm": 0.1728515625,
"learning_rate": 3.1325301204819284e-07,
"loss": 1.1817,
"step": 402
},
{
"epoch": 0.9710843373493976,
"grad_norm": 0.1953125,
"learning_rate": 2.891566265060241e-07,
"loss": 1.2055,
"step": 403
},
{
"epoch": 0.9734939759036144,
"grad_norm": 0.2080078125,
"learning_rate": 2.6506024096385546e-07,
"loss": 1.1925,
"step": 404
},
{
"epoch": 0.9759036144578314,
"grad_norm": 0.1748046875,
"learning_rate": 2.409638554216868e-07,
"loss": 1.1721,
"step": 405
},
{
"epoch": 0.9783132530120482,
"grad_norm": 0.173828125,
"learning_rate": 2.1686746987951808e-07,
"loss": 1.2074,
"step": 406
},
{
"epoch": 0.980722891566265,
"grad_norm": 0.181640625,
"learning_rate": 1.9277108433734944e-07,
"loss": 1.1375,
"step": 407
},
{
"epoch": 0.983132530120482,
"grad_norm": 0.2412109375,
"learning_rate": 1.6867469879518075e-07,
"loss": 1.1968,
"step": 408
},
{
"epoch": 0.9855421686746988,
"grad_norm": 0.734375,
"learning_rate": 1.4457831325301206e-07,
"loss": 1.2086,
"step": 409
},
{
"epoch": 0.9879518072289156,
"grad_norm": 0.177734375,
"learning_rate": 1.204819277108434e-07,
"loss": 1.1967,
"step": 410
},
{
"epoch": 0.9903614457831326,
"grad_norm": 0.177734375,
"learning_rate": 9.638554216867472e-08,
"loss": 1.2235,
"step": 411
},
{
"epoch": 0.9927710843373494,
"grad_norm": 0.26171875,
"learning_rate": 7.228915662650603e-08,
"loss": 1.1924,
"step": 412
},
{
"epoch": 0.9951807228915662,
"grad_norm": 0.244140625,
"learning_rate": 4.819277108433736e-08,
"loss": 1.1543,
"step": 413
},
{
"epoch": 0.9975903614457832,
"grad_norm": 0.1787109375,
"learning_rate": 2.409638554216868e-08,
"loss": 1.2282,
"step": 414
},
{
"epoch": 1.0,
"grad_norm": 0.1767578125,
"learning_rate": 0.0,
"loss": 1.2109,
"step": 415
}
],
"logging_steps": 1.0,
"max_steps": 415,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3144272689357128e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}